novel-downloader 1.4.4__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +2 -2
  3. novel_downloader/cli/config.py +1 -83
  4. novel_downloader/cli/download.py +4 -5
  5. novel_downloader/cli/export.py +4 -1
  6. novel_downloader/cli/main.py +2 -0
  7. novel_downloader/cli/search.py +123 -0
  8. novel_downloader/config/__init__.py +3 -10
  9. novel_downloader/config/adapter.py +190 -54
  10. novel_downloader/config/loader.py +2 -3
  11. novel_downloader/core/__init__.py +13 -13
  12. novel_downloader/core/downloaders/__init__.py +10 -11
  13. novel_downloader/core/downloaders/base.py +152 -26
  14. novel_downloader/core/downloaders/biquge.py +5 -1
  15. novel_downloader/core/downloaders/common.py +157 -378
  16. novel_downloader/core/downloaders/esjzone.py +5 -1
  17. novel_downloader/core/downloaders/linovelib.py +5 -1
  18. novel_downloader/core/downloaders/qianbi.py +291 -4
  19. novel_downloader/core/downloaders/qidian.py +199 -285
  20. novel_downloader/core/downloaders/registry.py +67 -0
  21. novel_downloader/core/downloaders/sfacg.py +5 -1
  22. novel_downloader/core/downloaders/yamibo.py +5 -1
  23. novel_downloader/core/exporters/__init__.py +10 -11
  24. novel_downloader/core/exporters/base.py +87 -7
  25. novel_downloader/core/exporters/biquge.py +5 -8
  26. novel_downloader/core/exporters/common/__init__.py +2 -2
  27. novel_downloader/core/exporters/common/epub.py +82 -166
  28. novel_downloader/core/exporters/common/main_exporter.py +0 -60
  29. novel_downloader/core/exporters/common/txt.py +82 -83
  30. novel_downloader/core/exporters/epub_util.py +157 -1330
  31. novel_downloader/core/exporters/esjzone.py +5 -8
  32. novel_downloader/core/exporters/linovelib/__init__.py +2 -2
  33. novel_downloader/core/exporters/linovelib/epub.py +157 -212
  34. novel_downloader/core/exporters/linovelib/main_exporter.py +2 -59
  35. novel_downloader/core/exporters/linovelib/txt.py +67 -63
  36. novel_downloader/core/exporters/qianbi.py +5 -8
  37. novel_downloader/core/exporters/qidian.py +14 -4
  38. novel_downloader/core/exporters/registry.py +53 -0
  39. novel_downloader/core/exporters/sfacg.py +5 -8
  40. novel_downloader/core/exporters/txt_util.py +67 -0
  41. novel_downloader/core/exporters/yamibo.py +5 -8
  42. novel_downloader/core/fetchers/__init__.py +19 -24
  43. novel_downloader/core/fetchers/base/__init__.py +3 -3
  44. novel_downloader/core/fetchers/base/browser.py +23 -4
  45. novel_downloader/core/fetchers/base/session.py +30 -5
  46. novel_downloader/core/fetchers/biquge/__init__.py +3 -3
  47. novel_downloader/core/fetchers/biquge/browser.py +5 -0
  48. novel_downloader/core/fetchers/biquge/session.py +6 -1
  49. novel_downloader/core/fetchers/esjzone/__init__.py +3 -3
  50. novel_downloader/core/fetchers/esjzone/browser.py +5 -0
  51. novel_downloader/core/fetchers/esjzone/session.py +6 -1
  52. novel_downloader/core/fetchers/linovelib/__init__.py +3 -3
  53. novel_downloader/core/fetchers/linovelib/browser.py +6 -1
  54. novel_downloader/core/fetchers/linovelib/session.py +6 -1
  55. novel_downloader/core/fetchers/qianbi/__init__.py +3 -3
  56. novel_downloader/core/fetchers/qianbi/browser.py +5 -0
  57. novel_downloader/core/fetchers/qianbi/session.py +5 -0
  58. novel_downloader/core/fetchers/qidian/__init__.py +3 -3
  59. novel_downloader/core/fetchers/qidian/browser.py +12 -4
  60. novel_downloader/core/fetchers/qidian/session.py +11 -3
  61. novel_downloader/core/fetchers/registry.py +71 -0
  62. novel_downloader/core/fetchers/sfacg/__init__.py +3 -3
  63. novel_downloader/core/fetchers/sfacg/browser.py +5 -0
  64. novel_downloader/core/fetchers/sfacg/session.py +5 -0
  65. novel_downloader/core/fetchers/yamibo/__init__.py +3 -3
  66. novel_downloader/core/fetchers/yamibo/browser.py +5 -0
  67. novel_downloader/core/fetchers/yamibo/session.py +6 -1
  68. novel_downloader/core/interfaces/__init__.py +7 -5
  69. novel_downloader/core/interfaces/searcher.py +18 -0
  70. novel_downloader/core/parsers/__init__.py +10 -11
  71. novel_downloader/core/parsers/{biquge/main_parser.py → biquge.py} +7 -2
  72. novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +7 -2
  73. novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +7 -2
  74. novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +7 -2
  75. novel_downloader/core/parsers/qidian/__init__.py +2 -2
  76. novel_downloader/core/parsers/qidian/chapter_encrypted.py +23 -21
  77. novel_downloader/core/parsers/qidian/chapter_normal.py +1 -1
  78. novel_downloader/core/parsers/qidian/main_parser.py +10 -21
  79. novel_downloader/core/parsers/qidian/utils/__init__.py +11 -11
  80. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +5 -6
  81. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
  82. novel_downloader/core/parsers/registry.py +68 -0
  83. novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +7 -2
  84. novel_downloader/core/parsers/{yamibo/main_parser.py → yamibo.py} +7 -2
  85. novel_downloader/core/searchers/__init__.py +20 -0
  86. novel_downloader/core/searchers/base.py +92 -0
  87. novel_downloader/core/searchers/biquge.py +83 -0
  88. novel_downloader/core/searchers/esjzone.py +84 -0
  89. novel_downloader/core/searchers/qianbi.py +131 -0
  90. novel_downloader/core/searchers/qidian.py +87 -0
  91. novel_downloader/core/searchers/registry.py +63 -0
  92. novel_downloader/locales/en.json +12 -4
  93. novel_downloader/locales/zh.json +12 -4
  94. novel_downloader/models/__init__.py +4 -30
  95. novel_downloader/models/config.py +12 -6
  96. novel_downloader/models/search.py +16 -0
  97. novel_downloader/models/types.py +0 -2
  98. novel_downloader/resources/config/settings.toml +31 -4
  99. novel_downloader/resources/css_styles/intro.css +83 -0
  100. novel_downloader/resources/css_styles/main.css +30 -89
  101. novel_downloader/utils/__init__.py +52 -0
  102. novel_downloader/utils/chapter_storage.py +244 -224
  103. novel_downloader/utils/constants.py +1 -21
  104. novel_downloader/utils/epub/__init__.py +34 -0
  105. novel_downloader/utils/epub/builder.py +377 -0
  106. novel_downloader/utils/epub/constants.py +77 -0
  107. novel_downloader/utils/epub/documents.py +403 -0
  108. novel_downloader/utils/epub/models.py +134 -0
  109. novel_downloader/utils/epub/utils.py +212 -0
  110. novel_downloader/utils/file_utils/__init__.py +10 -14
  111. novel_downloader/utils/file_utils/io.py +20 -51
  112. novel_downloader/utils/file_utils/normalize.py +2 -2
  113. novel_downloader/utils/file_utils/sanitize.py +2 -3
  114. novel_downloader/utils/fontocr/__init__.py +5 -5
  115. novel_downloader/utils/{hash_store.py → fontocr/hash_store.py} +4 -3
  116. novel_downloader/utils/{hash_utils.py → fontocr/hash_utils.py} +2 -2
  117. novel_downloader/utils/fontocr/ocr_v1.py +13 -1
  118. novel_downloader/utils/fontocr/ocr_v2.py +13 -1
  119. novel_downloader/utils/fontocr/ocr_v3.py +744 -0
  120. novel_downloader/utils/i18n.py +2 -0
  121. novel_downloader/utils/logger.py +2 -0
  122. novel_downloader/utils/network.py +110 -251
  123. novel_downloader/utils/state.py +1 -0
  124. novel_downloader/utils/text_utils/__init__.py +18 -17
  125. novel_downloader/utils/text_utils/diff_display.py +4 -5
  126. novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
  127. novel_downloader/utils/text_utils/text_cleaner.py +179 -0
  128. novel_downloader/utils/text_utils/truncate_utils.py +62 -0
  129. novel_downloader/utils/time_utils/__init__.py +3 -3
  130. novel_downloader/utils/time_utils/datetime_utils.py +4 -5
  131. novel_downloader/utils/time_utils/sleep_utils.py +2 -3
  132. {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/METADATA +2 -2
  133. novel_downloader-1.5.0.dist-info/RECORD +164 -0
  134. novel_downloader/config/site_rules.py +0 -94
  135. novel_downloader/core/factory/__init__.py +0 -20
  136. novel_downloader/core/factory/downloader.py +0 -73
  137. novel_downloader/core/factory/exporter.py +0 -58
  138. novel_downloader/core/factory/fetcher.py +0 -96
  139. novel_downloader/core/factory/parser.py +0 -86
  140. novel_downloader/core/fetchers/common/__init__.py +0 -14
  141. novel_downloader/core/fetchers/common/browser.py +0 -79
  142. novel_downloader/core/fetchers/common/session.py +0 -79
  143. novel_downloader/core/parsers/biquge/__init__.py +0 -10
  144. novel_downloader/core/parsers/common/__init__.py +0 -13
  145. novel_downloader/core/parsers/common/helper.py +0 -323
  146. novel_downloader/core/parsers/common/main_parser.py +0 -106
  147. novel_downloader/core/parsers/esjzone/__init__.py +0 -10
  148. novel_downloader/core/parsers/linovelib/__init__.py +0 -10
  149. novel_downloader/core/parsers/qianbi/__init__.py +0 -10
  150. novel_downloader/core/parsers/sfacg/__init__.py +0 -10
  151. novel_downloader/core/parsers/yamibo/__init__.py +0 -10
  152. novel_downloader/models/browser.py +0 -21
  153. novel_downloader/models/site_rules.py +0 -99
  154. novel_downloader/models/tasks.py +0 -33
  155. novel_downloader/resources/css_styles/volume-intro.css +0 -56
  156. novel_downloader/resources/json/replace_word_map.json +0 -4
  157. novel_downloader/resources/text/blacklist.txt +0 -22
  158. novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
  159. novel_downloader/utils/text_utils/font_mapping.py +0 -28
  160. novel_downloader/utils/text_utils/text_cleaning.py +0 -107
  161. novel_downloader-1.4.4.dist-info/RECORD +0 -165
  162. {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/WHEEL +0 -0
  163. {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/entry_points.txt +0 -0
  164. {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/licenses/LICENSE +0 -0
  165. {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/top_level.txt +0 -0
@@ -1,79 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.core.fetchers.common.browser
4
- ---------------------------------------------
5
-
6
- """
7
-
8
- from typing import Any
9
-
10
- from novel_downloader.core.fetchers.base import BaseBrowser
11
- from novel_downloader.models import FetcherConfig, SiteProfile
12
-
13
-
14
- class CommonBrowser(BaseBrowser):
15
- """
16
- A common async browser for handling site-specific HTTP requests.
17
- """
18
-
19
- def __init__(
20
- self,
21
- site: str,
22
- profile: SiteProfile,
23
- config: FetcherConfig,
24
- reuse_page: bool = False,
25
- **kwargs: Any,
26
- ) -> None:
27
- super().__init__(site, config, reuse_page, **kwargs)
28
- self._profile = profile
29
-
30
- async def get_book_info(
31
- self,
32
- book_id: str,
33
- **kwargs: Any,
34
- ) -> list[str]:
35
- """
36
- Fetch the raw HTML of the book info page asynchronously.
37
-
38
- :param book_id: The book identifier.
39
- :return: The page content as a string.
40
- """
41
- url = self.book_info_url(book_id=book_id)
42
- return [await self.fetch(url, **kwargs)]
43
-
44
- async def get_book_chapter(
45
- self,
46
- book_id: str,
47
- chapter_id: str,
48
- **kwargs: Any,
49
- ) -> list[str]:
50
- """
51
- Fetch the raw HTML of a single chapter asynchronously.
52
-
53
- :param book_id: The book identifier.
54
- :param chapter_id: The chapter identifier.
55
- :return: The chapter content as a string.
56
- """
57
- url = self.chapter_url(book_id=book_id, chapter_id=chapter_id)
58
- return [await self.fetch(url, **kwargs)]
59
-
60
- def book_info_url(self, book_id: str) -> str:
61
- """
62
- Construct the URL for fetching a book's info page.
63
-
64
- :param book_id: The identifier of the book.
65
- :return: Fully qualified URL for the book info page.
66
- """
67
- return self._profile["book_info_url"].format(book_id=book_id)
68
-
69
- def chapter_url(self, book_id: str, chapter_id: str) -> str:
70
- """
71
- Construct the URL for fetching a specific chapter.
72
-
73
- :param book_id: The identifier of the book.
74
- :param chapter_id: The identifier of the chapter.
75
- :return: Fully qualified chapter URL.
76
- """
77
- return self._profile["chapter_url"].format(
78
- book_id=book_id, chapter_id=chapter_id
79
- )
@@ -1,79 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.core.fetchers.common.session
4
- ---------------------------------------------
5
-
6
- """
7
-
8
- from typing import Any
9
-
10
- from novel_downloader.core.fetchers.base import BaseSession
11
- from novel_downloader.models import FetcherConfig, SiteProfile
12
-
13
-
14
- class CommonSession(BaseSession):
15
- """
16
- A common async session for handling site-specific HTTP requests.
17
- """
18
-
19
- def __init__(
20
- self,
21
- site: str,
22
- profile: SiteProfile,
23
- config: FetcherConfig,
24
- cookies: dict[str, str] | None = None,
25
- **kwargs: Any,
26
- ) -> None:
27
- super().__init__(site, config, cookies, **kwargs)
28
- self._profile = profile
29
-
30
- async def get_book_info(
31
- self,
32
- book_id: str,
33
- **kwargs: Any,
34
- ) -> list[str]:
35
- """
36
- Fetch the raw HTML of the book info page asynchronously.
37
-
38
- :param book_id: The book identifier.
39
- :return: The page content as a string.
40
- """
41
- url = self.book_info_url(book_id=book_id)
42
- return [await self.fetch(url, **kwargs)]
43
-
44
- async def get_book_chapter(
45
- self,
46
- book_id: str,
47
- chapter_id: str,
48
- **kwargs: Any,
49
- ) -> list[str]:
50
- """
51
- Fetch the raw HTML of a single chapter asynchronously.
52
-
53
- :param book_id: The book identifier.
54
- :param chapter_id: The chapter identifier.
55
- :return: The chapter content as a string.
56
- """
57
- url = self.chapter_url(book_id=book_id, chapter_id=chapter_id)
58
- return [await self.fetch(url, **kwargs)]
59
-
60
- def book_info_url(self, book_id: str) -> str:
61
- """
62
- Construct the URL for fetching a book's info page.
63
-
64
- :param book_id: The identifier of the book.
65
- :return: Fully qualified URL for the book info page.
66
- """
67
- return self._profile["book_info_url"].format(book_id=book_id)
68
-
69
- def chapter_url(self, book_id: str, chapter_id: str) -> str:
70
- """
71
- Construct the URL for fetching a specific chapter.
72
-
73
- :param book_id: The identifier of the book.
74
- :param chapter_id: The identifier of the chapter.
75
- :return: Fully qualified chapter URL.
76
- """
77
- return self._profile["chapter_url"].format(
78
- book_id=book_id, chapter_id=chapter_id
79
- )
@@ -1,10 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.core.parsers.biquge
4
- ------------------------------------
5
-
6
- """
7
-
8
- from .main_parser import BiqugeParser
9
-
10
- __all__ = ["BiqugeParser"]
@@ -1,13 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.core.parsers.common
4
- ------------------------------------
5
-
6
- This module provides a CommonParser class that implements
7
- general-purpose parsing logic for extracting novel metadata
8
- and chapter content based on site-specific rules.
9
- """
10
-
11
- from .main_parser import CommonParser
12
-
13
- __all__ = ["CommonParser"]
@@ -1,323 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.core.parsers.common.helper
4
- -------------------------------------------
5
-
6
- Shared utility functions for parsing Common pages.
7
- """
8
-
9
- import logging
10
- import re
11
- from collections.abc import Iterable, Iterator
12
- from typing import Any, cast
13
-
14
- from bs4 import BeautifulSoup, Tag
15
-
16
- from novel_downloader.models import (
17
- BookInfoRules,
18
- FieldRules,
19
- RuleStep,
20
- VolumesRules,
21
- )
22
-
23
- logger = logging.getLogger(__name__)
24
-
25
-
26
- def html_to_soup(html_str: str) -> BeautifulSoup:
27
- """
28
- Convert an HTML string to a BeautifulSoup object with fallback.
29
-
30
- :param html_str: Raw HTML string.
31
- :return: Parsed BeautifulSoup object.
32
- """
33
- try:
34
- return BeautifulSoup(html_str, "lxml")
35
- except Exception as e:
36
- logger.warning("[Parser] lxml parse failed, falling back: %s", e)
37
- return BeautifulSoup(html_str, "html.parser")
38
-
39
-
40
- class HTMLExtractor:
41
- """
42
- HTML extraction engine that applies a sequence of RuleSteps to
43
- pull data out of a page.
44
- """
45
-
46
- def __init__(self, html: str):
47
- self._html = html
48
- self._soup = html_to_soup(html)
49
-
50
- def extract_book_info(self, rules: BookInfoRules) -> dict[str, Any]:
51
- """
52
- Extract structured book information from HTML according to the given rules.
53
-
54
- Only non-empty fields in the rules are processed.
55
-
56
- :param rules: Extraction configuration specifying how to extract.
57
- :return: A dictionary containing extracted book information.
58
- """
59
- book_info: dict[str, Any] = {}
60
-
61
- for field_name, field_rules in rules.items():
62
- if field_rules is None:
63
- continue
64
-
65
- if field_name == "volumes":
66
- book_info[field_name] = self.extract_volumes_structure(
67
- cast(VolumesRules, field_rules)
68
- )
69
- else:
70
- steps = cast(FieldRules, field_rules)["steps"]
71
- book_info[field_name] = self.extract_field(steps)
72
-
73
- return book_info
74
-
75
- def extract_field(self, steps: list[RuleStep]) -> str:
76
- """
77
- Execute a list of extraction steps on the given HTML.
78
-
79
- - If any step yields None, stops processing further steps.
80
- - At the end, always returns a str:
81
- * If current is a list, converts items to text and joins with '\n'.
82
- * If current is a Tag, extracts its .get_text().
83
- * Else, uses str().
84
- """
85
-
86
- def flatten_list(items: Iterable[Any]) -> Iterator[Any]:
87
- for item in items:
88
- if isinstance(item, list):
89
- yield from flatten_list(item)
90
- else:
91
- yield item
92
-
93
- def to_text(item: Any) -> str:
94
- if isinstance(item, Tag):
95
- return str(item.get_text().strip())
96
- return str(item).strip()
97
-
98
- current: Any = self._soup
99
-
100
- for step in steps:
101
- t = step.get("type")
102
- if t == "select_one":
103
- sel = step.get("selector")
104
- current = current.select_one(sel) if sel else None
105
-
106
- elif t == "select":
107
- sel = step.get("selector")
108
- lst = current.select(sel) if sel else []
109
- idx = step.get("index")
110
- current = lst[idx] if idx is not None and idx < len(lst) else lst
111
-
112
- elif t == "exclude":
113
- sel = step.get("selector")
114
- for elem in current.select(sel or ""):
115
- elem.decompose()
116
-
117
- elif t == "find":
118
- nm = step.get("name")
119
- attrs = step.get("attrs") or {}
120
- current = current.find(nm, attrs=attrs)
121
-
122
- elif t == "find_all":
123
- nm = step.get("name")
124
- attrs = step.get("attrs") or {}
125
- lst = current.find_all(nm, attrs=attrs, limit=step.get("limit"))
126
- idx = step.get("index")
127
- current = lst[idx] if idx is not None and idx < len(lst) else lst
128
-
129
- elif t == "text":
130
- if isinstance(current, list):
131
- current = [elem.get_text() for elem in current]
132
- elif isinstance(current, Tag):
133
- current = current.get_text()
134
-
135
- elif t == "strip":
136
- chars = step.get("chars")
137
- if isinstance(current, list):
138
- current = [c.strip(chars) for c in current]
139
- elif isinstance(current, str):
140
- current = current.strip(chars)
141
-
142
- elif t == "regex":
143
- txt = str(current or "")
144
- pat = step.get("pattern") or ""
145
- flags = step.get("flags")
146
- flags = flags if flags is not None else 0
147
- match = re.compile(pat, flags).search(txt)
148
- if match:
149
- template = step.get("template")
150
- if template:
151
- s = template
152
- for i in range(1, len(match.groups()) + 1):
153
- s = s.replace(f"${i}", match.group(i) or "")
154
- current = s
155
- else:
156
- grp = step.get("group")
157
- grp = grp if grp is not None else 0
158
- current = match.group(grp)
159
- else:
160
- current = ""
161
-
162
- elif t == "replace":
163
- old = step.get("old")
164
- old = old if old is not None else ""
165
-
166
- new = step.get("new")
167
- new = new if new is not None else ""
168
-
169
- cnt = step.get("count")
170
- cnt = cnt if cnt is not None else -1
171
-
172
- if isinstance(current, list):
173
- current = [c.replace(old, new, cnt) for c in current]
174
- elif isinstance(current, str):
175
- current = current.replace(old, new, cnt)
176
-
177
- elif t == "split":
178
- sep = step.get("sep", "")
179
- idx = step.get("index")
180
- idx = idx if idx is not None else 0
181
- parts = (current or "").split(sep)
182
- current = parts[idx] if idx < len(parts) else ""
183
-
184
- elif t == "join":
185
- sep = step.get("sep")
186
- sep = sep if sep is not None else ""
187
- if isinstance(current, list):
188
- current = sep.join(current)
189
-
190
- elif t == "attr":
191
- name = step.get("attr") or ""
192
- if isinstance(current, list):
193
- current = [elem.get(name, "") for elem in current]
194
- elif isinstance(current, Tag):
195
- current = current.get(name, "")
196
-
197
- else:
198
- raise ValueError(f"Unsupported step type: {t}")
199
-
200
- if current is None:
201
- break
202
-
203
- # Final normalization
204
- if isinstance(current, list):
205
- flat = list(flatten_list(current))
206
- texts = [to_text(x) for x in flat if x is not None]
207
- return "\n".join(texts)
208
- if isinstance(current, Tag):
209
- return str(current.get_text().strip())
210
- return str(current or "").strip()
211
-
212
- def extract_mixed_volumes(self, volume_rule: VolumesRules) -> list[dict[str, Any]]:
213
- """
214
- Special mode: mixed <volume> and <chapter> under same parent.
215
- (e.g., dt / dd pattern in BiQuGe)
216
- """
217
- list_selector = volume_rule.get("list_selector")
218
- volume_selector = volume_rule.get("volume_selector")
219
- volume_name_steps = volume_rule.get("volume_name_steps")
220
- chapter_selector = volume_rule["chapter_selector"]
221
- chapter_steps_list = volume_rule["chapter_steps"]
222
-
223
- if not (
224
- list_selector and volume_selector and chapter_selector and volume_name_steps
225
- ):
226
- raise ValueError(
227
- "volume_mode='mixed' 时, 必须提供 list_selector, volume_selector, "
228
- "chapter_selector 和 volume_name_steps"
229
- )
230
-
231
- volumes: list[dict[str, Any]] = []
232
- current_volume: dict[str, Any] | None = None
233
- if not chapter_steps_list:
234
- chapter_steps_list = []
235
- chapter_info_steps = {item["key"]: item["steps"] for item in chapter_steps_list}
236
-
237
- list_area = self._soup.select_one(list_selector)
238
- if not list_area:
239
- raise ValueError(f"找不到 list_selector: {list_selector}")
240
-
241
- for elem in list_area.find_all(
242
- [volume_selector, chapter_selector], recursive=True
243
- ):
244
- if not isinstance(elem, Tag):
245
- continue
246
- if elem.name == volume_selector:
247
- extractor = HTMLExtractor(str(elem))
248
- volume_name = extractor.extract_field(volume_name_steps)
249
- current_volume = {"volume_name": volume_name, "chapters": []}
250
- volumes.append(current_volume)
251
-
252
- elif elem.name == chapter_selector and current_volume is not None:
253
- chap_extractor = HTMLExtractor(str(elem))
254
- chapter_data = {}
255
- for field, steps in chapter_info_steps.items():
256
- chapter_data[field] = chap_extractor.extract_field(steps)
257
- current_volume["chapters"].append(chapter_data)
258
-
259
- return volumes
260
-
261
- def extract_volume_blocks(self, volume_rule: VolumesRules) -> list[dict[str, Any]]:
262
- volume_selector = volume_rule.get("volume_selector")
263
- volume_name_steps = volume_rule.get("volume_name_steps")
264
- chapter_selector = volume_rule["chapter_selector"]
265
- chapter_steps_list = volume_rule["chapter_steps"]
266
- if not (volume_selector and volume_name_steps):
267
- raise ValueError(
268
- "has_volume=True 时, 必须提供 volume_selector 和 volume_name_steps"
269
- )
270
- volumes = []
271
- chapter_info_steps = {item["key"]: item["steps"] for item in chapter_steps_list}
272
- for vol in self._soup.select(volume_selector):
273
- extractor = HTMLExtractor(str(vol))
274
- volume_name = extractor.extract_field(volume_name_steps)
275
-
276
- chapters = []
277
- for chap in vol.select(chapter_selector):
278
- chap_extractor = HTMLExtractor(str(chap))
279
- chapter_data = {}
280
- for field, steps in chapter_info_steps.items():
281
- chapter_data[field] = chap_extractor.extract_field(steps)
282
- chapters.append(chapter_data)
283
-
284
- volumes.append({"volume_name": volume_name, "chapters": chapters})
285
-
286
- return volumes
287
-
288
- def extract_flat_chapters(self, volume_rule: VolumesRules) -> list[dict[str, Any]]:
289
- chapter_selector = volume_rule["chapter_selector"]
290
- chapter_steps_list = volume_rule["chapter_steps"]
291
- volume_selector = volume_rule.get("volume_selector")
292
- volumes = []
293
- chapter_info_steps = {item["key"]: item["steps"] for item in chapter_steps_list}
294
-
295
- if volume_selector:
296
- candidates = self._soup.select(volume_selector)
297
- else:
298
- candidates = [self._soup]
299
-
300
- all_chapters = []
301
- for area in candidates:
302
- for chap in area.select(chapter_selector):
303
- chap_extractor = HTMLExtractor(str(chap))
304
- chapter_data = {}
305
- for field, steps in chapter_info_steps.items():
306
- chapter_data[field] = chap_extractor.extract_field(steps)
307
- all_chapters.append(chapter_data)
308
-
309
- volumes.append({"volume_name": "未分卷", "chapters": all_chapters})
310
-
311
- return volumes
312
-
313
- def extract_volumes_structure(
314
- self, volume_rule: VolumesRules
315
- ) -> list[dict[str, Any]]:
316
- volume_mode = volume_rule.get("volume_mode", "normal")
317
- if volume_mode == "mixed":
318
- return self.extract_mixed_volumes(volume_rule)
319
-
320
- if volume_rule.get("has_volume", True):
321
- return self.extract_volume_blocks(volume_rule)
322
- else:
323
- return self.extract_flat_chapters(volume_rule)
@@ -1,106 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.core.parsers.common.main_parser
4
- ------------------------------------------------
5
-
6
- This package provides parsing components for handling
7
- Common pages.
8
- """
9
-
10
- from typing import Any
11
-
12
- from novel_downloader.core.parsers.base import BaseParser
13
- from novel_downloader.models import (
14
- ChapterDict,
15
- ParserConfig,
16
- SiteRules,
17
- )
18
-
19
- # from .helper import HTMLExtractor
20
-
21
-
22
- class CommonParser(BaseParser):
23
- """
24
- CommonParser extends BaseParser to support site-specific parsing rules.
25
-
26
- It accepts additional site information and site-specific rules during initialization
27
- """
28
-
29
- def __init__(self, config: ParserConfig, site: str, site_rule: SiteRules):
30
- """
31
- Initialize the parser with configuration, site name, and site-specific rules.
32
-
33
- :param config: ParserConfig object controlling parsing behavior.
34
- :param site: Name of the site this parser is targeting.
35
- :param site_rule: SiteRules object containing parsing rules for the site.
36
- """
37
- super().__init__(config)
38
- self._site = site
39
- self._site_rule = site_rule
40
-
41
- def parse_book_info(
42
- self,
43
- html_list: list[str],
44
- **kwargs: Any,
45
- ) -> dict[str, Any]:
46
- """
47
- Parse a book info page and extract metadata and chapter structure.
48
-
49
- :param html_list: Raw HTML of the book info page.
50
- :return: Parsed metadata and chapter structure as a dictionary.
51
- """
52
- if not html_list:
53
- return {}
54
- # extractor = HTMLExtractor(html_list[0])
55
- # rules = self._site_rule["book_info"]
56
- # return extractor.extract_book_info(rules)
57
- return {}
58
-
59
- def parse_chapter(
60
- self,
61
- html_list: list[str],
62
- chapter_id: str,
63
- **kwargs: Any,
64
- ) -> ChapterDict | None:
65
- """
66
- Parse a single chapter page and extract clean text or simplified HTML.
67
-
68
- :param html_list: Raw HTML of the chapter page.
69
- :param chapter_id: Identifier of the chapter being parsed.
70
- :return: Cleaned chapter content as plain text or minimal HTML.
71
- """
72
- if not html_list:
73
- return None
74
- # extractor = HTMLExtractor(html_list[0])
75
- # chapter_rules = self._site_rule["chapter"]
76
-
77
- # # 必须有正文内容
78
- # content_steps = chapter_rules.get("content")
79
- # if not content_steps:
80
- # raise ValueError(f"No chapter content steps for site: {self._site}")
81
-
82
- # title_steps = chapter_rules.get("title")
83
- # title = extractor.extract_field(title_steps["steps"]) if title_steps else ""
84
- # content = extractor.extract_field(content_steps["steps"])
85
- # if not content:
86
- # return None
87
-
88
- # return {
89
- # "id": chapter_id,
90
- # "title": title or "Untitled",
91
- # "content": content,
92
- # "extra": {
93
- # "site": self._site,
94
- # },
95
- # }
96
- return None
97
-
98
- @property
99
- def site(self) -> str:
100
- """Return the site name."""
101
- return self._site
102
-
103
- @property
104
- def site_rule(self) -> SiteRules:
105
- """Return the site-specific rules."""
106
- return self._site_rule
@@ -1,10 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.core.parsers.esjzone
4
- -------------------------------------
5
-
6
- """
7
-
8
- from .main_parser import EsjzoneParser
9
-
10
- __all__ = ["EsjzoneParser"]
@@ -1,10 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.core.parsers.linovelib
4
- ---------------------------------------
5
-
6
- """
7
-
8
- from .main_parser import LinovelibParser
9
-
10
- __all__ = ["LinovelibParser"]
@@ -1,10 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.core.parsers.qianbi
4
- ------------------------------------
5
-
6
- """
7
-
8
- from .main_parser import QianbiParser
9
-
10
- __all__ = ["QianbiParser"]
@@ -1,10 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.core.parsers.sfacg
4
- -----------------------------------
5
-
6
- """
7
-
8
- from .main_parser import SfacgParser
9
-
10
- __all__ = ["SfacgParser"]
@@ -1,10 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.core.parsers.yamibo
4
- ------------------------------------
5
-
6
- """
7
-
8
- from .main_parser import YamiboParser
9
-
10
- __all__ = ["YamiboParser"]
@@ -1,21 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.models.browser
4
- -------------------------------
5
-
6
- """
7
-
8
- from pathlib import Path
9
- from typing import TypedDict
10
-
11
- from playwright.async_api import ViewportSize
12
-
13
-
14
- class NewContextOptions(TypedDict, total=False):
15
- user_agent: str
16
- locale: str
17
- storage_state: Path
18
- viewport: ViewportSize
19
- java_script_enabled: bool
20
- ignore_https_errors: bool
21
- extra_http_headers: dict[str, str]