novel-downloader 1.4.5__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/__init__.py +2 -4
- novel_downloader/cli/clean.py +21 -88
- novel_downloader/cli/config.py +27 -104
- novel_downloader/cli/download.py +78 -66
- novel_downloader/cli/export.py +20 -21
- novel_downloader/cli/main.py +3 -1
- novel_downloader/cli/search.py +120 -0
- novel_downloader/cli/ui.py +156 -0
- novel_downloader/config/__init__.py +10 -14
- novel_downloader/config/adapter.py +195 -99
- novel_downloader/config/{loader.py → file_io.py} +53 -27
- novel_downloader/core/__init__.py +14 -13
- novel_downloader/core/archived/deqixs/fetcher.py +115 -0
- novel_downloader/core/archived/deqixs/parser.py +132 -0
- novel_downloader/core/archived/deqixs/searcher.py +89 -0
- novel_downloader/core/archived/qidian/searcher.py +79 -0
- novel_downloader/core/archived/wanbengo/searcher.py +98 -0
- novel_downloader/core/archived/xshbook/searcher.py +93 -0
- novel_downloader/core/downloaders/__init__.py +8 -30
- novel_downloader/core/downloaders/base.py +182 -30
- novel_downloader/core/downloaders/common.py +217 -384
- novel_downloader/core/downloaders/qianbi.py +332 -4
- novel_downloader/core/downloaders/qidian.py +250 -290
- novel_downloader/core/downloaders/registry.py +69 -0
- novel_downloader/core/downloaders/signals.py +46 -0
- novel_downloader/core/exporters/__init__.py +8 -26
- novel_downloader/core/exporters/base.py +107 -31
- novel_downloader/core/exporters/common/__init__.py +3 -4
- novel_downloader/core/exporters/common/epub.py +92 -171
- novel_downloader/core/exporters/common/main_exporter.py +14 -67
- novel_downloader/core/exporters/common/txt.py +90 -86
- novel_downloader/core/exporters/epub_util.py +184 -1327
- novel_downloader/core/exporters/linovelib/__init__.py +3 -2
- novel_downloader/core/exporters/linovelib/epub.py +165 -222
- novel_downloader/core/exporters/linovelib/main_exporter.py +10 -71
- novel_downloader/core/exporters/linovelib/txt.py +76 -66
- novel_downloader/core/exporters/qidian.py +15 -11
- novel_downloader/core/exporters/registry.py +55 -0
- novel_downloader/core/exporters/txt_util.py +67 -0
- novel_downloader/core/fetchers/__init__.py +57 -56
- novel_downloader/core/fetchers/aaatxt.py +83 -0
- novel_downloader/core/fetchers/{biquge/session.py → b520.py} +10 -10
- novel_downloader/core/fetchers/{base/session.py → base.py} +63 -47
- novel_downloader/core/fetchers/biquyuedu.py +83 -0
- novel_downloader/core/fetchers/dxmwx.py +110 -0
- novel_downloader/core/fetchers/eightnovel.py +139 -0
- novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +23 -11
- novel_downloader/core/fetchers/guidaye.py +85 -0
- novel_downloader/core/fetchers/hetushu.py +92 -0
- novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +22 -26
- novel_downloader/core/fetchers/ixdzs8.py +113 -0
- novel_downloader/core/fetchers/jpxs123.py +101 -0
- novel_downloader/core/fetchers/{biquge/browser.py → lewenn.py} +15 -15
- novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +16 -12
- novel_downloader/core/fetchers/piaotia.py +105 -0
- novel_downloader/core/fetchers/qbtr.py +101 -0
- novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +9 -9
- novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +55 -40
- novel_downloader/core/fetchers/quanben5.py +92 -0
- novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
- novel_downloader/core/fetchers/registry.py +60 -0
- novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +11 -9
- novel_downloader/core/fetchers/shencou.py +106 -0
- novel_downloader/core/fetchers/{common/browser.py → shuhaige.py} +24 -19
- novel_downloader/core/fetchers/tongrenquan.py +84 -0
- novel_downloader/core/fetchers/ttkan.py +95 -0
- novel_downloader/core/fetchers/{common/session.py → wanbengo.py} +21 -17
- novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
- novel_downloader/core/fetchers/xiguashuwu.py +177 -0
- novel_downloader/core/fetchers/xs63b.py +171 -0
- novel_downloader/core/fetchers/xshbook.py +85 -0
- novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +23 -11
- novel_downloader/core/fetchers/yibige.py +114 -0
- novel_downloader/core/interfaces/__init__.py +8 -14
- novel_downloader/core/interfaces/downloader.py +6 -2
- novel_downloader/core/interfaces/exporter.py +7 -7
- novel_downloader/core/interfaces/fetcher.py +4 -17
- novel_downloader/core/interfaces/parser.py +5 -6
- novel_downloader/core/interfaces/searcher.py +26 -0
- novel_downloader/core/parsers/__init__.py +58 -22
- novel_downloader/core/parsers/aaatxt.py +132 -0
- novel_downloader/core/parsers/b520.py +116 -0
- novel_downloader/core/parsers/base.py +63 -12
- novel_downloader/core/parsers/biquyuedu.py +133 -0
- novel_downloader/core/parsers/dxmwx.py +162 -0
- novel_downloader/core/parsers/eightnovel.py +224 -0
- novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +67 -67
- novel_downloader/core/parsers/guidaye.py +128 -0
- novel_downloader/core/parsers/hetushu.py +139 -0
- novel_downloader/core/parsers/i25zw.py +137 -0
- novel_downloader/core/parsers/ixdzs8.py +186 -0
- novel_downloader/core/parsers/jpxs123.py +137 -0
- novel_downloader/core/parsers/lewenn.py +142 -0
- novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +54 -65
- novel_downloader/core/parsers/piaotia.py +189 -0
- novel_downloader/core/parsers/qbtr.py +136 -0
- novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +54 -51
- novel_downloader/core/parsers/qidian/__init__.py +2 -2
- novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +290 -346
- novel_downloader/core/parsers/qidian/chapter_normal.py +25 -56
- novel_downloader/core/parsers/qidian/main_parser.py +19 -57
- novel_downloader/core/parsers/qidian/utils/__init__.py +12 -11
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +6 -7
- novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
- novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
- novel_downloader/core/parsers/quanben5.py +103 -0
- novel_downloader/core/parsers/registry.py +57 -0
- novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +46 -48
- novel_downloader/core/parsers/shencou.py +215 -0
- novel_downloader/core/parsers/shuhaige.py +111 -0
- novel_downloader/core/parsers/tongrenquan.py +116 -0
- novel_downloader/core/parsers/ttkan.py +132 -0
- novel_downloader/core/parsers/wanbengo.py +191 -0
- novel_downloader/core/parsers/xiaoshuowu.py +173 -0
- novel_downloader/core/parsers/xiguashuwu.py +435 -0
- novel_downloader/core/parsers/xs63b.py +161 -0
- novel_downloader/core/parsers/xshbook.py +134 -0
- novel_downloader/core/parsers/yamibo.py +155 -0
- novel_downloader/core/parsers/yibige.py +166 -0
- novel_downloader/core/searchers/__init__.py +51 -0
- novel_downloader/core/searchers/aaatxt.py +107 -0
- novel_downloader/core/searchers/b520.py +84 -0
- novel_downloader/core/searchers/base.py +168 -0
- novel_downloader/core/searchers/dxmwx.py +105 -0
- novel_downloader/core/searchers/eightnovel.py +84 -0
- novel_downloader/core/searchers/esjzone.py +102 -0
- novel_downloader/core/searchers/hetushu.py +92 -0
- novel_downloader/core/searchers/i25zw.py +93 -0
- novel_downloader/core/searchers/ixdzs8.py +107 -0
- novel_downloader/core/searchers/jpxs123.py +107 -0
- novel_downloader/core/searchers/piaotia.py +100 -0
- novel_downloader/core/searchers/qbtr.py +106 -0
- novel_downloader/core/searchers/qianbi.py +165 -0
- novel_downloader/core/searchers/quanben5.py +144 -0
- novel_downloader/core/searchers/registry.py +79 -0
- novel_downloader/core/searchers/shuhaige.py +124 -0
- novel_downloader/core/searchers/tongrenquan.py +110 -0
- novel_downloader/core/searchers/ttkan.py +92 -0
- novel_downloader/core/searchers/xiaoshuowu.py +122 -0
- novel_downloader/core/searchers/xiguashuwu.py +95 -0
- novel_downloader/core/searchers/xs63b.py +104 -0
- novel_downloader/locales/en.json +36 -79
- novel_downloader/locales/zh.json +37 -80
- novel_downloader/models/__init__.py +23 -50
- novel_downloader/models/book.py +44 -0
- novel_downloader/models/config.py +16 -43
- novel_downloader/models/login.py +1 -1
- novel_downloader/models/search.py +21 -0
- novel_downloader/resources/config/settings.toml +39 -74
- novel_downloader/resources/css_styles/intro.css +83 -0
- novel_downloader/resources/css_styles/main.css +30 -89
- novel_downloader/resources/json/xiguashuwu.json +718 -0
- novel_downloader/utils/__init__.py +43 -0
- novel_downloader/utils/chapter_storage.py +247 -226
- novel_downloader/utils/constants.py +5 -50
- novel_downloader/utils/cookies.py +6 -18
- novel_downloader/utils/crypto_utils/__init__.py +13 -0
- novel_downloader/utils/crypto_utils/aes_util.py +90 -0
- novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
- novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
- novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
- novel_downloader/utils/epub/__init__.py +34 -0
- novel_downloader/utils/epub/builder.py +377 -0
- novel_downloader/utils/epub/constants.py +118 -0
- novel_downloader/utils/epub/documents.py +297 -0
- novel_downloader/utils/epub/models.py +120 -0
- novel_downloader/utils/epub/utils.py +179 -0
- novel_downloader/utils/file_utils/__init__.py +5 -30
- novel_downloader/utils/file_utils/io.py +9 -150
- novel_downloader/utils/file_utils/normalize.py +2 -2
- novel_downloader/utils/file_utils/sanitize.py +2 -7
- novel_downloader/utils/fontocr.py +207 -0
- novel_downloader/utils/i18n.py +2 -0
- novel_downloader/utils/logger.py +10 -16
- novel_downloader/utils/network.py +111 -252
- novel_downloader/utils/state.py +5 -90
- novel_downloader/utils/text_utils/__init__.py +16 -21
- novel_downloader/utils/text_utils/diff_display.py +6 -9
- novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
- novel_downloader/utils/text_utils/text_cleaner.py +179 -0
- novel_downloader/utils/text_utils/truncate_utils.py +62 -0
- novel_downloader/utils/time_utils/__init__.py +6 -12
- novel_downloader/utils/time_utils/datetime_utils.py +23 -33
- novel_downloader/utils/time_utils/sleep_utils.py +5 -10
- novel_downloader/web/__init__.py +13 -0
- novel_downloader/web/components/__init__.py +11 -0
- novel_downloader/web/components/navigation.py +35 -0
- novel_downloader/web/main.py +66 -0
- novel_downloader/web/pages/__init__.py +17 -0
- novel_downloader/web/pages/download.py +78 -0
- novel_downloader/web/pages/progress.py +147 -0
- novel_downloader/web/pages/search.py +329 -0
- novel_downloader/web/services/__init__.py +17 -0
- novel_downloader/web/services/client_dialog.py +164 -0
- novel_downloader/web/services/cred_broker.py +113 -0
- novel_downloader/web/services/cred_models.py +35 -0
- novel_downloader/web/services/task_manager.py +264 -0
- novel_downloader-2.0.0.dist-info/METADATA +171 -0
- novel_downloader-2.0.0.dist-info/RECORD +210 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
- novel_downloader/config/site_rules.py +0 -94
- novel_downloader/core/downloaders/biquge.py +0 -25
- novel_downloader/core/downloaders/esjzone.py +0 -25
- novel_downloader/core/downloaders/linovelib.py +0 -25
- novel_downloader/core/downloaders/sfacg.py +0 -25
- novel_downloader/core/downloaders/yamibo.py +0 -25
- novel_downloader/core/exporters/biquge.py +0 -25
- novel_downloader/core/exporters/esjzone.py +0 -25
- novel_downloader/core/exporters/qianbi.py +0 -25
- novel_downloader/core/exporters/sfacg.py +0 -25
- novel_downloader/core/exporters/yamibo.py +0 -25
- novel_downloader/core/factory/__init__.py +0 -20
- novel_downloader/core/factory/downloader.py +0 -73
- novel_downloader/core/factory/exporter.py +0 -58
- novel_downloader/core/factory/fetcher.py +0 -96
- novel_downloader/core/factory/parser.py +0 -86
- novel_downloader/core/fetchers/base/__init__.py +0 -14
- novel_downloader/core/fetchers/base/browser.py +0 -403
- novel_downloader/core/fetchers/biquge/__init__.py +0 -14
- novel_downloader/core/fetchers/common/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/browser.py +0 -204
- novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
- novel_downloader/core/fetchers/linovelib/browser.py +0 -193
- novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/browser.py +0 -318
- novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
- novel_downloader/core/fetchers/sfacg/browser.py +0 -189
- novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
- novel_downloader/core/fetchers/yamibo/browser.py +0 -229
- novel_downloader/core/parsers/biquge/__init__.py +0 -10
- novel_downloader/core/parsers/biquge/main_parser.py +0 -134
- novel_downloader/core/parsers/common/__init__.py +0 -13
- novel_downloader/core/parsers/common/helper.py +0 -323
- novel_downloader/core/parsers/common/main_parser.py +0 -106
- novel_downloader/core/parsers/esjzone/__init__.py +0 -10
- novel_downloader/core/parsers/linovelib/__init__.py +0 -10
- novel_downloader/core/parsers/qianbi/__init__.py +0 -10
- novel_downloader/core/parsers/sfacg/__init__.py +0 -10
- novel_downloader/core/parsers/yamibo/__init__.py +0 -10
- novel_downloader/core/parsers/yamibo/main_parser.py +0 -194
- novel_downloader/models/browser.py +0 -21
- novel_downloader/models/chapter.py +0 -25
- novel_downloader/models/site_rules.py +0 -99
- novel_downloader/models/tasks.py +0 -33
- novel_downloader/models/types.py +0 -15
- novel_downloader/resources/css_styles/volume-intro.css +0 -56
- novel_downloader/resources/json/replace_word_map.json +0 -4
- novel_downloader/resources/text/blacklist.txt +0 -22
- novel_downloader/tui/__init__.py +0 -7
- novel_downloader/tui/app.py +0 -32
- novel_downloader/tui/main.py +0 -17
- novel_downloader/tui/screens/__init__.py +0 -14
- novel_downloader/tui/screens/home.py +0 -198
- novel_downloader/tui/screens/login.py +0 -74
- novel_downloader/tui/styles/home_layout.tcss +0 -79
- novel_downloader/tui/widgets/richlog_handler.py +0 -24
- novel_downloader/utils/cache.py +0 -24
- novel_downloader/utils/fontocr/__init__.py +0 -22
- novel_downloader/utils/fontocr/model_loader.py +0 -69
- novel_downloader/utils/fontocr/ocr_v1.py +0 -303
- novel_downloader/utils/fontocr/ocr_v2.py +0 -752
- novel_downloader/utils/hash_store.py +0 -279
- novel_downloader/utils/hash_utils.py +0 -103
- novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
- novel_downloader/utils/text_utils/font_mapping.py +0 -28
- novel_downloader/utils/text_utils/text_cleaning.py +0 -107
- novel_downloader-1.4.5.dist-info/METADATA +0 -196
- novel_downloader-1.4.5.dist-info/RECORD +0 -165
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
2
|
"""
|
3
|
-
novel_downloader.core.parsers.sfacg
|
4
|
-
|
3
|
+
novel_downloader.core.parsers.sfacg
|
4
|
+
-----------------------------------
|
5
5
|
|
6
6
|
"""
|
7
7
|
|
@@ -10,18 +10,33 @@ from typing import Any
|
|
10
10
|
from lxml import html
|
11
11
|
|
12
12
|
from novel_downloader.core.parsers.base import BaseParser
|
13
|
-
from novel_downloader.
|
14
|
-
|
15
|
-
|
13
|
+
from novel_downloader.core.parsers.registry import register_parser
|
14
|
+
from novel_downloader.models import (
|
15
|
+
BookInfoDict,
|
16
|
+
ChapterDict,
|
17
|
+
ChapterInfoDict,
|
18
|
+
VolumeInfoDict,
|
19
|
+
)
|
20
|
+
|
21
|
+
|
22
|
+
@register_parser(
|
23
|
+
site_keys=["sfacg"],
|
24
|
+
)
|
16
25
|
class SfacgParser(BaseParser):
|
17
|
-
"""
|
26
|
+
"""
|
27
|
+
Parser for sfacg book pages.
|
28
|
+
"""
|
18
29
|
|
19
30
|
# Book info XPaths
|
20
31
|
_BOOK_NAME_XPATH = '//ul[@class="book_info"]//span[@class="book_newtitle"]/text()'
|
21
32
|
_AUTHOR_INFO_XPATH = '//ul[@class="book_info"]//span[@class="book_info3"]/text()'
|
22
33
|
_UPDATE_TIME_XPATH = '//ul[@class="book_info"]//span[@class="book_info3"]/br/following-sibling::text()' # noqa: E501
|
23
34
|
_COVER_URL_XPATH = '//ul[@class="book_info"]//li/img/@src'
|
24
|
-
_STATUS_XPATH = '//ul[@class="book_info"]//div[@class="book_info2"]/span/text()'
|
35
|
+
# _STATUS_XPATH = '//ul[@class="book_info"]//div[@class="book_info2"]/span/text()'
|
36
|
+
_STATUS_XPATH = (
|
37
|
+
'//ul[@class="book_info"]//div[@class="book_info2"]'
|
38
|
+
'/span[contains(., "完结") or contains(., "连载")]/text()'
|
39
|
+
)
|
25
40
|
_SUMMARY_XPATH = '//ul[@class="book_profile"]/li[@class="book_bk_qs1"]/text()'
|
26
41
|
|
27
42
|
# Catalog XPaths
|
@@ -42,54 +57,35 @@ class SfacgParser(BaseParser):
|
|
42
57
|
self,
|
43
58
|
html_list: list[str],
|
44
59
|
**kwargs: Any,
|
45
|
-
) ->
|
46
|
-
"""
|
47
|
-
Parse a book info page and extract metadata and chapter structure.
|
48
|
-
|
49
|
-
:param html_list: Raw HTML of the book info page.
|
50
|
-
:return: Parsed metadata and chapter structure as a dictionary.
|
51
|
-
"""
|
60
|
+
) -> BookInfoDict | None:
|
52
61
|
if len(html_list) < 2:
|
53
|
-
return
|
62
|
+
return None
|
54
63
|
|
55
64
|
info_tree = html.fromstring(html_list[0])
|
56
65
|
catalog_tree = html.fromstring(html_list[1])
|
57
66
|
|
58
|
-
result: dict[str, Any] = {}
|
59
|
-
|
60
67
|
# Book metadata
|
61
|
-
book_name = info_tree.xpath(self._BOOK_NAME_XPATH)
|
62
|
-
result["book_name"] = book_name[0].strip() if book_name else ""
|
68
|
+
book_name = self._first_str(info_tree.xpath(self._BOOK_NAME_XPATH))
|
63
69
|
|
64
|
-
|
65
|
-
|
66
|
-
result["word_count"] = (
|
67
|
-
book_info3[0].split("/")[1].strip()
|
68
|
-
if book_info3 and len(book_info3[0].split("/")) > 1
|
69
|
-
else ""
|
70
|
-
)
|
70
|
+
book_info3_str = self._first_str(info_tree.xpath(self._AUTHOR_INFO_XPATH))
|
71
|
+
author, _, word_count = (p.strip() for p in book_info3_str.partition("/"))
|
71
72
|
|
72
|
-
|
73
|
-
result["update_time"] = book_info3_br[0].strip() if book_info3_br else ""
|
73
|
+
update_time = self._first_str(info_tree.xpath(self._UPDATE_TIME_XPATH))
|
74
74
|
|
75
|
-
cover_url = info_tree.xpath(self._COVER_URL_XPATH)
|
76
|
-
result["cover_url"] = "https:" + cover_url[0] if cover_url else ""
|
75
|
+
cover_url = "https:" + self._first_str(info_tree.xpath(self._COVER_URL_XPATH))
|
77
76
|
|
78
|
-
serial_status = info_tree.xpath(self._STATUS_XPATH)
|
79
|
-
result["serial_status"] = next(
|
80
|
-
(s for s in serial_status if "完结" in s or "连载" in s), ""
|
81
|
-
)
|
77
|
+
serial_status = self._first_str(info_tree.xpath(self._STATUS_XPATH))
|
82
78
|
|
83
|
-
|
84
|
-
|
79
|
+
summary_elem = info_tree.xpath(self._SUMMARY_XPATH)
|
80
|
+
summary = "".join(summary_elem).strip()
|
85
81
|
|
86
82
|
# Chapter structure
|
87
83
|
volume_titles = catalog_tree.xpath(self._VOLUME_TITLE_XPATH)
|
88
84
|
volume_blocks = catalog_tree.xpath(self._VOLUME_CONTENT_XPATH)
|
89
85
|
|
90
|
-
volumes = []
|
86
|
+
volumes: list[VolumeInfoDict] = []
|
91
87
|
for vol_title, vol_block in zip(volume_titles, volume_blocks, strict=False):
|
92
|
-
chapters = []
|
88
|
+
chapters: list[ChapterInfoDict] = []
|
93
89
|
for a in vol_block.xpath(self._CHAPTER_LIST_XPATH):
|
94
90
|
href = a.xpath("./@href")[0] if a.xpath("./@href") else ""
|
95
91
|
title = "".join(a.xpath(".//li//text()")).strip()
|
@@ -107,9 +103,18 @@ class SfacgParser(BaseParser):
|
|
107
103
|
"chapters": chapters,
|
108
104
|
}
|
109
105
|
)
|
110
|
-
result["volumes"] = volumes
|
111
106
|
|
112
|
-
return
|
107
|
+
return {
|
108
|
+
"book_name": book_name,
|
109
|
+
"author": author,
|
110
|
+
"cover_url": cover_url,
|
111
|
+
"update_time": update_time,
|
112
|
+
"word_count": word_count,
|
113
|
+
"serial_status": serial_status,
|
114
|
+
"summary": summary,
|
115
|
+
"volumes": volumes,
|
116
|
+
"extra": {},
|
117
|
+
}
|
113
118
|
|
114
119
|
def parse_chapter(
|
115
120
|
self,
|
@@ -117,13 +122,6 @@ class SfacgParser(BaseParser):
|
|
117
122
|
chapter_id: str,
|
118
123
|
**kwargs: Any,
|
119
124
|
) -> ChapterDict | None:
|
120
|
-
"""
|
121
|
-
Parse a single chapter page and extract clean text or simplified HTML.
|
122
|
-
|
123
|
-
:param html_list: Raw HTML of the chapter page.
|
124
|
-
:param chapter_id: Identifier of the chapter being parsed.
|
125
|
-
:return: Cleaned chapter content as plain text or minimal HTML.
|
126
|
-
"""
|
127
125
|
if not html_list:
|
128
126
|
return None
|
129
127
|
keywords = [
|
@@ -151,7 +149,7 @@ class SfacgParser(BaseParser):
|
|
151
149
|
raw_text_parts = tree.xpath(self._CHAPTER_TEXT_XPATH)
|
152
150
|
content_lines = [txt.strip() for txt in raw_text_parts if txt.strip()]
|
153
151
|
|
154
|
-
content = "\n
|
152
|
+
content = "\n".join(content_lines).strip()
|
155
153
|
if not content:
|
156
154
|
return None
|
157
155
|
|
@@ -0,0 +1,215 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.shencou
|
4
|
+
-------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
from typing import Any
|
9
|
+
|
10
|
+
from lxml import etree, html
|
11
|
+
|
12
|
+
from novel_downloader.core.parsers.base import BaseParser
|
13
|
+
from novel_downloader.core.parsers.registry import register_parser
|
14
|
+
from novel_downloader.models import (
|
15
|
+
BookInfoDict,
|
16
|
+
ChapterDict,
|
17
|
+
VolumeInfoDict,
|
18
|
+
)
|
19
|
+
|
20
|
+
|
21
|
+
@register_parser(
|
22
|
+
site_keys=["shencou"],
|
23
|
+
)
|
24
|
+
class ShencouParser(BaseParser):
|
25
|
+
"""
|
26
|
+
Parser for 神凑轻小说 book pages.
|
27
|
+
"""
|
28
|
+
|
29
|
+
def parse_book_info(
|
30
|
+
self,
|
31
|
+
html_list: list[str],
|
32
|
+
**kwargs: Any,
|
33
|
+
) -> BookInfoDict | None:
|
34
|
+
if len(html_list) < 2:
|
35
|
+
return None
|
36
|
+
|
37
|
+
info_tree = html.fromstring(html_list[0])
|
38
|
+
catalog_tree = html.fromstring(html_list[1])
|
39
|
+
|
40
|
+
# --- Metadata ---
|
41
|
+
raw_name = self._first_str(info_tree.xpath("//span//a/text()"))
|
42
|
+
book_name = raw_name[:-2] if raw_name.endswith("小说") else raw_name
|
43
|
+
|
44
|
+
author = self._first_str(
|
45
|
+
info_tree.xpath('//td[contains(text(),"小说作者")]/text()'),
|
46
|
+
replaces=[("小说作者:", "")],
|
47
|
+
)
|
48
|
+
|
49
|
+
cover_url = self._first_str(
|
50
|
+
info_tree.xpath('//a[contains(@href,"/files/article/image")]/img/@src')
|
51
|
+
)
|
52
|
+
|
53
|
+
# word count
|
54
|
+
word_count = self._first_str(
|
55
|
+
info_tree.xpath('//td[contains(text(),"全文长度")]/text()'),
|
56
|
+
replaces=[("全文长度:", "")],
|
57
|
+
)
|
58
|
+
|
59
|
+
# update time
|
60
|
+
update_time = self._first_str(
|
61
|
+
info_tree.xpath('//td[contains(text(),"最后更新")]/text()'),
|
62
|
+
replaces=[("最后更新:", "")],
|
63
|
+
)
|
64
|
+
|
65
|
+
# serial status
|
66
|
+
serial_status = self._first_str(
|
67
|
+
info_tree.xpath('//td[contains(text(),"写作进度")]/text()'),
|
68
|
+
replaces=[("写作进度:", "")],
|
69
|
+
)
|
70
|
+
|
71
|
+
# summary
|
72
|
+
raw_detail = self._norm_space(
|
73
|
+
info_tree.xpath('string(//td[@width="80%" and @valign="top"])')
|
74
|
+
)
|
75
|
+
summary = ""
|
76
|
+
if "内容简介:" in raw_detail and "本书公告:" in raw_detail:
|
77
|
+
intro = raw_detail.split("内容简介:", 1)[1]
|
78
|
+
summary = intro.split("本书公告:", 1)[0].strip()
|
79
|
+
|
80
|
+
# --- Catalog / Chapters ---
|
81
|
+
volumes: list[VolumeInfoDict] = []
|
82
|
+
curr_vol: VolumeInfoDict = {"volume_name": "未命名卷", "chapters": []}
|
83
|
+
|
84
|
+
# Walk through volume headers (.zjbox) and lists (.zjlist4) in document order
|
85
|
+
for elem in catalog_tree.xpath(
|
86
|
+
'//div[@class="zjbox"] | //div[@class="zjlist4"]'
|
87
|
+
):
|
88
|
+
cls_attr = elem.get("class", "")
|
89
|
+
if "zjbox" in cls_attr:
|
90
|
+
# before starting new volume, save the previous if it has chapters
|
91
|
+
if curr_vol["chapters"]:
|
92
|
+
volumes.append(curr_vol)
|
93
|
+
# start a new volume
|
94
|
+
vol_name = elem.xpath(".//h2/text()")[0].strip()
|
95
|
+
curr_vol = {"volume_name": vol_name, "chapters": []}
|
96
|
+
elif "zjlist4" in cls_attr:
|
97
|
+
# collect all <li><a> entries under this list
|
98
|
+
for a in elem.xpath(".//ol/li/a"):
|
99
|
+
url = a.get("href").strip()
|
100
|
+
title = a.text_content().strip()
|
101
|
+
# '203740.html' -> '203740'
|
102
|
+
chap_id = url.split(".")[0]
|
103
|
+
curr_vol["chapters"].append(
|
104
|
+
{
|
105
|
+
"title": title,
|
106
|
+
"url": url,
|
107
|
+
"chapterId": chap_id,
|
108
|
+
}
|
109
|
+
)
|
110
|
+
|
111
|
+
# append last volume if not empty
|
112
|
+
if curr_vol["chapters"]:
|
113
|
+
volumes.append(curr_vol)
|
114
|
+
|
115
|
+
return {
|
116
|
+
"book_name": book_name,
|
117
|
+
"author": author,
|
118
|
+
"cover_url": cover_url,
|
119
|
+
"update_time": update_time,
|
120
|
+
"summary": summary,
|
121
|
+
"volumes": volumes,
|
122
|
+
"word_count": word_count,
|
123
|
+
"serial_status": serial_status,
|
124
|
+
"extra": {},
|
125
|
+
}
|
126
|
+
|
127
|
+
def parse_chapter(
|
128
|
+
self,
|
129
|
+
html_list: list[str],
|
130
|
+
chapter_id: str,
|
131
|
+
**kwargs: Any,
|
132
|
+
) -> ChapterDict | None:
|
133
|
+
if not html_list:
|
134
|
+
return None
|
135
|
+
|
136
|
+
tree = html.fromstring(html_list[0])
|
137
|
+
title = self._first_str(tree.xpath("//h1/text()"))
|
138
|
+
if not title:
|
139
|
+
return None
|
140
|
+
|
141
|
+
# strip book-name prefix if present
|
142
|
+
bc = tree.xpath('//div[@id="breadCrumb"]//a/text()')
|
143
|
+
if len(bc) >= 2:
|
144
|
+
book_name = bc[1].strip()
|
145
|
+
title = title.removeprefix(book_name).lstrip(" ::–—-").strip()
|
146
|
+
|
147
|
+
anchors = tree.xpath('//div[@id="BookSee_Right"]')
|
148
|
+
if not anchors:
|
149
|
+
return None
|
150
|
+
marker = anchors[0]
|
151
|
+
|
152
|
+
lines: list[str] = []
|
153
|
+
|
154
|
+
def _append_text(text: str) -> None:
|
155
|
+
for ln in text.replace("\xa0", " ").splitlines():
|
156
|
+
ln2 = ln.strip()
|
157
|
+
if ln2:
|
158
|
+
lines.append(ln2)
|
159
|
+
|
160
|
+
if marker.tail:
|
161
|
+
_append_text(marker.tail)
|
162
|
+
|
163
|
+
# 4. Walk through siblings until <!--over-->
|
164
|
+
node = marker
|
165
|
+
while True:
|
166
|
+
sib = node.getnext()
|
167
|
+
if sib is None:
|
168
|
+
break
|
169
|
+
node = sib
|
170
|
+
|
171
|
+
# Stop on the closing comment
|
172
|
+
if isinstance(sib, etree._Comment) and "over" in (sib.text or ""):
|
173
|
+
break
|
174
|
+
|
175
|
+
# Process comment tails (e.g. <!--go--> tail)
|
176
|
+
if isinstance(sib, etree._Comment):
|
177
|
+
if sib.tail:
|
178
|
+
_append_text(sib.tail)
|
179
|
+
continue
|
180
|
+
|
181
|
+
if isinstance(sib, html.HtmlElement):
|
182
|
+
# tag = sib.tag.lower()
|
183
|
+
tag = str(sib.tag).lower()
|
184
|
+
cls = sib.get("class", "") or ""
|
185
|
+
|
186
|
+
if tag == "div" and "divimage" in cls:
|
187
|
+
srcs = sib.xpath(".//img/@src")
|
188
|
+
if srcs:
|
189
|
+
lines.append(f'<img src="{srcs[0]}" />')
|
190
|
+
# text after the div
|
191
|
+
if sib.tail:
|
192
|
+
_append_text(sib.tail)
|
193
|
+
continue
|
194
|
+
|
195
|
+
if tag == "br":
|
196
|
+
if sib.tail:
|
197
|
+
_append_text(sib.tail)
|
198
|
+
continue
|
199
|
+
|
200
|
+
text = sib.text_content()
|
201
|
+
_append_text(text)
|
202
|
+
if sib.tail:
|
203
|
+
_append_text(sib.tail)
|
204
|
+
continue
|
205
|
+
|
206
|
+
content = "\n".join(lines)
|
207
|
+
if not content:
|
208
|
+
return None
|
209
|
+
|
210
|
+
return {
|
211
|
+
"id": chapter_id,
|
212
|
+
"title": title,
|
213
|
+
"content": content,
|
214
|
+
"extra": {"site": "shencou"},
|
215
|
+
}
|
#!/usr/bin/env python3
"""
novel_downloader.core.parsers.shuhaige
--------------------------------------

"""

from typing import Any

from lxml import html

from novel_downloader.core.parsers.base import BaseParser
from novel_downloader.core.parsers.registry import register_parser
from novel_downloader.models import (
    BookInfoDict,
    ChapterDict,
    ChapterInfoDict,
    VolumeInfoDict,
)


@register_parser(
    site_keys=["shuhaige"],
)
class ShuhaigeParser(BaseParser):
    """
    Parser for 书海阁小说网 book pages.
    """

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Extract book metadata plus a single "正文" volume from the info page.
        """
        if not html_list:
            return None

        doc = html.fromstring(html_list[0])

        # Basic metadata from the #info panel.
        book_name = self._first_str(doc.xpath('//div[@id="info"]/h1/text()'))
        author = self._first_str(doc.xpath('//div[@id="info"]/p[1]/a/text()'))
        cover_url = self._first_str(doc.xpath('//div[@id="fmimg"]/img/@src'))
        update_time = self._first_str(
            doc.xpath('//div[@id="info"]/p[3]/text()'),
            replaces=[("最后更新:", "")],
        )
        summary = self._first_str(doc.xpath('//div[@id="intro"]/p[1]/text()'))

        # The breadcrumb's second link is the category; use it as the only tag.
        book_type = self._first_str(doc.xpath('//div[@class="con_top"]/a[2]/text()'))
        tags = [book_type] if book_type else []

        # Chapter list: every <dd><a> after the "正文" <dt>.
        chapters: list[ChapterInfoDict] = []
        for link in doc.xpath(
            '//div[@id="list"]/dl/dt[contains(., "正文")]/following-sibling::dd/a'
        ):
            raw_href = link.get("href") or ""
            # e.g. '.../12345.html' -> '12345'
            chapters.append(
                {
                    "title": (link.text or "").strip(),
                    "url": raw_href.strip(),
                    "chapterId": raw_href.rsplit("/", 1)[-1].split(".", 1)[0],
                }
            )

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "tags": tags,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Extract title and paragraph text for one chapter page.
        """
        if not html_list:
            return None
        doc = html.fromstring(html_list[0])

        title = self._first_str(doc.xpath('//div[@class="bookname"]/h1/text()'))
        if not title:
            # Fall back to a synthetic title based on the chapter id.
            title = f"第 {chapter_id} 章"

        containers = doc.xpath('//div[@id="content"]')
        if not containers:
            return None

        paragraphs = []
        for p in containers[0].xpath(".//p"):
            paragraphs.append("".join(p.itertext()).strip())
        # Drop the site's self-promotion line if it trails the content.
        if paragraphs and "www.shuhaige.net" in paragraphs[-1]:
            del paragraphs[-1]

        content = "\n".join(paragraphs)
        if not content.strip():
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "shuhaige"},
        }
#!/usr/bin/env python3
"""
novel_downloader.core.parsers.tongrenquan
-----------------------------------------

Parser implementation for 同人圈 (tongrenquan) book-info and chapter pages.
"""

from typing import Any

from lxml import html

from novel_downloader.core.parsers.base import BaseParser
from novel_downloader.core.parsers.registry import register_parser
from novel_downloader.models import (
    BookInfoDict,
    ChapterDict,
    ChapterInfoDict,
    VolumeInfoDict,
)


@register_parser(
    site_keys=["tongrenquan"],
)
class TongrenquanParser(BaseParser):
    """
    Parser for 同人圈 book pages.
    """

    # Cover images on this site use relative paths; prefix with the site root.
    BASE_URL = "https://www.tongrenquan.org"

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse the book page into metadata and a single "正文" volume.

        :param html_list: Raw HTML of the book page (first entry used).
        :return: BookInfoDict or None when no HTML was supplied.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        # Metadata
        book_name = self._first_str(tree.xpath('//div[@class="infos"]/h1/text()'))
        author = self._first_str(
            tree.xpath('//div[@class="date"]/span/text()'),
            replaces=[("作者:", "")],
        )
        cover_url = self.BASE_URL + self._first_str(
            tree.xpath('//div[@class="pic"]//img/@src')
        )
        update_time = self._first_str(
            tree.xpath('//div[@class="date"]/text()'),
            replaces=[("日期:", "")],
        )

        # Summary (collapse text within the <p> tag)
        paras = tree.xpath('//div[@class="infos"]/p//text()')
        summary = "\n".join(p.strip() for p in paras if p.strip())

        # Chapters extraction
        chapters: list[ChapterInfoDict] = []
        for a in tree.xpath('//div[contains(@class,"book_list")]//ul//li/a'):
            url = a.get("href", "").strip()
            title = a.text_content().strip()
            # General pattern: /category/bookId/chapterId.html
            # '/tongren/7562/462.html' -> '462'
            # NOTE: removesuffix, not rstrip(".html") -- rstrip strips a
            # character *set* and would also eat trailing 'h'/'t'/'m'/'l'
            # characters belonging to the id itself.
            chapter_id = url.split("/")[-1].removesuffix(".html")
            chapters.append({"title": title, "url": url, "chapterId": chapter_id})

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "tags": ["同人小说"],
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse one chapter page into title and plain-text content.

        :param html_list: Raw HTML of the chapter page (first entry used).
        :param chapter_id: Identifier of the chapter being parsed.
        :return: ChapterDict or None when the page yields no content.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        raw_title = self._first_str(
            tree.xpath('//div[contains(@class,"read_chapterName")]//h1/text()')
        )

        book_name = self._first_str(
            tree.xpath('//div[contains(@class,"readTop")]//a[last()]/text()')
        )

        # Chapter headings repeat the book name; drop it from the title.
        title = raw_title.replace(book_name, "").strip()

        # Extract paragraphs of content
        paras = tree.xpath('//div[contains(@class,"read_chapterDetail")]/p')
        texts = [p.text_content().strip() for p in paras if p.text_content().strip()]
        content = "\n".join(texts)
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "tongrenquan"},
        }