novel-downloader 1.5.0__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/__init__.py +1 -3
- novel_downloader/cli/clean.py +21 -88
- novel_downloader/cli/config.py +26 -21
- novel_downloader/cli/download.py +79 -66
- novel_downloader/cli/export.py +17 -21
- novel_downloader/cli/main.py +1 -1
- novel_downloader/cli/search.py +62 -65
- novel_downloader/cli/ui.py +156 -0
- novel_downloader/config/__init__.py +8 -5
- novel_downloader/config/adapter.py +206 -209
- novel_downloader/config/{loader.py → file_io.py} +53 -26
- novel_downloader/core/__init__.py +5 -5
- novel_downloader/core/archived/deqixs/fetcher.py +115 -0
- novel_downloader/core/archived/deqixs/parser.py +132 -0
- novel_downloader/core/archived/deqixs/searcher.py +89 -0
- novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
- novel_downloader/core/archived/wanbengo/searcher.py +98 -0
- novel_downloader/core/archived/xshbook/searcher.py +93 -0
- novel_downloader/core/downloaders/__init__.py +3 -24
- novel_downloader/core/downloaders/base.py +49 -23
- novel_downloader/core/downloaders/common.py +191 -137
- novel_downloader/core/downloaders/qianbi.py +187 -146
- novel_downloader/core/downloaders/qidian.py +187 -141
- novel_downloader/core/downloaders/registry.py +4 -2
- novel_downloader/core/downloaders/signals.py +46 -0
- novel_downloader/core/exporters/__init__.py +3 -20
- novel_downloader/core/exporters/base.py +33 -37
- novel_downloader/core/exporters/common/__init__.py +1 -2
- novel_downloader/core/exporters/common/epub.py +15 -10
- novel_downloader/core/exporters/common/main_exporter.py +19 -12
- novel_downloader/core/exporters/common/txt.py +17 -12
- novel_downloader/core/exporters/epub_util.py +59 -29
- novel_downloader/core/exporters/linovelib/__init__.py +1 -0
- novel_downloader/core/exporters/linovelib/epub.py +23 -25
- novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
- novel_downloader/core/exporters/linovelib/txt.py +20 -14
- novel_downloader/core/exporters/qidian.py +2 -8
- novel_downloader/core/exporters/registry.py +4 -2
- novel_downloader/core/exporters/txt_util.py +7 -7
- novel_downloader/core/fetchers/__init__.py +54 -48
- novel_downloader/core/fetchers/aaatxt.py +83 -0
- novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
- novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
- novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
- novel_downloader/core/fetchers/dxmwx.py +110 -0
- novel_downloader/core/fetchers/eightnovel.py +139 -0
- novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
- novel_downloader/core/fetchers/guidaye.py +85 -0
- novel_downloader/core/fetchers/hetushu.py +92 -0
- novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
- novel_downloader/core/fetchers/ixdzs8.py +113 -0
- novel_downloader/core/fetchers/jpxs123.py +101 -0
- novel_downloader/core/fetchers/lewenn.py +83 -0
- novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
- novel_downloader/core/fetchers/piaotia.py +105 -0
- novel_downloader/core/fetchers/qbtr.py +101 -0
- novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
- novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +56 -64
- novel_downloader/core/fetchers/quanben5.py +92 -0
- novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
- novel_downloader/core/fetchers/registry.py +5 -16
- novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
- novel_downloader/core/fetchers/shencou.py +106 -0
- novel_downloader/core/fetchers/shuhaige.py +84 -0
- novel_downloader/core/fetchers/tongrenquan.py +84 -0
- novel_downloader/core/fetchers/ttkan.py +95 -0
- novel_downloader/core/fetchers/wanbengo.py +83 -0
- novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
- novel_downloader/core/fetchers/xiguashuwu.py +177 -0
- novel_downloader/core/fetchers/xs63b.py +171 -0
- novel_downloader/core/fetchers/xshbook.py +85 -0
- novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
- novel_downloader/core/fetchers/yibige.py +114 -0
- novel_downloader/core/interfaces/__init__.py +1 -9
- novel_downloader/core/interfaces/downloader.py +6 -2
- novel_downloader/core/interfaces/exporter.py +7 -7
- novel_downloader/core/interfaces/fetcher.py +6 -19
- novel_downloader/core/interfaces/parser.py +7 -8
- novel_downloader/core/interfaces/searcher.py +9 -1
- novel_downloader/core/parsers/__init__.py +49 -12
- novel_downloader/core/parsers/aaatxt.py +132 -0
- novel_downloader/core/parsers/b520.py +116 -0
- novel_downloader/core/parsers/base.py +64 -12
- novel_downloader/core/parsers/biquyuedu.py +133 -0
- novel_downloader/core/parsers/dxmwx.py +162 -0
- novel_downloader/core/parsers/eightnovel.py +224 -0
- novel_downloader/core/parsers/esjzone.py +64 -69
- novel_downloader/core/parsers/guidaye.py +128 -0
- novel_downloader/core/parsers/hetushu.py +139 -0
- novel_downloader/core/parsers/i25zw.py +137 -0
- novel_downloader/core/parsers/ixdzs8.py +186 -0
- novel_downloader/core/parsers/jpxs123.py +137 -0
- novel_downloader/core/parsers/lewenn.py +142 -0
- novel_downloader/core/parsers/linovelib.py +48 -64
- novel_downloader/core/parsers/piaotia.py +189 -0
- novel_downloader/core/parsers/qbtr.py +136 -0
- novel_downloader/core/parsers/qianbi.py +48 -50
- novel_downloader/core/parsers/qidian/main_parser.py +756 -48
- novel_downloader/core/parsers/qidian/utils/__init__.py +3 -21
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +4 -4
- novel_downloader/core/parsers/quanben5.py +103 -0
- novel_downloader/core/parsers/registry.py +5 -16
- novel_downloader/core/parsers/sfacg.py +38 -45
- novel_downloader/core/parsers/shencou.py +215 -0
- novel_downloader/core/parsers/shuhaige.py +111 -0
- novel_downloader/core/parsers/tongrenquan.py +116 -0
- novel_downloader/core/parsers/ttkan.py +132 -0
- novel_downloader/core/parsers/wanbengo.py +191 -0
- novel_downloader/core/parsers/xiaoshuowu.py +173 -0
- novel_downloader/core/parsers/xiguashuwu.py +429 -0
- novel_downloader/core/parsers/xs63b.py +161 -0
- novel_downloader/core/parsers/xshbook.py +134 -0
- novel_downloader/core/parsers/yamibo.py +87 -131
- novel_downloader/core/parsers/yibige.py +166 -0
- novel_downloader/core/searchers/__init__.py +34 -3
- novel_downloader/core/searchers/aaatxt.py +107 -0
- novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
- novel_downloader/core/searchers/base.py +112 -36
- novel_downloader/core/searchers/dxmwx.py +105 -0
- novel_downloader/core/searchers/eightnovel.py +84 -0
- novel_downloader/core/searchers/esjzone.py +43 -25
- novel_downloader/core/searchers/hetushu.py +92 -0
- novel_downloader/core/searchers/i25zw.py +93 -0
- novel_downloader/core/searchers/ixdzs8.py +107 -0
- novel_downloader/core/searchers/jpxs123.py +107 -0
- novel_downloader/core/searchers/piaotia.py +100 -0
- novel_downloader/core/searchers/qbtr.py +106 -0
- novel_downloader/core/searchers/qianbi.py +74 -40
- novel_downloader/core/searchers/quanben5.py +144 -0
- novel_downloader/core/searchers/registry.py +24 -8
- novel_downloader/core/searchers/shuhaige.py +124 -0
- novel_downloader/core/searchers/tongrenquan.py +110 -0
- novel_downloader/core/searchers/ttkan.py +92 -0
- novel_downloader/core/searchers/xiaoshuowu.py +122 -0
- novel_downloader/core/searchers/xiguashuwu.py +95 -0
- novel_downloader/core/searchers/xs63b.py +104 -0
- novel_downloader/locales/en.json +34 -85
- novel_downloader/locales/zh.json +35 -86
- novel_downloader/models/__init__.py +21 -22
- novel_downloader/models/book.py +44 -0
- novel_downloader/models/config.py +4 -37
- novel_downloader/models/login.py +1 -1
- novel_downloader/models/search.py +5 -0
- novel_downloader/resources/config/settings.toml +8 -70
- novel_downloader/resources/json/xiguashuwu.json +718 -0
- novel_downloader/utils/__init__.py +13 -24
- novel_downloader/utils/chapter_storage.py +5 -5
- novel_downloader/utils/constants.py +4 -31
- novel_downloader/utils/cookies.py +38 -35
- novel_downloader/utils/crypto_utils/__init__.py +7 -0
- novel_downloader/utils/crypto_utils/aes_util.py +90 -0
- novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
- novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
- novel_downloader/utils/crypto_utils/rc4.py +54 -0
- novel_downloader/utils/epub/__init__.py +3 -4
- novel_downloader/utils/epub/builder.py +6 -6
- novel_downloader/utils/epub/constants.py +62 -21
- novel_downloader/utils/epub/documents.py +95 -201
- novel_downloader/utils/epub/models.py +8 -22
- novel_downloader/utils/epub/utils.py +73 -106
- novel_downloader/utils/file_utils/__init__.py +2 -23
- novel_downloader/utils/file_utils/io.py +53 -188
- novel_downloader/utils/file_utils/normalize.py +1 -7
- novel_downloader/utils/file_utils/sanitize.py +4 -15
- novel_downloader/utils/fontocr/__init__.py +5 -14
- novel_downloader/utils/fontocr/core.py +216 -0
- novel_downloader/utils/fontocr/loader.py +50 -0
- novel_downloader/utils/logger.py +81 -65
- novel_downloader/utils/network.py +17 -41
- novel_downloader/utils/state.py +4 -90
- novel_downloader/utils/text_utils/__init__.py +1 -7
- novel_downloader/utils/text_utils/diff_display.py +5 -7
- novel_downloader/utils/text_utils/text_cleaner.py +39 -30
- novel_downloader/utils/text_utils/truncate_utils.py +3 -14
- novel_downloader/utils/time_utils/__init__.py +5 -11
- novel_downloader/utils/time_utils/datetime_utils.py +20 -29
- novel_downloader/utils/time_utils/sleep_utils.py +55 -49
- novel_downloader/web/__init__.py +13 -0
- novel_downloader/web/components/__init__.py +11 -0
- novel_downloader/web/components/navigation.py +35 -0
- novel_downloader/web/main.py +66 -0
- novel_downloader/web/pages/__init__.py +17 -0
- novel_downloader/web/pages/download.py +78 -0
- novel_downloader/web/pages/progress.py +147 -0
- novel_downloader/web/pages/search.py +329 -0
- novel_downloader/web/services/__init__.py +17 -0
- novel_downloader/web/services/client_dialog.py +164 -0
- novel_downloader/web/services/cred_broker.py +113 -0
- novel_downloader/web/services/cred_models.py +35 -0
- novel_downloader/web/services/task_manager.py +264 -0
- novel_downloader-2.0.1.dist-info/METADATA +172 -0
- novel_downloader-2.0.1.dist-info/RECORD +206 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/entry_points.txt +1 -1
- novel_downloader/core/downloaders/biquge.py +0 -29
- novel_downloader/core/downloaders/esjzone.py +0 -29
- novel_downloader/core/downloaders/linovelib.py +0 -29
- novel_downloader/core/downloaders/sfacg.py +0 -29
- novel_downloader/core/downloaders/yamibo.py +0 -29
- novel_downloader/core/exporters/biquge.py +0 -22
- novel_downloader/core/exporters/esjzone.py +0 -22
- novel_downloader/core/exporters/qianbi.py +0 -22
- novel_downloader/core/exporters/sfacg.py +0 -22
- novel_downloader/core/exporters/yamibo.py +0 -22
- novel_downloader/core/fetchers/base/__init__.py +0 -14
- novel_downloader/core/fetchers/base/browser.py +0 -422
- novel_downloader/core/fetchers/biquge/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/browser.py +0 -209
- novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
- novel_downloader/core/fetchers/linovelib/browser.py +0 -198
- novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/browser.py +0 -326
- novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
- novel_downloader/core/fetchers/sfacg/browser.py +0 -194
- novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
- novel_downloader/core/fetchers/yamibo/browser.py +0 -234
- novel_downloader/core/parsers/biquge.py +0 -139
- novel_downloader/core/parsers/qidian/book_info_parser.py +0 -90
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +0 -528
- novel_downloader/core/parsers/qidian/chapter_normal.py +0 -157
- novel_downloader/core/parsers/qidian/chapter_router.py +0 -68
- novel_downloader/core/parsers/qidian/utils/helpers.py +0 -114
- novel_downloader/models/chapter.py +0 -25
- novel_downloader/models/types.py +0 -13
- novel_downloader/tui/__init__.py +0 -7
- novel_downloader/tui/app.py +0 -32
- novel_downloader/tui/main.py +0 -17
- novel_downloader/tui/screens/__init__.py +0 -14
- novel_downloader/tui/screens/home.py +0 -198
- novel_downloader/tui/screens/login.py +0 -74
- novel_downloader/tui/styles/home_layout.tcss +0 -79
- novel_downloader/tui/widgets/richlog_handler.py +0 -24
- novel_downloader/utils/cache.py +0 -24
- novel_downloader/utils/crypto_utils.py +0 -71
- novel_downloader/utils/fontocr/hash_store.py +0 -280
- novel_downloader/utils/fontocr/hash_utils.py +0 -103
- novel_downloader/utils/fontocr/model_loader.py +0 -69
- novel_downloader/utils/fontocr/ocr_v1.py +0 -315
- novel_downloader/utils/fontocr/ocr_v2.py +0 -764
- novel_downloader/utils/fontocr/ocr_v3.py +0 -744
- novel_downloader-1.5.0.dist-info/METADATA +0 -196
- novel_downloader-1.5.0.dist-info/RECORD +0 -164
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/WHEEL +0 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/top_level.txt +0 -0
**novel_downloader/core/parsers/qidian/utils/__init__.py**

```diff
@@ -3,27 +3,9 @@
 novel_downloader.core.parsers.qidian.utils
 ------------------------------------------
 
+Utility functions and helpers for parsing and decrypting Qidian novel pages
 """
 
-__all__ = [
-    "find_ssr_page_context",
-    "extract_chapter_info",
-    "is_restricted_page",
-    "vip_status",
-    "can_view_chapter",
-    "is_encrypted",
-    "is_duplicated",
-    "QidianNodeDecryptor",
-    "get_decryptor",
-]
+__all__ = ["get_decryptor"]
 
-from .helpers import (
-    can_view_chapter,
-    extract_chapter_info,
-    find_ssr_page_context,
-    is_duplicated,
-    is_encrypted,
-    is_restricted_page,
-    vip_status,
-)
-from .node_decryptor import QidianNodeDecryptor, get_decryptor
+from .node_decryptor import get_decryptor
```
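For downstream code, the visible effect of this hunk is that the helper predicates and `QidianNodeDecryptor` leave the package's public surface; only the factory import still works. A quick illustration of what this diff implies for imports:

```python
# Still public after 2.0.1:
from novel_downloader.core.parsers.qidian.utils import get_decryptor

# No longer re-exported per this diff (helpers.py itself is deleted), so
# imports like the following now fail with ImportError:
# from novel_downloader.core.parsers.qidian.utils import is_encrypted
```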
**novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py**

```diff
@@ -25,7 +25,7 @@ import requests
 from novel_downloader.utils.constants import JS_SCRIPT_DIR
 
 DEST_ROOT: Final[Path] = JS_SCRIPT_DIR
-GITHUB_OWNER: Final = "
+GITHUB_OWNER: Final = "saudadez21"
 GITHUB_REPO: Final = "qidian-decryptor"
 RELEASE_VERSION: Final = "v1.0.1"
 BASE_URL: Final = f"https://github.com/{GITHUB_OWNER}/{GITHUB_REPO}/releases/download/{RELEASE_VERSION}"
```

**novel_downloader/core/parsers/qidian/utils/node_decryptor.py**

```diff
@@ -36,10 +36,10 @@ class QidianNodeDecryptor:
     3. Download the remote Fock module JS if not already present.
 
     Calling `decrypt()` will:
-
-
-
-
+    * Write a temp JSON input file with [ciphertext, chapter_id, fkp, fuid].
+    * Spawn `node qidian_decrypt_node.js <in> <out>`.
+    * Read and return the decrypted text.
+    * Clean up the temp files.
     """
 
     QIDIAN_FOCK_JS_URL: str = (
```
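The four bullets in the rewritten docstring describe a plain file-based round trip through Node. A minimal sketch of that flow based only on those documented steps (the function name `decrypt_via_node`, argument order, and output format here are assumptions, not the package's actual API):

```python
import json
import subprocess
import tempfile
from pathlib import Path


def decrypt_via_node(
    ciphertext: str, chapter_id: str, fkp: str, fuid: str, script: Path
) -> str:
    """Sketch of the documented decrypt() steps; not the package's real code."""
    with tempfile.TemporaryDirectory() as tmp:
        in_path = Path(tmp) / "input.json"
        out_path = Path(tmp) / "output.txt"
        # 1. Write a temp JSON input file with [ciphertext, chapter_id, fkp, fuid].
        in_path.write_text(
            json.dumps([ciphertext, chapter_id, fkp, fuid]), encoding="utf-8"
        )
        # 2. Spawn `node qidian_decrypt_node.js <in> <out>`.
        subprocess.run(["node", str(script), str(in_path), str(out_path)], check=True)
        # 3. Read and return the decrypted text.
        return out_path.read_text(encoding="utf-8")
    # 4. The TemporaryDirectory context cleans up the temp files on exit.
```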
**novel_downloader/core/parsers/quanben5.py** (new file)

```diff
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.quanben5
+--------------------------------------
+
+"""
+
+from datetime import datetime
+from typing import Any
+
+from lxml import html
+
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.core.parsers.registry import register_parser
+from novel_downloader.models import (
+    BookInfoDict,
+    ChapterDict,
+    ChapterInfoDict,
+    VolumeInfoDict,
+)
+
+
+@register_parser(
+    site_keys=["quanben5"],
+)
+class Quanben5Parser(BaseParser):
+    """
+    Parser for 全本小说网 book pages.
+    """
+
+    def parse_book_info(
+        self,
+        html_list: list[str],
+        **kwargs: Any,
+    ) -> BookInfoDict | None:
+        if not html_list:
+            return None
+
+        tree = html.fromstring(html_list[0])
+        book_name = self._first_str(tree.xpath("//h3/span/text()"))
+        author = self._first_str(
+            tree.xpath(
+                '//p[@class="info"][contains(., "作者")]/span[@class="author"]/text()'
+            )
+        )
+        cover_url = self._first_str(tree.xpath('//div[@class="pic"]/img/@src'))
+        category = self._first_str(
+            tree.xpath('//p[@class="info"][contains(., "类别")]/span/text()')
+        )
+        tags = [category] if category else []
+        update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        summary = self._first_str(tree.xpath('//p[@class="description"]/text()'))
+
+        chapters: list[ChapterInfoDict] = []
+        for li in tree.xpath('//ul[@class="list"]/li'):
+            link = li.xpath(".//a")[0]
+            href = link.get("href", "").strip()
+            title = self._first_str(link.xpath(".//span/text()"))
+            # '/n/toutian/83840.html' -> '83840'
+            chapter_id = href.rstrip(".html").split("/")[-1]
+            chapters.append({"title": title, "url": href, "chapterId": chapter_id})
+
+        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]
+
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "update_time": update_time,
+            "tags": tags,
+            "summary": summary,
+            "volumes": volumes,
+            "extra": {},
+        }
+
+    def parse_chapter(
+        self,
+        html_list: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        if not html_list:
+            return None
+
+        tree = html.fromstring(html_list[0])
+
+        # Extract the chapter title
+        title = self._first_str(tree.xpath('//h1[@class="title1"]/text()'))
+
+        # Extract all <p> text within the content container
+        paragraphs = tree.xpath('//div[@id="content"]/p/text()')
+        # Clean whitespace and join with double newlines
+        content = "\n".join(p.strip() for p in paragraphs if p.strip())
+
+        if not content:
+            return None
+
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {"site": "quanben5"},
+        }
```
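One small footgun in `parse_book_info` above: `str.rstrip(".html")` strips a trailing *character set* ({'.', 'h', 't', 'm', 'l'}), not the literal suffix. It happens to be safe here because quanben5 chapter IDs are numeric, as a quick check shows:

```python
# IDs that end in digits survive rstrip's character-set semantics:
assert "/n/toutian/83840.html".rstrip(".html").split("/")[-1] == "83840"

# ...but an ID ending in one of {'.', 'h', 't', 'm', 'l'} would be clipped:
assert "/n/book/123l.html".rstrip(".html") == "/n/book/123"  # trailing 'l' lost

# A suffix-exact alternative (Python 3.9+):
assert "/n/toutian/83840.html".removesuffix(".html").split("/")[-1] == "83840"
```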
**novel_downloader/core/parsers/registry.py**

```diff
@@ -3,6 +3,7 @@
 novel_downloader.core.parsers.registry
 --------------------------------------
 
+Registry and factory helpers for creating site-specific parsers.
 """
 
 __all__ = ["register_parser", "get_parser"]
@@ -16,27 +17,24 @@ from novel_downloader.models import ParserConfig
 ParserBuilder = Callable[[ParserConfig], ParserProtocol]
 
 P = TypeVar("P", bound=ParserProtocol)
-_PARSER_MAP: dict[str,
+_PARSER_MAP: dict[str, ParserBuilder] = {}
 
 
 def register_parser(
     site_keys: Sequence[str],
-    backends: Sequence[str],
 ) -> Callable[[type[P]], type[P]]:
     """
     Decorator to register a parser class under given keys.
 
     :param site_keys: Sequence of site identifiers
-    :param backends:
+    :param backends: Sequence of backend types
     :return: A class decorator that populates _PARSER_MAP.
     """
 
     def decorator(cls: type[P]) -> type[P]:
         for site in site_keys:
             site_lower = site.lower()
-
-            for backend in backends:
-                bucket[backend] = cls
+            _PARSER_MAP[site_lower] = cls
         return cls
 
     return decorator
@@ -52,17 +50,8 @@ def get_parser(site: str, config: ParserConfig) -> ParserProtocol:
     """
     site_key = site.lower()
     try:
-
+        parser_cls = _PARSER_MAP[site_key]
    except KeyError as err:
         raise ValueError(f"Unsupported site: {site!r}") from err
 
-    mode = config.mode
-    try:
-        parser_cls = backend_map[mode]
-    except KeyError as err:
-        raise ValueError(
-            f"Unsupported parser mode {mode!r} for site {site!r}. "
-            f"Available modes: {list(backend_map)}"
-        ) from err
-
     return parser_cls(config)
```
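Taken together, these hunks replace the old two-level lookup (site → backend → class, keyed off `config.mode`) with a flat site → class map. A self-contained sketch of the resulting pattern (types simplified; the real module is typed against `ParserProtocol` and `ParserConfig`):

```python
from collections.abc import Callable, Sequence

_PARSER_MAP: dict[str, type] = {}


def register_parser(site_keys: Sequence[str]) -> Callable[[type], type]:
    """Register a class under each (lower-cased) site key."""

    def decorator(cls: type) -> type:
        for site in site_keys:
            _PARSER_MAP[site.lower()] = cls  # flat map: site key -> parser class
        return cls

    return decorator


def get_parser(site: str, config: object) -> object:
    """Look the site up once; no second hop through a backend/mode map."""
    try:
        parser_cls = _PARSER_MAP[site.lower()]
    except KeyError as err:
        raise ValueError(f"Unsupported site: {site!r}") from err
    return parser_cls(config)


@register_parser(site_keys=["demo"])
class DemoParser:
    def __init__(self, config: object) -> None:
        self.config = config


assert isinstance(get_parser("DEMO", config=None), DemoParser)
```

Dropping the backend dimension lines up with the rest of the 2.0 file list, where the per-site `session.py`/`browser.py` fetcher pairs collapse into single modules.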
**novel_downloader/core/parsers/sfacg.py**

```diff
@@ -11,22 +11,32 @@ from lxml import html
 
 from novel_downloader.core.parsers.base import BaseParser
 from novel_downloader.core.parsers.registry import register_parser
-from novel_downloader.models import
+from novel_downloader.models import (
+    BookInfoDict,
+    ChapterDict,
+    ChapterInfoDict,
+    VolumeInfoDict,
+)
 
 
 @register_parser(
     site_keys=["sfacg"],
-    backends=["session", "browser"],
 )
 class SfacgParser(BaseParser):
-    """
+    """
+    Parser for sfacg book pages.
+    """
 
     # Book info XPaths
     _BOOK_NAME_XPATH = '//ul[@class="book_info"]//span[@class="book_newtitle"]/text()'
     _AUTHOR_INFO_XPATH = '//ul[@class="book_info"]//span[@class="book_info3"]/text()'
     _UPDATE_TIME_XPATH = '//ul[@class="book_info"]//span[@class="book_info3"]/br/following-sibling::text()'  # noqa: E501
     _COVER_URL_XPATH = '//ul[@class="book_info"]//li/img/@src'
-    _STATUS_XPATH = '//ul[@class="book_info"]//div[@class="book_info2"]/span/text()'
+    # _STATUS_XPATH = '//ul[@class="book_info"]//div[@class="book_info2"]/span/text()'
+    _STATUS_XPATH = (
+        '//ul[@class="book_info"]//div[@class="book_info2"]/span/text()'
+        ' and (contains(., "完结") or contains(., "连载"))]/text()'
+    )
     _SUMMARY_XPATH = '//ul[@class="book_profile"]/li[@class="book_bk_qs1"]/text()'
 
     # Catalog XPaths
@@ -47,54 +57,35 @@ class SfacgParser(BaseParser):
         self,
         html_list: list[str],
         **kwargs: Any,
-    ) ->
-        """
-        Parse a book info page and extract metadata and chapter structure.
-
-        :param html_list: Raw HTML of the book info page.
-        :return: Parsed metadata and chapter structure as a dictionary.
-        """
+    ) -> BookInfoDict | None:
         if len(html_list) < 2:
-            return
+            return None
 
         info_tree = html.fromstring(html_list[0])
         catalog_tree = html.fromstring(html_list[1])
 
-        result: dict[str, Any] = {}
-
         # Book metadata
-        book_name = info_tree.xpath(self._BOOK_NAME_XPATH)
-        result["book_name"] = book_name[0].strip() if book_name else ""
+        book_name = self._first_str(info_tree.xpath(self._BOOK_NAME_XPATH))
 
-
-
-        result["word_count"] = (
-            book_info3[0].split("/")[1].strip()
-            if book_info3 and len(book_info3[0].split("/")) > 1
-            else ""
-        )
+        book_info3_str = self._first_str(info_tree.xpath(self._AUTHOR_INFO_XPATH))
+        author, _, word_count = (p.strip() for p in book_info3_str.partition("/"))
 
-
-        result["update_time"] = book_info3_br[0].strip() if book_info3_br else ""
+        update_time = self._first_str(info_tree.xpath(self._UPDATE_TIME_XPATH))
 
-        cover_url = info_tree.xpath(self._COVER_URL_XPATH)
-        result["cover_url"] = "https:" + cover_url[0] if cover_url else ""
+        cover_url = "https:" + self._first_str(info_tree.xpath(self._COVER_URL_XPATH))
 
-        serial_status = info_tree.xpath(self._STATUS_XPATH)
-        result["serial_status"] = next(
-            (s for s in serial_status if "完结" in s or "连载" in s), ""
-        )
+        serial_status = self._first_str(info_tree.xpath(self._STATUS_XPATH))
 
-
-
+        summary_elem = info_tree.xpath(self._SUMMARY_XPATH)
+        summary = "".join(summary_elem).strip()
 
         # Chapter structure
         volume_titles = catalog_tree.xpath(self._VOLUME_TITLE_XPATH)
         volume_blocks = catalog_tree.xpath(self._VOLUME_CONTENT_XPATH)
 
-        volumes = []
+        volumes: list[VolumeInfoDict] = []
         for vol_title, vol_block in zip(volume_titles, volume_blocks, strict=False):
-            chapters = []
+            chapters: list[ChapterInfoDict] = []
             for a in vol_block.xpath(self._CHAPTER_LIST_XPATH):
                 href = a.xpath("./@href")[0] if a.xpath("./@href") else ""
                 title = "".join(a.xpath(".//li//text()")).strip()
@@ -112,9 +103,18 @@ class SfacgParser(BaseParser):
                     "chapters": chapters,
                 }
             )
-        result["volumes"] = volumes
 
-        return
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "update_time": update_time,
+            "word_count": word_count,
+            "serial_status": serial_status,
+            "summary": summary,
+            "volumes": volumes,
+            "extra": {},
+        }
 
     def parse_chapter(
         self,
@@ -122,13 +122,6 @@ class SfacgParser(BaseParser):
         chapter_id: str,
         **kwargs: Any,
     ) -> ChapterDict | None:
-        """
-        Parse a single chapter page and extract clean text or simplified HTML.
-
-        :param html_list: Raw HTML of the chapter page.
-        :param chapter_id: Identifier of the chapter being parsed.
-        :return: Cleaned chapter content as plain text or minimal HTML.
-        """
         if not html_list:
             return None
         keywords = [
@@ -156,7 +149,7 @@ class SfacgParser(BaseParser):
         raw_text_parts = tree.xpath(self._CHAPTER_TEXT_XPATH)
         content_lines = [txt.strip() for txt in raw_text_parts if txt.strip()]
 
-        content = "\n
+        content = "\n".join(content_lines).strip()
         if not content:
             return None
 
```
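The `partition("/")` rewrite in `parse_book_info` is worth a note: unlike the old index-and-guard code, `str.partition` always returns a 3-tuple, so the `author, _, word_count` unpack cannot fail even when the separator is missing. A quick illustration (the sample field strings are hypothetical):

```python
# Typical "author/word-count" field: both halves come out trimmed.
author, _, word_count = (p.strip() for p in "某作者 / 112万字".partition("/"))
assert (author, word_count) == ("某作者", "112万字")

# No "/" present: partition still yields three parts, the last two empty.
author, _, word_count = (p.strip() for p in "某作者".partition("/"))
assert (author, word_count) == ("某作者", "")
```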
**novel_downloader/core/parsers/shencou.py** (new file)

```diff
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.shencou
+-------------------------------------
+
+"""
+
+from typing import Any
+
+from lxml import etree, html
+
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.core.parsers.registry import register_parser
+from novel_downloader.models import (
+    BookInfoDict,
+    ChapterDict,
+    VolumeInfoDict,
+)
+
+
+@register_parser(
+    site_keys=["shencou"],
+)
+class ShencouParser(BaseParser):
+    """
+    Parser for 神凑轻小说 book pages.
+    """
+
+    def parse_book_info(
+        self,
+        html_list: list[str],
+        **kwargs: Any,
+    ) -> BookInfoDict | None:
+        if len(html_list) < 2:
+            return None
+
+        info_tree = html.fromstring(html_list[0])
+        catalog_tree = html.fromstring(html_list[1])
+
+        # --- Metadata ---
+        raw_name = self._first_str(info_tree.xpath("//span//a/text()"))
+        book_name = raw_name[:-2] if raw_name.endswith("小说") else raw_name
+
+        author = self._first_str(
+            info_tree.xpath('//td[contains(text(),"小说作者")]/text()'),
+            replaces=[("小说作者:", "")],
+        )
+
+        cover_url = self._first_str(
+            info_tree.xpath('//a[contains(@href,"/files/article/image")]/img/@src')
+        )
+
+        # word count
+        word_count = self._first_str(
+            info_tree.xpath('//td[contains(text(),"全文长度")]/text()'),
+            replaces=[("全文长度:", "")],
+        )
+
+        # update time
+        update_time = self._first_str(
+            info_tree.xpath('//td[contains(text(),"最后更新")]/text()'),
+            replaces=[("最后更新:", "")],
+        )
+
+        # serial status
+        serial_status = self._first_str(
+            info_tree.xpath('//td[contains(text(),"写作进度")]/text()'),
+            replaces=[("写作进度:", "")],
+        )
+
+        # summary
+        raw_detail = self._norm_space(
+            info_tree.xpath('string(//td[@width="80%" and @valign="top"])')
+        )
+        summary = ""
+        if "内容简介:" in raw_detail and "本书公告:" in raw_detail:
+            intro = raw_detail.split("内容简介:", 1)[1]
+            summary = intro.split("本书公告:", 1)[0].strip()
+
+        # --- Catalog / Chapters ---
+        volumes: list[VolumeInfoDict] = []
+        curr_vol: VolumeInfoDict = {"volume_name": "未命名卷", "chapters": []}
+
+        # Walk through volume headers (.zjbox) and lists (.zjlist4) in document order
+        for elem in catalog_tree.xpath(
+            '//div[@class="zjbox"] | //div[@class="zjlist4"]'
+        ):
+            cls_attr = elem.get("class", "")
+            if "zjbox" in cls_attr:
+                # before starting new volume, save the previous if it has chapters
+                if curr_vol["chapters"]:
+                    volumes.append(curr_vol)
+                # start a new volume
+                vol_name = elem.xpath(".//h2/text()")[0].strip()
+                curr_vol = {"volume_name": vol_name, "chapters": []}
+            elif "zjlist4" in cls_attr:
+                # collect all <li><a> entries under this list
+                for a in elem.xpath(".//ol/li/a"):
+                    url = a.get("href").strip()
+                    title = a.text_content().strip()
+                    # '203740.html' -> '203740'
+                    chap_id = url.split(".")[0]
+                    curr_vol["chapters"].append(
+                        {
+                            "title": title,
+                            "url": url,
+                            "chapterId": chap_id,
+                        }
+                    )
+
+        # append last volume if not empty
+        if curr_vol["chapters"]:
+            volumes.append(curr_vol)
+
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "update_time": update_time,
+            "summary": summary,
+            "volumes": volumes,
+            "word_count": word_count,
+            "serial_status": serial_status,
+            "extra": {},
+        }
+
+    def parse_chapter(
+        self,
+        html_list: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        if not html_list:
+            return None
+
+        tree = html.fromstring(html_list[0])
+        title = self._first_str(tree.xpath("//h1/text()"))
+        if not title:
+            return None
+
+        # strip book-name prefix if present
+        bc = tree.xpath('//div[@id="breadCrumb"]//a/text()')
+        if len(bc) >= 2:
+            book_name = bc[1].strip()
+            title = title.removeprefix(book_name).lstrip(" ::–—-").strip()
+
+        anchors = tree.xpath('//div[@id="BookSee_Right"]')
+        if not anchors:
+            return None
+        marker = anchors[0]
+
+        lines: list[str] = []
+
+        def _append_text(text: str) -> None:
+            for ln in text.replace("\xa0", " ").splitlines():
+                ln2 = ln.strip()
+                if ln2:
+                    lines.append(ln2)
+
+        if marker.tail:
+            _append_text(marker.tail)
+
+        # 4. Walk through siblings until <!--over-->
+        node = marker
+        while True:
+            sib = node.getnext()
+            if sib is None:
+                break
+            node = sib
+
+            # Stop on the closing comment
+            if isinstance(sib, etree._Comment) and "over" in (sib.text or ""):
+                break
+
+            # Process comment tails (e.g. <!--go--> tail)
+            if isinstance(sib, etree._Comment):
+                if sib.tail:
+                    _append_text(sib.tail)
+                continue
+
+            if isinstance(sib, html.HtmlElement):
+                # tag = sib.tag.lower()
+                tag = str(sib.tag).lower()
+                cls = sib.get("class", "") or ""
+
+                if tag == "div" and "divimage" in cls:
+                    srcs = sib.xpath(".//img/@src")
+                    if srcs:
+                        lines.append(f'<img src="{srcs[0]}" />')
+                    # text after the div
+                    if sib.tail:
+                        _append_text(sib.tail)
+                    continue
+
+                if tag == "br":
+                    if sib.tail:
+                        _append_text(sib.tail)
+                    continue
+
+                text = sib.text_content()
+                _append_text(text)
+                if sib.tail:
+                    _append_text(sib.tail)
+                continue
+
+        content = "\n".join(lines)
+        if not content:
+            return None
+
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {"site": "shencou"},
+        }
```
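The sibling walk in `parse_chapter` leans on two lxml behaviors: comments show up in the parsed tree as `etree._Comment` nodes, and text that follows a node is stored on that node's `.tail` rather than in the parent. A standalone sketch of the same traversal over toy markup (the markup itself is made up):

```python
from lxml import etree, html

doc = html.fromstring(
    '<div><div id="BookSee_Right"></div>intro tail<!--go-->after go'
    "<br/>line two<p>para</p><!--over--><p>not collected</p></div>"
)
marker = doc.xpath('//div[@id="BookSee_Right"]')[0]

lines: list[str] = []
if marker.tail and marker.tail.strip():
    lines.append(marker.tail.strip())      # text right after the marker div

node = marker
while (sib := node.getnext()) is not None:
    node = sib
    if isinstance(sib, etree._Comment):
        if "over" in (sib.text or ""):     # <!--over--> ends the chapter body
            break
        if sib.tail and sib.tail.strip():  # text trailing <!--go-->
            lines.append(sib.tail.strip())
        continue
    text = sib.text_content().strip()
    if text:
        lines.append(text)
    if sib.tail and sib.tail.strip():      # text between this element and the next
        lines.append(sib.tail.strip())

print(lines)  # ['intro tail', 'after go', 'line two', 'para']
```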
**novel_downloader/core/parsers/shuhaige.py** (new file)

```diff
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.shuhaige
+--------------------------------------
+
+"""
+
+from typing import Any
+
+from lxml import html
+
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.core.parsers.registry import register_parser
+from novel_downloader.models import (
+    BookInfoDict,
+    ChapterDict,
+    ChapterInfoDict,
+    VolumeInfoDict,
+)
+
+
+@register_parser(
+    site_keys=["shuhaige"],
+)
+class ShuhaigeParser(BaseParser):
+    """
+    Parser for 书海阁小说网 book pages.
+    """
+
+    def parse_book_info(
+        self,
+        html_list: list[str],
+        **kwargs: Any,
+    ) -> BookInfoDict | None:
+        if not html_list:
+            return None
+
+        tree = html.fromstring(html_list[0])
+
+        book_name = self._first_str(tree.xpath('//div[@id="info"]/h1/text()'))
+        author = self._first_str(tree.xpath('//div[@id="info"]/p[1]/a/text()'))
+
+        cover_url = self._first_str(tree.xpath('//div[@id="fmimg"]/img/@src'))
+
+        update_time = self._first_str(
+            tree.xpath('//div[@id="info"]/p[3]/text()'),
+            replaces=[("最后更新:", "")],
+        )
+
+        summary = self._first_str(tree.xpath('//div[@id="intro"]/p[1]/text()'))
+
+        book_type = self._first_str(tree.xpath('//div[@class="con_top"]/a[2]/text()'))
+        tags = [book_type] if book_type else []
+
+        chapters: list[ChapterInfoDict] = [
+            {
+                "title": (a.text or "").strip(),
+                "url": (a.get("href") or "").strip(),
+                "chapterId": (a.get("href") or "").rsplit("/", 1)[-1].split(".", 1)[0],
+            }
+            for a in tree.xpath(
+                '//div[@id="list"]/dl/dt[contains(., "正文")]/following-sibling::dd/a'
+            )
+        ]
+
+        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]
+
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "update_time": update_time,
+            "tags": tags,
+            "summary": summary,
+            "volumes": volumes,
+            "extra": {},
+        }
+
+    def parse_chapter(
+        self,
+        html_list: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        if not html_list:
+            return None
+        tree = html.fromstring(html_list[0])
+
+        title = self._first_str(tree.xpath('//div[@class="bookname"]/h1/text()'))
+        if not title:
+            title = f"第 {chapter_id} 章"
+
+        content_elem = tree.xpath('//div[@id="content"]')
+        if not content_elem:
+            return None
+        paragraphs = [
+            "".join(p.itertext()).strip() for p in content_elem[0].xpath(".//p")
+        ]
+        if paragraphs and "www.shuhaige.net" in paragraphs[-1]:
+            paragraphs.pop()
+
+        content = "\n".join(paragraphs)
+        if not content.strip():
+            return None
+
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {"site": "shuhaige"},
+        }
```
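With the flat registry in place, wiring any of these new parsers up is uniform. A hypothetical end-to-end sketch (it assumes `ParserConfig` is default-constructible and that the page HTML has already been fetched; neither is shown in this diff):

```python
from novel_downloader.core.parsers.registry import get_parser
from novel_downloader.models import ParserConfig

book_page_html = "<html>...</html>"     # fetched book-info page (placeholder)
chapter_page_html = "<html>...</html>"  # fetched chapter page (placeholder)

parser = get_parser("shuhaige", ParserConfig())  # assumption: defaults suffice
book_info = parser.parse_book_info([book_page_html])
chapter = parser.parse_chapter([chapter_page_html], chapter_id="12345")
```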