novel-downloader 1.5.0__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/__init__.py +1 -3
- novel_downloader/cli/clean.py +21 -88
- novel_downloader/cli/config.py +26 -21
- novel_downloader/cli/download.py +79 -66
- novel_downloader/cli/export.py +17 -21
- novel_downloader/cli/main.py +1 -1
- novel_downloader/cli/search.py +62 -65
- novel_downloader/cli/ui.py +156 -0
- novel_downloader/config/__init__.py +8 -5
- novel_downloader/config/adapter.py +206 -209
- novel_downloader/config/{loader.py → file_io.py} +53 -26
- novel_downloader/core/__init__.py +5 -5
- novel_downloader/core/archived/deqixs/fetcher.py +115 -0
- novel_downloader/core/archived/deqixs/parser.py +132 -0
- novel_downloader/core/archived/deqixs/searcher.py +89 -0
- novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
- novel_downloader/core/archived/wanbengo/searcher.py +98 -0
- novel_downloader/core/archived/xshbook/searcher.py +93 -0
- novel_downloader/core/downloaders/__init__.py +3 -24
- novel_downloader/core/downloaders/base.py +49 -23
- novel_downloader/core/downloaders/common.py +191 -137
- novel_downloader/core/downloaders/qianbi.py +187 -146
- novel_downloader/core/downloaders/qidian.py +187 -141
- novel_downloader/core/downloaders/registry.py +4 -2
- novel_downloader/core/downloaders/signals.py +46 -0
- novel_downloader/core/exporters/__init__.py +3 -20
- novel_downloader/core/exporters/base.py +33 -37
- novel_downloader/core/exporters/common/__init__.py +1 -2
- novel_downloader/core/exporters/common/epub.py +15 -10
- novel_downloader/core/exporters/common/main_exporter.py +19 -12
- novel_downloader/core/exporters/common/txt.py +17 -12
- novel_downloader/core/exporters/epub_util.py +59 -29
- novel_downloader/core/exporters/linovelib/__init__.py +1 -0
- novel_downloader/core/exporters/linovelib/epub.py +23 -25
- novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
- novel_downloader/core/exporters/linovelib/txt.py +20 -14
- novel_downloader/core/exporters/qidian.py +2 -8
- novel_downloader/core/exporters/registry.py +4 -2
- novel_downloader/core/exporters/txt_util.py +7 -7
- novel_downloader/core/fetchers/__init__.py +54 -48
- novel_downloader/core/fetchers/aaatxt.py +83 -0
- novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
- novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
- novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
- novel_downloader/core/fetchers/dxmwx.py +110 -0
- novel_downloader/core/fetchers/eightnovel.py +139 -0
- novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
- novel_downloader/core/fetchers/guidaye.py +85 -0
- novel_downloader/core/fetchers/hetushu.py +92 -0
- novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
- novel_downloader/core/fetchers/ixdzs8.py +113 -0
- novel_downloader/core/fetchers/jpxs123.py +101 -0
- novel_downloader/core/fetchers/lewenn.py +83 -0
- novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
- novel_downloader/core/fetchers/piaotia.py +105 -0
- novel_downloader/core/fetchers/qbtr.py +101 -0
- novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
- novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +56 -64
- novel_downloader/core/fetchers/quanben5.py +92 -0
- novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
- novel_downloader/core/fetchers/registry.py +5 -16
- novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
- novel_downloader/core/fetchers/shencou.py +106 -0
- novel_downloader/core/fetchers/shuhaige.py +84 -0
- novel_downloader/core/fetchers/tongrenquan.py +84 -0
- novel_downloader/core/fetchers/ttkan.py +95 -0
- novel_downloader/core/fetchers/wanbengo.py +83 -0
- novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
- novel_downloader/core/fetchers/xiguashuwu.py +177 -0
- novel_downloader/core/fetchers/xs63b.py +171 -0
- novel_downloader/core/fetchers/xshbook.py +85 -0
- novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
- novel_downloader/core/fetchers/yibige.py +114 -0
- novel_downloader/core/interfaces/__init__.py +1 -9
- novel_downloader/core/interfaces/downloader.py +6 -2
- novel_downloader/core/interfaces/exporter.py +7 -7
- novel_downloader/core/interfaces/fetcher.py +6 -19
- novel_downloader/core/interfaces/parser.py +7 -8
- novel_downloader/core/interfaces/searcher.py +9 -1
- novel_downloader/core/parsers/__init__.py +49 -12
- novel_downloader/core/parsers/aaatxt.py +132 -0
- novel_downloader/core/parsers/b520.py +116 -0
- novel_downloader/core/parsers/base.py +64 -12
- novel_downloader/core/parsers/biquyuedu.py +133 -0
- novel_downloader/core/parsers/dxmwx.py +162 -0
- novel_downloader/core/parsers/eightnovel.py +224 -0
- novel_downloader/core/parsers/esjzone.py +64 -69
- novel_downloader/core/parsers/guidaye.py +128 -0
- novel_downloader/core/parsers/hetushu.py +139 -0
- novel_downloader/core/parsers/i25zw.py +137 -0
- novel_downloader/core/parsers/ixdzs8.py +186 -0
- novel_downloader/core/parsers/jpxs123.py +137 -0
- novel_downloader/core/parsers/lewenn.py +142 -0
- novel_downloader/core/parsers/linovelib.py +48 -64
- novel_downloader/core/parsers/piaotia.py +189 -0
- novel_downloader/core/parsers/qbtr.py +136 -0
- novel_downloader/core/parsers/qianbi.py +48 -50
- novel_downloader/core/parsers/qidian/main_parser.py +756 -48
- novel_downloader/core/parsers/qidian/utils/__init__.py +3 -21
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +4 -4
- novel_downloader/core/parsers/quanben5.py +103 -0
- novel_downloader/core/parsers/registry.py +5 -16
- novel_downloader/core/parsers/sfacg.py +38 -45
- novel_downloader/core/parsers/shencou.py +215 -0
- novel_downloader/core/parsers/shuhaige.py +111 -0
- novel_downloader/core/parsers/tongrenquan.py +116 -0
- novel_downloader/core/parsers/ttkan.py +132 -0
- novel_downloader/core/parsers/wanbengo.py +191 -0
- novel_downloader/core/parsers/xiaoshuowu.py +173 -0
- novel_downloader/core/parsers/xiguashuwu.py +429 -0
- novel_downloader/core/parsers/xs63b.py +161 -0
- novel_downloader/core/parsers/xshbook.py +134 -0
- novel_downloader/core/parsers/yamibo.py +87 -131
- novel_downloader/core/parsers/yibige.py +166 -0
- novel_downloader/core/searchers/__init__.py +34 -3
- novel_downloader/core/searchers/aaatxt.py +107 -0
- novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
- novel_downloader/core/searchers/base.py +112 -36
- novel_downloader/core/searchers/dxmwx.py +105 -0
- novel_downloader/core/searchers/eightnovel.py +84 -0
- novel_downloader/core/searchers/esjzone.py +43 -25
- novel_downloader/core/searchers/hetushu.py +92 -0
- novel_downloader/core/searchers/i25zw.py +93 -0
- novel_downloader/core/searchers/ixdzs8.py +107 -0
- novel_downloader/core/searchers/jpxs123.py +107 -0
- novel_downloader/core/searchers/piaotia.py +100 -0
- novel_downloader/core/searchers/qbtr.py +106 -0
- novel_downloader/core/searchers/qianbi.py +74 -40
- novel_downloader/core/searchers/quanben5.py +144 -0
- novel_downloader/core/searchers/registry.py +24 -8
- novel_downloader/core/searchers/shuhaige.py +124 -0
- novel_downloader/core/searchers/tongrenquan.py +110 -0
- novel_downloader/core/searchers/ttkan.py +92 -0
- novel_downloader/core/searchers/xiaoshuowu.py +122 -0
- novel_downloader/core/searchers/xiguashuwu.py +95 -0
- novel_downloader/core/searchers/xs63b.py +104 -0
- novel_downloader/locales/en.json +34 -85
- novel_downloader/locales/zh.json +35 -86
- novel_downloader/models/__init__.py +21 -22
- novel_downloader/models/book.py +44 -0
- novel_downloader/models/config.py +4 -37
- novel_downloader/models/login.py +1 -1
- novel_downloader/models/search.py +5 -0
- novel_downloader/resources/config/settings.toml +8 -70
- novel_downloader/resources/json/xiguashuwu.json +718 -0
- novel_downloader/utils/__init__.py +13 -24
- novel_downloader/utils/chapter_storage.py +5 -5
- novel_downloader/utils/constants.py +4 -31
- novel_downloader/utils/cookies.py +38 -35
- novel_downloader/utils/crypto_utils/__init__.py +7 -0
- novel_downloader/utils/crypto_utils/aes_util.py +90 -0
- novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
- novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
- novel_downloader/utils/crypto_utils/rc4.py +54 -0
- novel_downloader/utils/epub/__init__.py +3 -4
- novel_downloader/utils/epub/builder.py +6 -6
- novel_downloader/utils/epub/constants.py +62 -21
- novel_downloader/utils/epub/documents.py +95 -201
- novel_downloader/utils/epub/models.py +8 -22
- novel_downloader/utils/epub/utils.py +73 -106
- novel_downloader/utils/file_utils/__init__.py +2 -23
- novel_downloader/utils/file_utils/io.py +53 -188
- novel_downloader/utils/file_utils/normalize.py +1 -7
- novel_downloader/utils/file_utils/sanitize.py +4 -15
- novel_downloader/utils/fontocr/__init__.py +5 -14
- novel_downloader/utils/fontocr/core.py +216 -0
- novel_downloader/utils/fontocr/loader.py +50 -0
- novel_downloader/utils/logger.py +81 -65
- novel_downloader/utils/network.py +17 -41
- novel_downloader/utils/state.py +4 -90
- novel_downloader/utils/text_utils/__init__.py +1 -7
- novel_downloader/utils/text_utils/diff_display.py +5 -7
- novel_downloader/utils/text_utils/text_cleaner.py +39 -30
- novel_downloader/utils/text_utils/truncate_utils.py +3 -14
- novel_downloader/utils/time_utils/__init__.py +5 -11
- novel_downloader/utils/time_utils/datetime_utils.py +20 -29
- novel_downloader/utils/time_utils/sleep_utils.py +55 -49
- novel_downloader/web/__init__.py +13 -0
- novel_downloader/web/components/__init__.py +11 -0
- novel_downloader/web/components/navigation.py +35 -0
- novel_downloader/web/main.py +66 -0
- novel_downloader/web/pages/__init__.py +17 -0
- novel_downloader/web/pages/download.py +78 -0
- novel_downloader/web/pages/progress.py +147 -0
- novel_downloader/web/pages/search.py +329 -0
- novel_downloader/web/services/__init__.py +17 -0
- novel_downloader/web/services/client_dialog.py +164 -0
- novel_downloader/web/services/cred_broker.py +113 -0
- novel_downloader/web/services/cred_models.py +35 -0
- novel_downloader/web/services/task_manager.py +264 -0
- novel_downloader-2.0.1.dist-info/METADATA +172 -0
- novel_downloader-2.0.1.dist-info/RECORD +206 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/entry_points.txt +1 -1
- novel_downloader/core/downloaders/biquge.py +0 -29
- novel_downloader/core/downloaders/esjzone.py +0 -29
- novel_downloader/core/downloaders/linovelib.py +0 -29
- novel_downloader/core/downloaders/sfacg.py +0 -29
- novel_downloader/core/downloaders/yamibo.py +0 -29
- novel_downloader/core/exporters/biquge.py +0 -22
- novel_downloader/core/exporters/esjzone.py +0 -22
- novel_downloader/core/exporters/qianbi.py +0 -22
- novel_downloader/core/exporters/sfacg.py +0 -22
- novel_downloader/core/exporters/yamibo.py +0 -22
- novel_downloader/core/fetchers/base/__init__.py +0 -14
- novel_downloader/core/fetchers/base/browser.py +0 -422
- novel_downloader/core/fetchers/biquge/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/browser.py +0 -209
- novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
- novel_downloader/core/fetchers/linovelib/browser.py +0 -198
- novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/browser.py +0 -326
- novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
- novel_downloader/core/fetchers/sfacg/browser.py +0 -194
- novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
- novel_downloader/core/fetchers/yamibo/browser.py +0 -234
- novel_downloader/core/parsers/biquge.py +0 -139
- novel_downloader/core/parsers/qidian/book_info_parser.py +0 -90
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +0 -528
- novel_downloader/core/parsers/qidian/chapter_normal.py +0 -157
- novel_downloader/core/parsers/qidian/chapter_router.py +0 -68
- novel_downloader/core/parsers/qidian/utils/helpers.py +0 -114
- novel_downloader/models/chapter.py +0 -25
- novel_downloader/models/types.py +0 -13
- novel_downloader/tui/__init__.py +0 -7
- novel_downloader/tui/app.py +0 -32
- novel_downloader/tui/main.py +0 -17
- novel_downloader/tui/screens/__init__.py +0 -14
- novel_downloader/tui/screens/home.py +0 -198
- novel_downloader/tui/screens/login.py +0 -74
- novel_downloader/tui/styles/home_layout.tcss +0 -79
- novel_downloader/tui/widgets/richlog_handler.py +0 -24
- novel_downloader/utils/cache.py +0 -24
- novel_downloader/utils/crypto_utils.py +0 -71
- novel_downloader/utils/fontocr/hash_store.py +0 -280
- novel_downloader/utils/fontocr/hash_utils.py +0 -103
- novel_downloader/utils/fontocr/model_loader.py +0 -69
- novel_downloader/utils/fontocr/ocr_v1.py +0 -315
- novel_downloader/utils/fontocr/ocr_v2.py +0 -764
- novel_downloader/utils/fontocr/ocr_v3.py +0 -744
- novel_downloader-1.5.0.dist-info/METADATA +0 -196
- novel_downloader-1.5.0.dist-info/RECORD +0 -164
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/WHEEL +0 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/top_level.txt +0 -0
@@ -12,26 +12,20 @@ from lxml import html
|
|
12
12
|
|
13
13
|
from novel_downloader.core.parsers.base import BaseParser
|
14
14
|
from novel_downloader.core.parsers.registry import register_parser
|
15
|
-
from novel_downloader.models import
|
15
|
+
from novel_downloader.models import (
|
16
|
+
BookInfoDict,
|
17
|
+
ChapterDict,
|
18
|
+
VolumeInfoDict,
|
19
|
+
)
|
16
20
|
|
17
21
|
|
18
22
|
@register_parser(
|
19
23
|
site_keys=["esjzone"],
|
20
|
-
backends=["session", "browser"],
|
21
24
|
)
|
22
25
|
class EsjzoneParser(BaseParser):
|
23
|
-
"""
|
24
|
-
|
25
|
-
|
26
|
-
_BOOK_NAME_XPATH = '//h2[contains(@class, "text-normal")]/text()'
|
27
|
-
_AUTHOR_XPATH = '//li[strong[text()="作者:"]]/a/text()'
|
28
|
-
_COVER_URL_XPATH = '//div[contains(@class,"product-gallery")]//img/@src'
|
29
|
-
_UPDATE_TIME_XPATH = '//li[strong[text()="更新日期:"]]/text()'
|
30
|
-
_WORD_COUNT_XPATH = '//span[@id="txt"]/text()'
|
31
|
-
_TYPE_XPATH = '//li[strong[text()="類型:"]]/text()'
|
32
|
-
_ALT_NAME_XPATH = '//li[strong[text()="其他書名:"]]/text()'
|
33
|
-
_WEB_URL_XPATH = '//li[strong[text()="Web生肉:"]]/a/@href'
|
34
|
-
_SUMMARY_XPATH = '//div[@class="description"]/p//text()'
|
26
|
+
"""
|
27
|
+
Parser for esjzone book pages.
|
28
|
+
"""
|
35
29
|
|
36
30
|
# Chapter XPaths
|
37
31
|
_CHAPTER_TEXT_XPATH = 'string(//div[contains(@class, "forum-content")])'
|
@@ -40,14 +34,13 @@ class EsjzoneParser(BaseParser):
|
|
40
34
|
'//i[contains(@class, "icon-clock")]/following-sibling::text()',
|
41
35
|
'//i[contains(@class, "icon-pen-tool")]/following-sibling::text()',
|
42
36
|
]
|
43
|
-
|
44
37
|
_CHECK_FORUM_XPATH = '//div[@class="page-title"]//ul[@class="breadcrumbs"]/li[not(@class="slash")]//text()' # noqa: E501
|
45
38
|
|
46
39
|
def parse_book_info(
|
47
40
|
self,
|
48
41
|
html_list: list[str],
|
49
42
|
**kwargs: Any,
|
50
|
-
) ->
|
43
|
+
) -> BookInfoDict | None:
|
51
44
|
"""
|
52
45
|
Parse a book info page and extract metadata and chapter structure.
|
53
46
|
|
@@ -58,27 +51,40 @@ class EsjzoneParser(BaseParser):
|
|
58
51
|
:return: Parsed metadata and chapter structure as a dictionary.
|
59
52
|
"""
|
60
53
|
if not html_list or self._is_forum_page(html_list):
|
61
|
-
return
|
54
|
+
return None
|
55
|
+
|
62
56
|
tree = html.fromstring(html_list[0])
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
result["cover_url"] = self._get_text(tree, self._COVER_URL_XPATH)
|
68
|
-
result["update_time"] = self._get_text(tree, self._UPDATE_TIME_XPATH)
|
69
|
-
result["word_count"] = self._get_text(
|
70
|
-
tree, self._WORD_COUNT_XPATH, clean_comma=True
|
57
|
+
|
58
|
+
# --- Basic metadata ---
|
59
|
+
book_name = self._first_str(
|
60
|
+
tree.xpath('//h2[contains(@class,"text-normal")]/text()')
|
71
61
|
)
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
62
|
+
author = self._first_str(tree.xpath('//li[strong[text()="作者:"]]/a/text()'))
|
63
|
+
cover_url = self._first_str(
|
64
|
+
tree.xpath('//div[contains(@class,"product-gallery")]//img/@src')
|
65
|
+
)
|
66
|
+
update_time = self._first_str(
|
67
|
+
tree.xpath('//li[strong[text()="更新日期:"]]/text()')
|
68
|
+
) # noqa: E501
|
69
|
+
word_count = self._first_str(
|
70
|
+
tree.xpath('//span[@id="txt"]/text()'), replaces=[(",", "")]
|
71
|
+
)
|
72
|
+
book_type = self._first_str(tree.xpath('//li[strong[text()="類型:"]]/text()'))
|
73
|
+
alt_name = self._first_str(
|
74
|
+
tree.xpath('//li[strong[text()="其他書名:"]]/text()')
|
75
|
+
) # noqa: E501
|
76
|
+
web_url = self._first_str(tree.xpath('//li[strong[text()="Web生肉:"]]/a/@href'))
|
77
|
+
|
78
|
+
# Summary paragraphs
|
76
79
|
paras = tree.xpath('//div[@class="description"]/p')
|
77
80
|
texts = [p.xpath("string()").strip() for p in paras]
|
78
|
-
|
81
|
+
summary = "\n".join(t for t in texts if t)
|
79
82
|
|
80
|
-
|
81
|
-
|
83
|
+
current_vol: VolumeInfoDict = {
|
84
|
+
"volume_name": "單卷",
|
85
|
+
"chapters": [],
|
86
|
+
}
|
87
|
+
volumes: list[VolumeInfoDict] = [current_vol]
|
82
88
|
|
83
89
|
def _is_garbage_title(name: str) -> bool:
|
84
90
|
stripped = name.strip()
|
@@ -89,25 +95,18 @@ class EsjzoneParser(BaseParser):
|
|
89
95
|
if _is_garbage_title(name):
|
90
96
|
return
|
91
97
|
name = name.strip() or "未命名卷"
|
92
|
-
if
|
98
|
+
if current_vol and current_vol["volume_name"] == name:
|
93
99
|
return
|
94
100
|
current_vol = {"volume_name": name, "chapters": []}
|
95
101
|
volumes.append(current_vol)
|
96
102
|
|
97
|
-
_start_volume("單卷")
|
98
|
-
|
99
|
-
# nodes = tree.xpath('//div[@id="chapterList"]/details') + tree.xpath(
|
100
|
-
# '//div[@id="chapterList"]/*[not(self::details)]'
|
101
|
-
# )
|
102
103
|
nodes = tree.xpath('//div[@id="chapterList"]/*')
|
103
|
-
|
104
104
|
for node in nodes:
|
105
105
|
tag = node.tag.lower()
|
106
106
|
|
107
107
|
if tag == "details":
|
108
108
|
# ---- DETAILS-based layout ----
|
109
|
-
|
110
|
-
vol_name = summary.text if summary is not None else "未命名卷"
|
109
|
+
vol_name = node.xpath("string(./summary)").strip() or "未命名卷"
|
111
110
|
_start_volume(vol_name)
|
112
111
|
|
113
112
|
# all chapters inside this details
|
@@ -116,7 +115,11 @@ class EsjzoneParser(BaseParser):
|
|
116
115
|
href = a.get("href", "")
|
117
116
|
chap_id = href.rstrip("/").split("/")[-1].split(".", 1)[0]
|
118
117
|
current_vol["chapters"].append(
|
119
|
-
{
|
118
|
+
{
|
119
|
+
"title": title,
|
120
|
+
"url": href,
|
121
|
+
"chapterId": chap_id,
|
122
|
+
}
|
120
123
|
)
|
121
124
|
|
122
125
|
elif (
|
@@ -125,9 +128,9 @@ class EsjzoneParser(BaseParser):
|
|
125
128
|
or tag == "summary"
|
126
129
|
):
|
127
130
|
# Handle possible volume title markers:
|
128
|
-
#
|
129
|
-
#
|
130
|
-
#
|
131
|
+
# * <h2>: standard volume header
|
132
|
+
# * <p class="non">: alternative volume header style
|
133
|
+
# * <summary>: fallback for stray <summary> tags outside <details>
|
131
134
|
_start_volume(node.xpath("string()"))
|
132
135
|
|
133
136
|
elif tag == "a":
|
@@ -139,9 +142,21 @@ class EsjzoneParser(BaseParser):
|
|
139
142
|
{"title": title, "url": href, "chapterId": chap_id}
|
140
143
|
)
|
141
144
|
volumes = [vol for vol in volumes if vol["chapters"]]
|
142
|
-
result["volumes"] = volumes
|
143
145
|
|
144
|
-
return
|
146
|
+
return {
|
147
|
+
"book_name": book_name,
|
148
|
+
"author": author,
|
149
|
+
"cover_url": cover_url,
|
150
|
+
"update_time": update_time,
|
151
|
+
"summary": summary,
|
152
|
+
"tags": [book_type],
|
153
|
+
"word_count": word_count,
|
154
|
+
"volumes": volumes,
|
155
|
+
"extra": {
|
156
|
+
"alt_name": alt_name,
|
157
|
+
"web_url": web_url,
|
158
|
+
},
|
159
|
+
}
|
145
160
|
|
146
161
|
def parse_chapter(
|
147
162
|
self,
|
@@ -149,16 +164,9 @@ class EsjzoneParser(BaseParser):
|
|
149
164
|
chapter_id: str,
|
150
165
|
**kwargs: Any,
|
151
166
|
) -> ChapterDict | None:
|
152
|
-
"""
|
153
|
-
Parse a single chapter page and extract clean text or simplified HTML.
|
154
|
-
|
155
|
-
:param html_list: Raw HTML of the chapter page.
|
156
|
-
:param chapter_id: Identifier of the chapter being parsed.
|
157
|
-
:return: Cleaned chapter content as plain text or minimal HTML.
|
158
|
-
"""
|
159
167
|
if not html_list or self._is_forum_page(html_list):
|
160
168
|
return None
|
161
|
-
tree = html.fromstring(html_list[0]
|
169
|
+
tree = html.fromstring(html_list[0])
|
162
170
|
|
163
171
|
content_lines: list[str] = []
|
164
172
|
content_nodes = tree.xpath(self._CHAPTER_CONTENT_NODES_XPATH)
|
@@ -178,7 +186,7 @@ class EsjzoneParser(BaseParser):
|
|
178
186
|
content_lines.append(f'<img src="{src}" />')
|
179
187
|
|
180
188
|
content = (
|
181
|
-
"\n
|
189
|
+
"\n".join(content_lines).strip()
|
182
190
|
if content_lines
|
183
191
|
else tree.xpath(self._CHAPTER_TEXT_XPATH).strip()
|
184
192
|
)
|
@@ -216,16 +224,3 @@ class EsjzoneParser(BaseParser):
|
|
216
224
|
breadcrumb: list[str] = tree.xpath(self._CHECK_FORUM_XPATH)
|
217
225
|
breadcrumb = [s.strip() for s in breadcrumb if s.strip()]
|
218
226
|
return breadcrumb == ["Home", "論壇"]
|
219
|
-
|
220
|
-
@staticmethod
|
221
|
-
def _get_text(
|
222
|
-
tree: html.HtmlElement,
|
223
|
-
xpath: str,
|
224
|
-
join: bool = False,
|
225
|
-
clean_comma: bool = False,
|
226
|
-
) -> str:
|
227
|
-
data = tree.xpath(xpath)
|
228
|
-
if not data:
|
229
|
-
return ""
|
230
|
-
text = "\n".join(data) if join else data[0].strip()
|
231
|
-
return text.replace(",", "") if clean_comma else text
|
@@ -0,0 +1,128 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.guidaye
|
4
|
+
-------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
import re
|
9
|
+
from datetime import datetime
|
10
|
+
from typing import Any
|
11
|
+
|
12
|
+
from lxml import html
|
13
|
+
|
14
|
+
from novel_downloader.core.parsers.base import BaseParser
|
15
|
+
from novel_downloader.core.parsers.registry import register_parser
|
16
|
+
from novel_downloader.models import (
|
17
|
+
BookInfoDict,
|
18
|
+
ChapterDict,
|
19
|
+
VolumeInfoDict,
|
20
|
+
)
|
21
|
+
|
22
|
+
|
23
|
+
@register_parser(
    site_keys=["guidaye"],
)
class GuidayeParser(BaseParser):
    """
    Parser for 名著阅读 book pages.
    """

    BASE_URL = "https://b.guidaye.com"

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse a book info page into metadata and a volume/chapter listing.

        :param html_list: Raw HTML pages; only the first entry is read.
        :return: Parsed book info, or ``None`` when ``html_list`` is empty.
        """
        if not html_list:
            return None

        # Imported locally so this class does not rely on a module-level import.
        from urllib.parse import urljoin

        tree = html.fromstring(html_list[0])

        # Book metadata
        book_name = self._first_str(tree.xpath('//h1[@class="page-title"]/a/text()'))
        author = self._first_str(
            tree.xpath('//div[@id="category-description-author"]/a/text()')
        )
        cover_src = self._first_str(
            tree.xpath('//div[@id="category-description-image"]//img/@src')
        )
        # Resolve against the site root; a missing cover stays empty instead of
        # degrading to the bare site URL, and absolute sources are kept intact.
        cover_url = urljoin(self.BASE_URL, cover_src) if cover_src else ""

        # Summary text with the leading label removed
        summary = (
            tree.xpath('string(//div[@id="category-description-text"])')
            .replace("内容简介:", "", 1)
            .strip()
        )

        # Chapter volumes & listings: <h3> starts a volume, <li> holds a chapter
        volumes: list[VolumeInfoDict] = []
        curr_vol: VolumeInfoDict = {"volume_name": "未命名卷", "chapters": []}

        for elem in tree.xpath('//div[@class="entry-content"]/ul/*'):
            tag = elem.tag.lower()
            if tag == "h3":
                # Flush the previous volume only if it collected chapters
                if curr_vol["chapters"]:
                    volumes.append(curr_vol)
                curr_vol = {
                    "volume_name": elem.text_content().strip(),
                    "chapters": [],
                }
            elif tag == "li":
                links = elem.xpath(".//a")
                if not links:
                    # An <li> without a link carries no chapter; skip it
                    continue
                link = links[0]
                href = link.get("href", "").strip()
                # Fall back to the anchor text when the title attribute is absent
                title = link.get("title", "").strip() or link.text_content().strip()
                cid_match = re.search(r"/(\d+)\.html$", href)
                chapter_id = cid_match.group(1) if cid_match else ""
                curr_vol["chapters"].append(
                    {"title": title, "url": href, "chapterId": chapter_id}
                )

        # Append last volume
        if curr_vol["chapters"]:
            volumes.append(curr_vol)

        # Last-update date shown on the page; falls back to today's date
        share_text = tree.xpath('string(//div[@id="category-description-share"])')
        m = re.search(r"最近更新[::]\s*([\d-]+)", share_text)
        update_time = m.group(1) if m else datetime.now().strftime("%Y-%m-%d")

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter page into its title and newline-joined text.

        :param html_list: Raw HTML pages; only the first entry is read.
        :param chapter_id: Identifier stored in the returned dict as ``id``.
        :return: Chapter dict, or ``None`` when there is no input or no text.
        """
        if not html_list:
            return None
        tree = html.fromstring(html_list[0])

        # Title from entry-title
        title = self._first_str(tree.xpath('//h1[@class="entry-title"]/text()'))

        # Whole text of entry-content, with non-breaking spaces normalized
        full_text = tree.xpath('string(//div[@class="entry-content"])')
        full_text = full_text.replace("\u00A0", " ")

        # Split into lines and drop the blank ones
        lines = [line.strip() for line in full_text.splitlines() if line.strip()]
        if not lines:
            return None

        content = "\n".join(lines)

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "guidaye"},
        }
|
@@ -0,0 +1,139 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.hetushu
|
4
|
+
-------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
import re
|
9
|
+
from datetime import datetime
|
10
|
+
from typing import Any
|
11
|
+
|
12
|
+
from lxml import html
|
13
|
+
|
14
|
+
from novel_downloader.core.parsers.base import BaseParser
|
15
|
+
from novel_downloader.core.parsers.registry import register_parser
|
16
|
+
from novel_downloader.models import (
|
17
|
+
BookInfoDict,
|
18
|
+
ChapterDict,
|
19
|
+
VolumeInfoDict,
|
20
|
+
)
|
21
|
+
|
22
|
+
|
23
|
+
@register_parser(
    site_keys=["hetushu"],
)
class HetushuParser(BaseParser):
    """
    Parser for 和图书 book pages.
    """

    BASE_URL = "https://www.hetushu.com"

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse a book info page into metadata and a volume/chapter listing.

        :param html_list: Raw HTML pages; only the first entry is read.
        :return: Parsed book info, or ``None`` when ``html_list`` is empty.
        """
        if not html_list:
            return None

        # Imported locally so this class does not rely on a module-level import.
        from urllib.parse import urljoin

        tree = html.fromstring(html_list[0])

        # --- Metadata ---
        book_name = self._first_str(
            tree.xpath('//div[contains(@class,"book_info")]/h2/text()')
        )
        author = self._first_str(
            tree.xpath(
                '//div[contains(@class,"book_info")]/div[contains(.,"作者")]/a/text()'
            )
        )
        cover_src = self._first_str(
            tree.xpath('//div[contains(@class,"book_info")]//img/@src')
        )
        # Resolve against the site root; a missing cover stays empty instead of
        # degrading to the bare site URL, and absolute sources are kept intact.
        cover_url = urljoin(self.BASE_URL, cover_src) if cover_src else ""

        # Status is derived from the class attribute of the book_info container
        cls_attr = self._first_str(
            tree.xpath('//div[contains(@class,"book_info")]/@class')
        )
        serial_status = "已完结" if "finish" in cls_attr else "连载中"

        tags = [
            a.strip()
            for a in tree.xpath('//dl[@class="tag"]//dd/a/text()')
            if a.strip()
        ]

        paras = tree.xpath('//div[@class="intro"]/p/text()')
        summary = "\n".join(p.strip() for p in paras if p.strip())

        # --- Chapter volumes & listings: <dt> starts a volume, <dd> is a chapter ---
        volumes: list[VolumeInfoDict] = []
        curr_vol: VolumeInfoDict = {"volume_name": "未命名卷", "chapters": []}

        for elem in tree.xpath('//dl[@id="dir"]/*'):
            if elem.tag == "dt":
                # Start a new volume, keeping the previous one if it has chapters
                if curr_vol["chapters"]:
                    volumes.append(curr_vol)
                curr_vol = {
                    "volume_name": elem.text_content().strip(),
                    "chapters": [],
                }
            elif elem.tag == "dd":
                links = elem.xpath(".//a")
                if not links:
                    # A <dd> without a link carries no chapter; skip it
                    continue
                link = links[0]
                href = link.get("href", "").strip()
                # Fall back to the anchor text when the title attribute is absent
                title = link.get("title", "").strip() or link.text_content().strip()
                # Extract numeric chapterId from the URL
                m = re.search(r"/book/\d+/(?P<id>\d+)\.html", href)
                chapter_id = m.group("id") if m else ""
                curr_vol["chapters"].append(
                    {"title": title, "url": href, "chapterId": chapter_id}
                )

        # Append the last volume if it has any chapters
        if curr_vol["chapters"]:
            volumes.append(curr_vol)

        # No update time is read from the page; the parse time is recorded
        update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "serial_status": serial_status,
            "tags": tags,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter page into its title and newline-joined text.

        :param html_list: Raw HTML pages; only the first entry is read.
        :param chapter_id: Identifier stored in the returned dict as ``id``.
        :return: Chapter dict, or ``None`` when there is no input or no text.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        title = self._first_str(
            tree.xpath('//div[@id="content"]//h2[@class="h2"]/text()')
        )

        # Direct text nodes of class-less <div> children of #content
        paras = tree.xpath('//div[@id="content"]/div[not(@class)]/text()')
        paragraph_texts = [p.strip() for p in paras if p.strip()]

        content = "\n".join(paragraph_texts)
        if not content.strip():
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "hetushu"},
        }
|
@@ -0,0 +1,137 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.i25zw
|
4
|
+
-----------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
from typing import Any
|
9
|
+
|
10
|
+
from lxml import html
|
11
|
+
|
12
|
+
from novel_downloader.core.parsers.base import BaseParser
|
13
|
+
from novel_downloader.core.parsers.registry import register_parser
|
14
|
+
from novel_downloader.models import (
|
15
|
+
BookInfoDict,
|
16
|
+
ChapterDict,
|
17
|
+
ChapterInfoDict,
|
18
|
+
VolumeInfoDict,
|
19
|
+
)
|
20
|
+
|
21
|
+
|
22
|
+
@register_parser(
    site_keys=["i25zw"],
)
class I25zwParser(BaseParser):
    """
    Parser for 25中文网 book-info pages.
    """

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse the info page and the catalog page into book metadata.

        :param html_list: Raw HTML pages; the first entry is read as the info
            page and the second as the catalog page.
        :return: Parsed book info, or ``None`` when fewer than two pages are
            given or the catalog page has no chapter list container.
        """
        if len(html_list) < 2:
            return None

        info_tree = html.fromstring(html_list[0])
        catalog_tree = html.fromstring(html_list[1])

        # Metadata extraction
        book_name = self._first_str(info_tree.xpath("//h1[@class='f21h']/text()"))
        author = self._first_str(info_tree.xpath("//h1[@class='f21h']/em/a/text()"))
        cover_url = self._first_str(info_tree.xpath("//div[@class='pic']/img/@src"))

        # Tags, status, word count, update time
        tag = self._first_str(
            info_tree.xpath("//b[contains(text(),'小说分类')]/parent::td/text()")
        )
        serial_status = self._first_str(
            info_tree.xpath("//b[contains(text(),'小说状态')]/parent::td/text()")
        )
        word_count = self._first_str(
            info_tree.xpath("//b[contains(text(),'全文字数')]/parent::td/text()")
        )
        raw_update = self._first_str(
            info_tree.xpath("//b[contains(text(),'更新时间')]/parent::td/text()")
        )
        # Drop the parentheses surrounding the update time
        update_time = raw_update.strip("()")

        # Summary from styled intro div, with the leading label removed
        full_intro = info_tree.xpath("string(//div[@class='intro'][@style])").strip()
        summary = full_intro.replace(f"关于{book_name}:", "", 1).strip()

        # Chapter list extraction
        dl_nodes = catalog_tree.xpath("//div[@id='list']/dl")
        if not dl_nodes:
            # Without the list container there is nothing to index; report a
            # parse failure instead of raising IndexError.
            return None
        dl = dl_nodes[0]

        # Full-text section dd's
        anchors = dl.xpath("./dd[preceding-sibling::dt[1][contains(., '正文')]]/a")
        if not anchors:
            # Fallback to second <dt>'s following <dd>
            anchors = dl.xpath("./dt[2]/following-sibling::dd/a")

        chapters: list[ChapterInfoDict] = []
        for a in anchors:
            url = a.get("href", "").strip()
            title = a.text_content().strip()
            # '/311006/252845677.html' -> '252845677'
            chapter_id = url.split("/")[-1].split(".")[0]
            chapters.append(
                {
                    "title": title,
                    "url": url,
                    "chapterId": chapter_id,
                }
            )
        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "word_count": word_count,
            "serial_status": serial_status,
            "tags": [tag] if tag else [],
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter page into its title and newline-joined paragraphs.

        :param html_list: Raw HTML pages; only the first entry is read.
        :param chapter_id: Identifier stored in the returned dict as ``id``.
        :return: Chapter dict, or ``None`` when there is no input, no
            ``#content`` container, or no paragraph text.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        title_text = self._first_str(
            tree.xpath("//div[@class='zhangjieming']/h1/text()")
        )

        content_divs = tree.xpath("//div[@id='content']")
        if not content_divs:
            return None
        content_div = content_divs[0]

        # Only select direct <p> children to avoid nav links
        stripped = (p.text_content().strip() for p in content_div.xpath("./p"))
        paragraphs = [text for text in stripped if text]

        content_text = "\n".join(paragraphs)
        if not content_text.strip():
            return None

        return {
            "id": chapter_id,
            "title": title_text,
            "content": content_text,
            "extra": {"site": "i25zw"},
        }
|