novel-downloader 1.4.5__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/__init__.py +2 -4
- novel_downloader/cli/clean.py +21 -88
- novel_downloader/cli/config.py +27 -104
- novel_downloader/cli/download.py +78 -66
- novel_downloader/cli/export.py +20 -21
- novel_downloader/cli/main.py +3 -1
- novel_downloader/cli/search.py +120 -0
- novel_downloader/cli/ui.py +156 -0
- novel_downloader/config/__init__.py +10 -14
- novel_downloader/config/adapter.py +195 -99
- novel_downloader/config/{loader.py → file_io.py} +53 -27
- novel_downloader/core/__init__.py +14 -13
- novel_downloader/core/archived/deqixs/fetcher.py +115 -0
- novel_downloader/core/archived/deqixs/parser.py +132 -0
- novel_downloader/core/archived/deqixs/searcher.py +89 -0
- novel_downloader/core/archived/qidian/searcher.py +79 -0
- novel_downloader/core/archived/wanbengo/searcher.py +98 -0
- novel_downloader/core/archived/xshbook/searcher.py +93 -0
- novel_downloader/core/downloaders/__init__.py +8 -30
- novel_downloader/core/downloaders/base.py +182 -30
- novel_downloader/core/downloaders/common.py +217 -384
- novel_downloader/core/downloaders/qianbi.py +332 -4
- novel_downloader/core/downloaders/qidian.py +250 -290
- novel_downloader/core/downloaders/registry.py +69 -0
- novel_downloader/core/downloaders/signals.py +46 -0
- novel_downloader/core/exporters/__init__.py +8 -26
- novel_downloader/core/exporters/base.py +107 -31
- novel_downloader/core/exporters/common/__init__.py +3 -4
- novel_downloader/core/exporters/common/epub.py +92 -171
- novel_downloader/core/exporters/common/main_exporter.py +14 -67
- novel_downloader/core/exporters/common/txt.py +90 -86
- novel_downloader/core/exporters/epub_util.py +184 -1327
- novel_downloader/core/exporters/linovelib/__init__.py +3 -2
- novel_downloader/core/exporters/linovelib/epub.py +165 -222
- novel_downloader/core/exporters/linovelib/main_exporter.py +10 -71
- novel_downloader/core/exporters/linovelib/txt.py +76 -66
- novel_downloader/core/exporters/qidian.py +15 -11
- novel_downloader/core/exporters/registry.py +55 -0
- novel_downloader/core/exporters/txt_util.py +67 -0
- novel_downloader/core/fetchers/__init__.py +57 -56
- novel_downloader/core/fetchers/aaatxt.py +83 -0
- novel_downloader/core/fetchers/{biquge/session.py → b520.py} +10 -10
- novel_downloader/core/fetchers/{base/session.py → base.py} +63 -47
- novel_downloader/core/fetchers/biquyuedu.py +83 -0
- novel_downloader/core/fetchers/dxmwx.py +110 -0
- novel_downloader/core/fetchers/eightnovel.py +139 -0
- novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +23 -11
- novel_downloader/core/fetchers/guidaye.py +85 -0
- novel_downloader/core/fetchers/hetushu.py +92 -0
- novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +22 -26
- novel_downloader/core/fetchers/ixdzs8.py +113 -0
- novel_downloader/core/fetchers/jpxs123.py +101 -0
- novel_downloader/core/fetchers/{biquge/browser.py → lewenn.py} +15 -15
- novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +16 -12
- novel_downloader/core/fetchers/piaotia.py +105 -0
- novel_downloader/core/fetchers/qbtr.py +101 -0
- novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +9 -9
- novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +55 -40
- novel_downloader/core/fetchers/quanben5.py +92 -0
- novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
- novel_downloader/core/fetchers/registry.py +60 -0
- novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +11 -9
- novel_downloader/core/fetchers/shencou.py +106 -0
- novel_downloader/core/fetchers/{common/browser.py → shuhaige.py} +24 -19
- novel_downloader/core/fetchers/tongrenquan.py +84 -0
- novel_downloader/core/fetchers/ttkan.py +95 -0
- novel_downloader/core/fetchers/{common/session.py → wanbengo.py} +21 -17
- novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
- novel_downloader/core/fetchers/xiguashuwu.py +177 -0
- novel_downloader/core/fetchers/xs63b.py +171 -0
- novel_downloader/core/fetchers/xshbook.py +85 -0
- novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +23 -11
- novel_downloader/core/fetchers/yibige.py +114 -0
- novel_downloader/core/interfaces/__init__.py +8 -14
- novel_downloader/core/interfaces/downloader.py +6 -2
- novel_downloader/core/interfaces/exporter.py +7 -7
- novel_downloader/core/interfaces/fetcher.py +4 -17
- novel_downloader/core/interfaces/parser.py +5 -6
- novel_downloader/core/interfaces/searcher.py +26 -0
- novel_downloader/core/parsers/__init__.py +58 -22
- novel_downloader/core/parsers/aaatxt.py +132 -0
- novel_downloader/core/parsers/b520.py +116 -0
- novel_downloader/core/parsers/base.py +63 -12
- novel_downloader/core/parsers/biquyuedu.py +133 -0
- novel_downloader/core/parsers/dxmwx.py +162 -0
- novel_downloader/core/parsers/eightnovel.py +224 -0
- novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +67 -67
- novel_downloader/core/parsers/guidaye.py +128 -0
- novel_downloader/core/parsers/hetushu.py +139 -0
- novel_downloader/core/parsers/i25zw.py +137 -0
- novel_downloader/core/parsers/ixdzs8.py +186 -0
- novel_downloader/core/parsers/jpxs123.py +137 -0
- novel_downloader/core/parsers/lewenn.py +142 -0
- novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +54 -65
- novel_downloader/core/parsers/piaotia.py +189 -0
- novel_downloader/core/parsers/qbtr.py +136 -0
- novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +54 -51
- novel_downloader/core/parsers/qidian/__init__.py +2 -2
- novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +290 -346
- novel_downloader/core/parsers/qidian/chapter_normal.py +25 -56
- novel_downloader/core/parsers/qidian/main_parser.py +19 -57
- novel_downloader/core/parsers/qidian/utils/__init__.py +12 -11
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +6 -7
- novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
- novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
- novel_downloader/core/parsers/quanben5.py +103 -0
- novel_downloader/core/parsers/registry.py +57 -0
- novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +46 -48
- novel_downloader/core/parsers/shencou.py +215 -0
- novel_downloader/core/parsers/shuhaige.py +111 -0
- novel_downloader/core/parsers/tongrenquan.py +116 -0
- novel_downloader/core/parsers/ttkan.py +132 -0
- novel_downloader/core/parsers/wanbengo.py +191 -0
- novel_downloader/core/parsers/xiaoshuowu.py +173 -0
- novel_downloader/core/parsers/xiguashuwu.py +435 -0
- novel_downloader/core/parsers/xs63b.py +161 -0
- novel_downloader/core/parsers/xshbook.py +134 -0
- novel_downloader/core/parsers/yamibo.py +155 -0
- novel_downloader/core/parsers/yibige.py +166 -0
- novel_downloader/core/searchers/__init__.py +51 -0
- novel_downloader/core/searchers/aaatxt.py +107 -0
- novel_downloader/core/searchers/b520.py +84 -0
- novel_downloader/core/searchers/base.py +168 -0
- novel_downloader/core/searchers/dxmwx.py +105 -0
- novel_downloader/core/searchers/eightnovel.py +84 -0
- novel_downloader/core/searchers/esjzone.py +102 -0
- novel_downloader/core/searchers/hetushu.py +92 -0
- novel_downloader/core/searchers/i25zw.py +93 -0
- novel_downloader/core/searchers/ixdzs8.py +107 -0
- novel_downloader/core/searchers/jpxs123.py +107 -0
- novel_downloader/core/searchers/piaotia.py +100 -0
- novel_downloader/core/searchers/qbtr.py +106 -0
- novel_downloader/core/searchers/qianbi.py +165 -0
- novel_downloader/core/searchers/quanben5.py +144 -0
- novel_downloader/core/searchers/registry.py +79 -0
- novel_downloader/core/searchers/shuhaige.py +124 -0
- novel_downloader/core/searchers/tongrenquan.py +110 -0
- novel_downloader/core/searchers/ttkan.py +92 -0
- novel_downloader/core/searchers/xiaoshuowu.py +122 -0
- novel_downloader/core/searchers/xiguashuwu.py +95 -0
- novel_downloader/core/searchers/xs63b.py +104 -0
- novel_downloader/locales/en.json +36 -79
- novel_downloader/locales/zh.json +37 -80
- novel_downloader/models/__init__.py +23 -50
- novel_downloader/models/book.py +44 -0
- novel_downloader/models/config.py +16 -43
- novel_downloader/models/login.py +1 -1
- novel_downloader/models/search.py +21 -0
- novel_downloader/resources/config/settings.toml +39 -74
- novel_downloader/resources/css_styles/intro.css +83 -0
- novel_downloader/resources/css_styles/main.css +30 -89
- novel_downloader/resources/json/xiguashuwu.json +718 -0
- novel_downloader/utils/__init__.py +43 -0
- novel_downloader/utils/chapter_storage.py +247 -226
- novel_downloader/utils/constants.py +5 -50
- novel_downloader/utils/cookies.py +6 -18
- novel_downloader/utils/crypto_utils/__init__.py +13 -0
- novel_downloader/utils/crypto_utils/aes_util.py +90 -0
- novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
- novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
- novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
- novel_downloader/utils/epub/__init__.py +34 -0
- novel_downloader/utils/epub/builder.py +377 -0
- novel_downloader/utils/epub/constants.py +118 -0
- novel_downloader/utils/epub/documents.py +297 -0
- novel_downloader/utils/epub/models.py +120 -0
- novel_downloader/utils/epub/utils.py +179 -0
- novel_downloader/utils/file_utils/__init__.py +5 -30
- novel_downloader/utils/file_utils/io.py +9 -150
- novel_downloader/utils/file_utils/normalize.py +2 -2
- novel_downloader/utils/file_utils/sanitize.py +2 -7
- novel_downloader/utils/fontocr.py +207 -0
- novel_downloader/utils/i18n.py +2 -0
- novel_downloader/utils/logger.py +10 -16
- novel_downloader/utils/network.py +111 -252
- novel_downloader/utils/state.py +5 -90
- novel_downloader/utils/text_utils/__init__.py +16 -21
- novel_downloader/utils/text_utils/diff_display.py +6 -9
- novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
- novel_downloader/utils/text_utils/text_cleaner.py +179 -0
- novel_downloader/utils/text_utils/truncate_utils.py +62 -0
- novel_downloader/utils/time_utils/__init__.py +6 -12
- novel_downloader/utils/time_utils/datetime_utils.py +23 -33
- novel_downloader/utils/time_utils/sleep_utils.py +5 -10
- novel_downloader/web/__init__.py +13 -0
- novel_downloader/web/components/__init__.py +11 -0
- novel_downloader/web/components/navigation.py +35 -0
- novel_downloader/web/main.py +66 -0
- novel_downloader/web/pages/__init__.py +17 -0
- novel_downloader/web/pages/download.py +78 -0
- novel_downloader/web/pages/progress.py +147 -0
- novel_downloader/web/pages/search.py +329 -0
- novel_downloader/web/services/__init__.py +17 -0
- novel_downloader/web/services/client_dialog.py +164 -0
- novel_downloader/web/services/cred_broker.py +113 -0
- novel_downloader/web/services/cred_models.py +35 -0
- novel_downloader/web/services/task_manager.py +264 -0
- novel_downloader-2.0.0.dist-info/METADATA +171 -0
- novel_downloader-2.0.0.dist-info/RECORD +210 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
- novel_downloader/config/site_rules.py +0 -94
- novel_downloader/core/downloaders/biquge.py +0 -25
- novel_downloader/core/downloaders/esjzone.py +0 -25
- novel_downloader/core/downloaders/linovelib.py +0 -25
- novel_downloader/core/downloaders/sfacg.py +0 -25
- novel_downloader/core/downloaders/yamibo.py +0 -25
- novel_downloader/core/exporters/biquge.py +0 -25
- novel_downloader/core/exporters/esjzone.py +0 -25
- novel_downloader/core/exporters/qianbi.py +0 -25
- novel_downloader/core/exporters/sfacg.py +0 -25
- novel_downloader/core/exporters/yamibo.py +0 -25
- novel_downloader/core/factory/__init__.py +0 -20
- novel_downloader/core/factory/downloader.py +0 -73
- novel_downloader/core/factory/exporter.py +0 -58
- novel_downloader/core/factory/fetcher.py +0 -96
- novel_downloader/core/factory/parser.py +0 -86
- novel_downloader/core/fetchers/base/__init__.py +0 -14
- novel_downloader/core/fetchers/base/browser.py +0 -403
- novel_downloader/core/fetchers/biquge/__init__.py +0 -14
- novel_downloader/core/fetchers/common/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/browser.py +0 -204
- novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
- novel_downloader/core/fetchers/linovelib/browser.py +0 -193
- novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/browser.py +0 -318
- novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
- novel_downloader/core/fetchers/sfacg/browser.py +0 -189
- novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
- novel_downloader/core/fetchers/yamibo/browser.py +0 -229
- novel_downloader/core/parsers/biquge/__init__.py +0 -10
- novel_downloader/core/parsers/biquge/main_parser.py +0 -134
- novel_downloader/core/parsers/common/__init__.py +0 -13
- novel_downloader/core/parsers/common/helper.py +0 -323
- novel_downloader/core/parsers/common/main_parser.py +0 -106
- novel_downloader/core/parsers/esjzone/__init__.py +0 -10
- novel_downloader/core/parsers/linovelib/__init__.py +0 -10
- novel_downloader/core/parsers/qianbi/__init__.py +0 -10
- novel_downloader/core/parsers/sfacg/__init__.py +0 -10
- novel_downloader/core/parsers/yamibo/__init__.py +0 -10
- novel_downloader/core/parsers/yamibo/main_parser.py +0 -194
- novel_downloader/models/browser.py +0 -21
- novel_downloader/models/chapter.py +0 -25
- novel_downloader/models/site_rules.py +0 -99
- novel_downloader/models/tasks.py +0 -33
- novel_downloader/models/types.py +0 -15
- novel_downloader/resources/css_styles/volume-intro.css +0 -56
- novel_downloader/resources/json/replace_word_map.json +0 -4
- novel_downloader/resources/text/blacklist.txt +0 -22
- novel_downloader/tui/__init__.py +0 -7
- novel_downloader/tui/app.py +0 -32
- novel_downloader/tui/main.py +0 -17
- novel_downloader/tui/screens/__init__.py +0 -14
- novel_downloader/tui/screens/home.py +0 -198
- novel_downloader/tui/screens/login.py +0 -74
- novel_downloader/tui/styles/home_layout.tcss +0 -79
- novel_downloader/tui/widgets/richlog_handler.py +0 -24
- novel_downloader/utils/cache.py +0 -24
- novel_downloader/utils/fontocr/__init__.py +0 -22
- novel_downloader/utils/fontocr/model_loader.py +0 -69
- novel_downloader/utils/fontocr/ocr_v1.py +0 -303
- novel_downloader/utils/fontocr/ocr_v2.py +0 -752
- novel_downloader/utils/hash_store.py +0 -279
- novel_downloader/utils/hash_utils.py +0 -103
- novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
- novel_downloader/utils/text_utils/font_mapping.py +0 -28
- novel_downloader/utils/text_utils/text_cleaning.py +0 -107
- novel_downloader-1.4.5.dist-info/METADATA +0 -196
- novel_downloader-1.4.5.dist-info/RECORD +0 -165
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
2
|
"""
|
3
|
-
novel_downloader.core.parsers.esjzone
|
4
|
-
|
3
|
+
novel_downloader.core.parsers.esjzone
|
4
|
+
-------------------------------------
|
5
5
|
|
6
6
|
"""
|
7
7
|
|
@@ -11,22 +11,21 @@ from typing import Any
|
|
11
11
|
from lxml import html
|
12
12
|
|
13
13
|
from novel_downloader.core.parsers.base import BaseParser
|
14
|
-
from novel_downloader.
|
14
|
+
from novel_downloader.core.parsers.registry import register_parser
|
15
|
+
from novel_downloader.models import (
|
16
|
+
BookInfoDict,
|
17
|
+
ChapterDict,
|
18
|
+
VolumeInfoDict,
|
19
|
+
)
|
15
20
|
|
16
21
|
|
22
|
+
@register_parser(
|
23
|
+
site_keys=["esjzone"],
|
24
|
+
)
|
17
25
|
class EsjzoneParser(BaseParser):
|
18
|
-
"""
|
19
|
-
|
20
|
-
|
21
|
-
_BOOK_NAME_XPATH = '//h2[contains(@class, "text-normal")]/text()'
|
22
|
-
_AUTHOR_XPATH = '//li[strong[text()="作者:"]]/a/text()'
|
23
|
-
_COVER_URL_XPATH = '//div[contains(@class,"product-gallery")]//img/@src'
|
24
|
-
_UPDATE_TIME_XPATH = '//li[strong[text()="更新日期:"]]/text()'
|
25
|
-
_WORD_COUNT_XPATH = '//span[@id="txt"]/text()'
|
26
|
-
_TYPE_XPATH = '//li[strong[text()="類型:"]]/text()'
|
27
|
-
_ALT_NAME_XPATH = '//li[strong[text()="其他書名:"]]/text()'
|
28
|
-
_WEB_URL_XPATH = '//li[strong[text()="Web生肉:"]]/a/@href'
|
29
|
-
_SUMMARY_XPATH = '//div[@class="description"]/p//text()'
|
26
|
+
"""
|
27
|
+
Parser for esjzone book pages.
|
28
|
+
"""
|
30
29
|
|
31
30
|
# Chapter XPaths
|
32
31
|
_CHAPTER_TEXT_XPATH = 'string(//div[contains(@class, "forum-content")])'
|
@@ -35,14 +34,13 @@ class EsjzoneParser(BaseParser):
|
|
35
34
|
'//i[contains(@class, "icon-clock")]/following-sibling::text()',
|
36
35
|
'//i[contains(@class, "icon-pen-tool")]/following-sibling::text()',
|
37
36
|
]
|
38
|
-
|
39
37
|
_CHECK_FORUM_XPATH = '//div[@class="page-title"]//ul[@class="breadcrumbs"]/li[not(@class="slash")]//text()' # noqa: E501
|
40
38
|
|
41
39
|
def parse_book_info(
|
42
40
|
self,
|
43
41
|
html_list: list[str],
|
44
42
|
**kwargs: Any,
|
45
|
-
) ->
|
43
|
+
) -> BookInfoDict | None:
|
46
44
|
"""
|
47
45
|
Parse a book info page and extract metadata and chapter structure.
|
48
46
|
|
@@ -53,27 +51,40 @@ class EsjzoneParser(BaseParser):
|
|
53
51
|
:return: Parsed metadata and chapter structure as a dictionary.
|
54
52
|
"""
|
55
53
|
if not html_list or self._is_forum_page(html_list):
|
56
|
-
return
|
54
|
+
return None
|
55
|
+
|
57
56
|
tree = html.fromstring(html_list[0])
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
tree
|
57
|
+
|
58
|
+
# --- Basic metadata ---
|
59
|
+
book_name = self._first_str(
|
60
|
+
tree.xpath('//h2[contains(@class,"text-normal")]/text()')
|
61
|
+
)
|
62
|
+
author = self._first_str(tree.xpath('//li[strong[text()="作者:"]]/a/text()'))
|
63
|
+
cover_url = self._first_str(
|
64
|
+
tree.xpath('//div[contains(@class,"product-gallery")]//img/@src')
|
66
65
|
)
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
66
|
+
update_time = self._first_str(
|
67
|
+
tree.xpath('//li[strong[text()="更新日期:"]]/text()')
|
68
|
+
) # noqa: E501
|
69
|
+
word_count = self._first_str(
|
70
|
+
tree.xpath('//span[@id="txt"]/text()'), replaces=[(",", "")]
|
71
|
+
)
|
72
|
+
book_type = self._first_str(tree.xpath('//li[strong[text()="類型:"]]/text()'))
|
73
|
+
alt_name = self._first_str(
|
74
|
+
tree.xpath('//li[strong[text()="其他書名:"]]/text()')
|
75
|
+
) # noqa: E501
|
76
|
+
web_url = self._first_str(tree.xpath('//li[strong[text()="Web生肉:"]]/a/@href'))
|
77
|
+
|
78
|
+
# Summary paragraphs
|
71
79
|
paras = tree.xpath('//div[@class="description"]/p')
|
72
80
|
texts = [p.xpath("string()").strip() for p in paras]
|
73
|
-
|
81
|
+
summary = "\n".join(t for t in texts if t)
|
74
82
|
|
75
|
-
|
76
|
-
|
83
|
+
current_vol: VolumeInfoDict = {
|
84
|
+
"volume_name": "單卷",
|
85
|
+
"chapters": [],
|
86
|
+
}
|
87
|
+
volumes: list[VolumeInfoDict] = [current_vol]
|
77
88
|
|
78
89
|
def _is_garbage_title(name: str) -> bool:
|
79
90
|
stripped = name.strip()
|
@@ -84,25 +95,18 @@ class EsjzoneParser(BaseParser):
|
|
84
95
|
if _is_garbage_title(name):
|
85
96
|
return
|
86
97
|
name = name.strip() or "未命名卷"
|
87
|
-
if
|
98
|
+
if current_vol and current_vol["volume_name"] == name:
|
88
99
|
return
|
89
100
|
current_vol = {"volume_name": name, "chapters": []}
|
90
101
|
volumes.append(current_vol)
|
91
102
|
|
92
|
-
_start_volume("單卷")
|
93
|
-
|
94
|
-
# nodes = tree.xpath('//div[@id="chapterList"]/details') + tree.xpath(
|
95
|
-
# '//div[@id="chapterList"]/*[not(self::details)]'
|
96
|
-
# )
|
97
103
|
nodes = tree.xpath('//div[@id="chapterList"]/*')
|
98
|
-
|
99
104
|
for node in nodes:
|
100
105
|
tag = node.tag.lower()
|
101
106
|
|
102
107
|
if tag == "details":
|
103
108
|
# ---- DETAILS-based layout ----
|
104
|
-
|
105
|
-
vol_name = summary.text if summary is not None else "未命名卷"
|
109
|
+
vol_name = node.xpath("string(./summary)").strip() or "未命名卷"
|
106
110
|
_start_volume(vol_name)
|
107
111
|
|
108
112
|
# all chapters inside this details
|
@@ -111,7 +115,11 @@ class EsjzoneParser(BaseParser):
|
|
111
115
|
href = a.get("href", "")
|
112
116
|
chap_id = href.rstrip("/").split("/")[-1].split(".", 1)[0]
|
113
117
|
current_vol["chapters"].append(
|
114
|
-
{
|
118
|
+
{
|
119
|
+
"title": title,
|
120
|
+
"url": href,
|
121
|
+
"chapterId": chap_id,
|
122
|
+
}
|
115
123
|
)
|
116
124
|
|
117
125
|
elif (
|
@@ -134,9 +142,21 @@ class EsjzoneParser(BaseParser):
|
|
134
142
|
{"title": title, "url": href, "chapterId": chap_id}
|
135
143
|
)
|
136
144
|
volumes = [vol for vol in volumes if vol["chapters"]]
|
137
|
-
result["volumes"] = volumes
|
138
145
|
|
139
|
-
return
|
146
|
+
return {
|
147
|
+
"book_name": book_name,
|
148
|
+
"author": author,
|
149
|
+
"cover_url": cover_url,
|
150
|
+
"update_time": update_time,
|
151
|
+
"summary": summary,
|
152
|
+
"tags": [book_type],
|
153
|
+
"word_count": word_count,
|
154
|
+
"volumes": volumes,
|
155
|
+
"extra": {
|
156
|
+
"alt_name": alt_name,
|
157
|
+
"web_url": web_url,
|
158
|
+
},
|
159
|
+
}
|
140
160
|
|
141
161
|
def parse_chapter(
|
142
162
|
self,
|
@@ -144,16 +164,9 @@ class EsjzoneParser(BaseParser):
|
|
144
164
|
chapter_id: str,
|
145
165
|
**kwargs: Any,
|
146
166
|
) -> ChapterDict | None:
|
147
|
-
"""
|
148
|
-
Parse a single chapter page and extract clean text or simplified HTML.
|
149
|
-
|
150
|
-
:param html_list: Raw HTML of the chapter page.
|
151
|
-
:param chapter_id: Identifier of the chapter being parsed.
|
152
|
-
:return: Cleaned chapter content as plain text or minimal HTML.
|
153
|
-
"""
|
154
167
|
if not html_list or self._is_forum_page(html_list):
|
155
168
|
return None
|
156
|
-
tree = html.fromstring(html_list[0]
|
169
|
+
tree = html.fromstring(html_list[0])
|
157
170
|
|
158
171
|
content_lines: list[str] = []
|
159
172
|
content_nodes = tree.xpath(self._CHAPTER_CONTENT_NODES_XPATH)
|
@@ -173,7 +186,7 @@ class EsjzoneParser(BaseParser):
|
|
173
186
|
content_lines.append(f'<img src="{src}" />')
|
174
187
|
|
175
188
|
content = (
|
176
|
-
"\n
|
189
|
+
"\n".join(content_lines).strip()
|
177
190
|
if content_lines
|
178
191
|
else tree.xpath(self._CHAPTER_TEXT_XPATH).strip()
|
179
192
|
)
|
@@ -211,16 +224,3 @@ class EsjzoneParser(BaseParser):
|
|
211
224
|
breadcrumb: list[str] = tree.xpath(self._CHECK_FORUM_XPATH)
|
212
225
|
breadcrumb = [s.strip() for s in breadcrumb if s.strip()]
|
213
226
|
return breadcrumb == ["Home", "論壇"]
|
214
|
-
|
215
|
-
@staticmethod
|
216
|
-
def _get_text(
|
217
|
-
tree: html.HtmlElement,
|
218
|
-
xpath: str,
|
219
|
-
join: bool = False,
|
220
|
-
clean_comma: bool = False,
|
221
|
-
) -> str:
|
222
|
-
data = tree.xpath(xpath)
|
223
|
-
if not data:
|
224
|
-
return ""
|
225
|
-
text = "\n".join(data) if join else data[0].strip()
|
226
|
-
return text.replace(",", "") if clean_comma else text
|
@@ -0,0 +1,128 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.guidaye
|
4
|
+
-------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
import re
|
9
|
+
from datetime import datetime
|
10
|
+
from typing import Any
|
11
|
+
|
12
|
+
from lxml import html
|
13
|
+
|
14
|
+
from novel_downloader.core.parsers.base import BaseParser
|
15
|
+
from novel_downloader.core.parsers.registry import register_parser
|
16
|
+
from novel_downloader.models import (
|
17
|
+
BookInfoDict,
|
18
|
+
ChapterDict,
|
19
|
+
VolumeInfoDict,
|
20
|
+
)
|
21
|
+
|
22
|
+
|
23
|
+
@register_parser(
|
24
|
+
site_keys=["guidaye"],
|
25
|
+
)
|
26
|
+
class GuidayeParser(BaseParser):
|
27
|
+
"""
|
28
|
+
Parser for 名著阅读 book pages.
|
29
|
+
"""
|
30
|
+
|
31
|
+
BASE_URL = "https://b.guidaye.com"
|
32
|
+
|
33
|
+
def parse_book_info(
|
34
|
+
self,
|
35
|
+
html_list: list[str],
|
36
|
+
**kwargs: Any,
|
37
|
+
) -> BookInfoDict | None:
|
38
|
+
if not html_list:
|
39
|
+
return None
|
40
|
+
|
41
|
+
tree = html.fromstring(html_list[0])
|
42
|
+
|
43
|
+
# Book metadata
|
44
|
+
book_name = self._first_str(tree.xpath('//h1[@class="page-title"]/a/text()'))
|
45
|
+
author = self._first_str(
|
46
|
+
tree.xpath('//div[@id="category-description-author"]/a/text()')
|
47
|
+
)
|
48
|
+
cover_url = self.BASE_URL + self._first_str(
|
49
|
+
tree.xpath('//div[@id="category-description-image"]//img/@src')
|
50
|
+
)
|
51
|
+
|
52
|
+
# Summary paragraphs
|
53
|
+
summary = (
|
54
|
+
tree.xpath('string(//div[@id="category-description-text"])')
|
55
|
+
.replace("内容简介:", "", 1)
|
56
|
+
.strip()
|
57
|
+
)
|
58
|
+
|
59
|
+
# Chapter volumes & listings
|
60
|
+
volumes: list[VolumeInfoDict] = []
|
61
|
+
curr_vol: VolumeInfoDict = {"volume_name": "未命名卷", "chapters": []}
|
62
|
+
|
63
|
+
items = tree.xpath('//div[@class="entry-content"]/ul/*')
|
64
|
+
for elem in items:
|
65
|
+
if elem.tag.lower() == "h3":
|
66
|
+
# Flush previous volume
|
67
|
+
if curr_vol["chapters"]:
|
68
|
+
volumes.append(curr_vol)
|
69
|
+
curr_vol = {"volume_name": elem.text_content().strip(), "chapters": []}
|
70
|
+
elif elem.tag.lower() == "li":
|
71
|
+
link = elem.xpath(".//a")[0]
|
72
|
+
href = link.get("href", "").strip()
|
73
|
+
title = link.get("title", "").strip()
|
74
|
+
cid_match = re.search(r"/(\d+)\.html$", href)
|
75
|
+
chapter_id = cid_match.group(1) if cid_match else ""
|
76
|
+
curr_vol["chapters"].append(
|
77
|
+
{"title": title, "url": href, "chapterId": chapter_id}
|
78
|
+
)
|
79
|
+
|
80
|
+
# Append last volume
|
81
|
+
if curr_vol["chapters"]:
|
82
|
+
volumes.append(curr_vol)
|
83
|
+
|
84
|
+
# Timestamp of parsing
|
85
|
+
share_text = tree.xpath('string(//div[@id="category-description-share"])')
|
86
|
+
m = re.search(r"最近更新[::]\s*([\d-]+)", share_text)
|
87
|
+
update_time = m.group(1) if m else datetime.now().strftime("%Y-%m-%d")
|
88
|
+
|
89
|
+
return {
|
90
|
+
"book_name": book_name,
|
91
|
+
"author": author,
|
92
|
+
"cover_url": cover_url,
|
93
|
+
"update_time": update_time,
|
94
|
+
"summary": summary,
|
95
|
+
"volumes": volumes,
|
96
|
+
"extra": {},
|
97
|
+
}
|
98
|
+
|
99
|
+
def parse_chapter(
|
100
|
+
self,
|
101
|
+
html_list: list[str],
|
102
|
+
chapter_id: str,
|
103
|
+
**kwargs: Any,
|
104
|
+
) -> ChapterDict | None:
|
105
|
+
if not html_list:
|
106
|
+
return None
|
107
|
+
tree = html.fromstring(html_list[0])
|
108
|
+
|
109
|
+
# Title from entry-title
|
110
|
+
title = self._first_str(tree.xpath('//h1[@class="entry-title"]/text()'))
|
111
|
+
|
112
|
+
# Extract paragraphs within entry-content
|
113
|
+
full_text = tree.xpath('string(//div[@class="entry-content"])')
|
114
|
+
full_text = full_text.replace("\u00A0", " ")
|
115
|
+
|
116
|
+
# 3. Split into lines and clean up
|
117
|
+
lines = [line.strip() for line in full_text.splitlines() if line.strip()]
|
118
|
+
if not lines:
|
119
|
+
return None
|
120
|
+
|
121
|
+
content = "\n".join(lines)
|
122
|
+
|
123
|
+
return {
|
124
|
+
"id": chapter_id,
|
125
|
+
"title": title,
|
126
|
+
"content": content,
|
127
|
+
"extra": {"site": "guidaye"},
|
128
|
+
}
|
@@ -0,0 +1,139 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.hetushu
|
4
|
+
-------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
import re
|
9
|
+
from datetime import datetime
|
10
|
+
from typing import Any
|
11
|
+
|
12
|
+
from lxml import html
|
13
|
+
|
14
|
+
from novel_downloader.core.parsers.base import BaseParser
|
15
|
+
from novel_downloader.core.parsers.registry import register_parser
|
16
|
+
from novel_downloader.models import (
|
17
|
+
BookInfoDict,
|
18
|
+
ChapterDict,
|
19
|
+
VolumeInfoDict,
|
20
|
+
)
|
21
|
+
|
22
|
+
|
23
|
+
@register_parser(
|
24
|
+
site_keys=["hetushu"],
|
25
|
+
)
|
26
|
+
class HetushuParser(BaseParser):
|
27
|
+
"""
|
28
|
+
Parser for 和图书 book pages.
|
29
|
+
"""
|
30
|
+
|
31
|
+
BASE_URL = "https://www.hetushu.com"
|
32
|
+
|
33
|
+
def parse_book_info(
|
34
|
+
self,
|
35
|
+
html_list: list[str],
|
36
|
+
**kwargs: Any,
|
37
|
+
) -> BookInfoDict | None:
|
38
|
+
if not html_list:
|
39
|
+
return None
|
40
|
+
|
41
|
+
tree = html.fromstring(html_list[0])
|
42
|
+
|
43
|
+
# --- Metadata ---
|
44
|
+
book_name = self._first_str(
|
45
|
+
tree.xpath('//div[contains(@class,"book_info")]/h2/text()')
|
46
|
+
)
|
47
|
+
author = self._first_str(
|
48
|
+
tree.xpath(
|
49
|
+
'//div[contains(@class,"book_info")]/div[contains(.,"作者")]/a/text()'
|
50
|
+
)
|
51
|
+
)
|
52
|
+
cover_url = self.BASE_URL + self._first_str(
|
53
|
+
tree.xpath('//div[contains(@class,"book_info")]//img/@src')
|
54
|
+
)
|
55
|
+
|
56
|
+
cls_attr = self._first_str(
|
57
|
+
tree.xpath('//div[contains(@class,"book_info")]/@class')
|
58
|
+
)
|
59
|
+
serial_status = "已完结" if "finish" in cls_attr else "连载中"
|
60
|
+
|
61
|
+
tags = [
|
62
|
+
a.strip()
|
63
|
+
for a in tree.xpath('//dl[@class="tag"]//dd/a/text()')
|
64
|
+
if a.strip()
|
65
|
+
]
|
66
|
+
|
67
|
+
paras = tree.xpath('//div[@class="intro"]/p/text()')
|
68
|
+
summary = "\n".join(p.strip() for p in paras if p.strip())
|
69
|
+
|
70
|
+
# --- Chapter volumes & listings ---
|
71
|
+
volumes: list[VolumeInfoDict] = []
|
72
|
+
curr_vol: VolumeInfoDict = {"volume_name": "未命名卷", "chapters": []}
|
73
|
+
|
74
|
+
for elem in tree.xpath('//dl[@id="dir"]/*'):
|
75
|
+
if elem.tag == "dt":
|
76
|
+
# Start a new volume
|
77
|
+
if curr_vol["chapters"]:
|
78
|
+
volumes.append(curr_vol)
|
79
|
+
curr_vol = {
|
80
|
+
"volume_name": elem.text_content().strip(),
|
81
|
+
"chapters": [],
|
82
|
+
}
|
83
|
+
elif elem.tag == "dd":
|
84
|
+
link = elem.xpath(".//a")[0]
|
85
|
+
href = link.get("href", "").strip()
|
86
|
+
title = link.get("title", "").strip()
|
87
|
+
# Extract numeric chapterId from the URL
|
88
|
+
m = re.search(r"/book/\d+/(?P<id>\d+)\.html", href)
|
89
|
+
chapter_id = m.group("id") if m else ""
|
90
|
+
curr_vol["chapters"].append(
|
91
|
+
{"title": title, "url": href, "chapterId": chapter_id}
|
92
|
+
)
|
93
|
+
|
94
|
+
# Append the last volume if it has any chapters
|
95
|
+
if curr_vol["chapters"]:
|
96
|
+
volumes.append(curr_vol)
|
97
|
+
|
98
|
+
update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
99
|
+
|
100
|
+
return {
|
101
|
+
"book_name": book_name,
|
102
|
+
"author": author,
|
103
|
+
"cover_url": cover_url,
|
104
|
+
"update_time": update_time,
|
105
|
+
"serial_status": serial_status,
|
106
|
+
"tags": tags,
|
107
|
+
"summary": summary,
|
108
|
+
"volumes": volumes,
|
109
|
+
"extra": {},
|
110
|
+
}
|
111
|
+
|
112
|
+
def parse_chapter(
|
113
|
+
self,
|
114
|
+
html_list: list[str],
|
115
|
+
chapter_id: str,
|
116
|
+
**kwargs: Any,
|
117
|
+
) -> ChapterDict | None:
|
118
|
+
if not html_list:
|
119
|
+
return None
|
120
|
+
|
121
|
+
tree = html.fromstring(html_list[0])
|
122
|
+
|
123
|
+
title = self._first_str(
|
124
|
+
tree.xpath('//div[@id="content"]//h2[@class="h2"]/text()')
|
125
|
+
)
|
126
|
+
|
127
|
+
paras = tree.xpath('//div[@id="content"]/div[not(@class)]/text()')
|
128
|
+
paragraph_texts = [p.strip() for p in paras if p.strip()]
|
129
|
+
|
130
|
+
content = "\n".join(paragraph_texts)
|
131
|
+
if not content.strip():
|
132
|
+
return None
|
133
|
+
|
134
|
+
return {
|
135
|
+
"id": chapter_id,
|
136
|
+
"title": title,
|
137
|
+
"content": content,
|
138
|
+
"extra": {"site": "hetushu"},
|
139
|
+
}
|
@@ -0,0 +1,137 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.i25zw
|
4
|
+
-----------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
from typing import Any
|
9
|
+
|
10
|
+
from lxml import html
|
11
|
+
|
12
|
+
from novel_downloader.core.parsers.base import BaseParser
|
13
|
+
from novel_downloader.core.parsers.registry import register_parser
|
14
|
+
from novel_downloader.models import (
|
15
|
+
BookInfoDict,
|
16
|
+
ChapterDict,
|
17
|
+
ChapterInfoDict,
|
18
|
+
VolumeInfoDict,
|
19
|
+
)
|
20
|
+
|
21
|
+
|
22
|
+
@register_parser(
    site_keys=["i25zw"],
)
class I25zwParser(BaseParser):
    """
    Parser for 25中文网 (i25zw) book-info and chapter pages.
    """

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Build a BookInfoDict from an info page plus a catalog page.

        :param html_list: ``[info_page_html, catalog_page_html]``; both required.
        :return: Parsed metadata and chapter listing, or None when input is
            short or the catalog structure is missing.
        """
        if len(html_list) < 2:
            return None

        info_tree = html.fromstring(html_list[0])
        catalog_tree = html.fromstring(html_list[1])

        # Metadata extraction
        book_name = self._first_str(info_tree.xpath("//h1[@class='f21h']/text()"))
        author = self._first_str(info_tree.xpath("//h1[@class='f21h']/em/a/text()"))
        cover_url = self._first_str(info_tree.xpath("//div[@class='pic']/img/@src"))

        # Tags, status, word count, update time
        tag = self._first_str(
            info_tree.xpath("//b[contains(text(),'小说分类')]/parent::td/text()")
        )
        serial_status = self._first_str(
            info_tree.xpath("//b[contains(text(),'小说状态')]/parent::td/text()")
        )
        word_count = self._first_str(
            info_tree.xpath("//b[contains(text(),'全文字数')]/parent::td/text()")
        )
        raw_update = self._first_str(
            info_tree.xpath("//b[contains(text(),'更新时间')]/parent::td/text()")
        )
        # Site wraps the timestamp in parentheses; strip them from both ends.
        update_time = raw_update.strip("()")

        # Summary from styled intro div; drop the "关于<book>:" lead-in once.
        full_intro = info_tree.xpath("string(//div[@class='intro'][@style])").strip()
        summary = full_intro.replace(f"关于{book_name}:", "", 1).strip()

        # Chapter list extraction. Guard the lookup instead of indexing [0]
        # directly: an unexpected catalog page would otherwise raise
        # IndexError, while this method's contract is to return None.
        dl_nodes = catalog_tree.xpath("//div[@id='list']/dl")
        if not dl_nodes:
            return None
        dl = dl_nodes[0]

        # Full-text section dd's
        dds = dl.xpath("./dd[preceding-sibling::dt[1][contains(., '正文')]]/a")
        if not dds:
            # Fallback to second <dt>'s following <dd>
            dds = dl.xpath("./dt[2]/following-sibling::dd/a")

        chapters: list[ChapterInfoDict] = []
        for a in dds:
            url = a.get("href", "").strip()
            # '/311006/252845677.html' -> '252845677'
            chapters.append(
                {
                    "title": a.text_content().strip(),
                    "url": url,
                    "chapterId": url.rsplit("/", 1)[-1].split(".", 1)[0],
                }
            )
        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "word_count": word_count,
            "serial_status": serial_status,
            "tags": [tag] if tag else [],
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Build a ChapterDict from the first rendered chapter page.

        :param html_list: Rendered chapter HTML pages; only the first is used.
        :param chapter_id: Identifier recorded in the returned dict.
        :return: Parsed chapter, or None when input or content is empty.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        title_text = self._first_str(
            tree.xpath("//div[@class='zhangjieming']/h1/text()")
        )

        content_divs = tree.xpath("//div[@id='content']")
        if not content_divs:
            return None
        content_div = content_divs[0]

        # Only select direct <p> children to avoid nav links.
        paragraphs = [
            text
            for p in content_div.xpath("./p")
            if (text := p.text_content().strip())
        ]

        content_text = "\n".join(paragraphs)
        if not content_text.strip():
            return None

        return {
            "id": chapter_id,
            "title": title_text,
            "content": content_text,
            "extra": {"site": "i25zw"},
        }
|