novel-downloader 1.5.0__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/__init__.py +1 -3
- novel_downloader/cli/clean.py +21 -88
- novel_downloader/cli/config.py +26 -21
- novel_downloader/cli/download.py +79 -66
- novel_downloader/cli/export.py +17 -21
- novel_downloader/cli/main.py +1 -1
- novel_downloader/cli/search.py +62 -65
- novel_downloader/cli/ui.py +156 -0
- novel_downloader/config/__init__.py +8 -5
- novel_downloader/config/adapter.py +206 -209
- novel_downloader/config/{loader.py → file_io.py} +53 -26
- novel_downloader/core/__init__.py +5 -5
- novel_downloader/core/archived/deqixs/fetcher.py +115 -0
- novel_downloader/core/archived/deqixs/parser.py +132 -0
- novel_downloader/core/archived/deqixs/searcher.py +89 -0
- novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
- novel_downloader/core/archived/wanbengo/searcher.py +98 -0
- novel_downloader/core/archived/xshbook/searcher.py +93 -0
- novel_downloader/core/downloaders/__init__.py +3 -24
- novel_downloader/core/downloaders/base.py +49 -23
- novel_downloader/core/downloaders/common.py +191 -137
- novel_downloader/core/downloaders/qianbi.py +187 -146
- novel_downloader/core/downloaders/qidian.py +187 -141
- novel_downloader/core/downloaders/registry.py +4 -2
- novel_downloader/core/downloaders/signals.py +46 -0
- novel_downloader/core/exporters/__init__.py +3 -20
- novel_downloader/core/exporters/base.py +33 -37
- novel_downloader/core/exporters/common/__init__.py +1 -2
- novel_downloader/core/exporters/common/epub.py +15 -10
- novel_downloader/core/exporters/common/main_exporter.py +19 -12
- novel_downloader/core/exporters/common/txt.py +17 -12
- novel_downloader/core/exporters/epub_util.py +59 -29
- novel_downloader/core/exporters/linovelib/__init__.py +1 -0
- novel_downloader/core/exporters/linovelib/epub.py +23 -25
- novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
- novel_downloader/core/exporters/linovelib/txt.py +20 -14
- novel_downloader/core/exporters/qidian.py +2 -8
- novel_downloader/core/exporters/registry.py +4 -2
- novel_downloader/core/exporters/txt_util.py +7 -7
- novel_downloader/core/fetchers/__init__.py +54 -48
- novel_downloader/core/fetchers/aaatxt.py +83 -0
- novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
- novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
- novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
- novel_downloader/core/fetchers/dxmwx.py +110 -0
- novel_downloader/core/fetchers/eightnovel.py +139 -0
- novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
- novel_downloader/core/fetchers/guidaye.py +85 -0
- novel_downloader/core/fetchers/hetushu.py +92 -0
- novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
- novel_downloader/core/fetchers/ixdzs8.py +113 -0
- novel_downloader/core/fetchers/jpxs123.py +101 -0
- novel_downloader/core/fetchers/lewenn.py +83 -0
- novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
- novel_downloader/core/fetchers/piaotia.py +105 -0
- novel_downloader/core/fetchers/qbtr.py +101 -0
- novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
- novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +56 -64
- novel_downloader/core/fetchers/quanben5.py +92 -0
- novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
- novel_downloader/core/fetchers/registry.py +5 -16
- novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
- novel_downloader/core/fetchers/shencou.py +106 -0
- novel_downloader/core/fetchers/shuhaige.py +84 -0
- novel_downloader/core/fetchers/tongrenquan.py +84 -0
- novel_downloader/core/fetchers/ttkan.py +95 -0
- novel_downloader/core/fetchers/wanbengo.py +83 -0
- novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
- novel_downloader/core/fetchers/xiguashuwu.py +177 -0
- novel_downloader/core/fetchers/xs63b.py +171 -0
- novel_downloader/core/fetchers/xshbook.py +85 -0
- novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
- novel_downloader/core/fetchers/yibige.py +114 -0
- novel_downloader/core/interfaces/__init__.py +1 -9
- novel_downloader/core/interfaces/downloader.py +6 -2
- novel_downloader/core/interfaces/exporter.py +7 -7
- novel_downloader/core/interfaces/fetcher.py +6 -19
- novel_downloader/core/interfaces/parser.py +7 -8
- novel_downloader/core/interfaces/searcher.py +9 -1
- novel_downloader/core/parsers/__init__.py +49 -12
- novel_downloader/core/parsers/aaatxt.py +132 -0
- novel_downloader/core/parsers/b520.py +116 -0
- novel_downloader/core/parsers/base.py +64 -12
- novel_downloader/core/parsers/biquyuedu.py +133 -0
- novel_downloader/core/parsers/dxmwx.py +162 -0
- novel_downloader/core/parsers/eightnovel.py +224 -0
- novel_downloader/core/parsers/esjzone.py +64 -69
- novel_downloader/core/parsers/guidaye.py +128 -0
- novel_downloader/core/parsers/hetushu.py +139 -0
- novel_downloader/core/parsers/i25zw.py +137 -0
- novel_downloader/core/parsers/ixdzs8.py +186 -0
- novel_downloader/core/parsers/jpxs123.py +137 -0
- novel_downloader/core/parsers/lewenn.py +142 -0
- novel_downloader/core/parsers/linovelib.py +48 -64
- novel_downloader/core/parsers/piaotia.py +189 -0
- novel_downloader/core/parsers/qbtr.py +136 -0
- novel_downloader/core/parsers/qianbi.py +48 -50
- novel_downloader/core/parsers/qidian/main_parser.py +756 -48
- novel_downloader/core/parsers/qidian/utils/__init__.py +3 -21
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +4 -4
- novel_downloader/core/parsers/quanben5.py +103 -0
- novel_downloader/core/parsers/registry.py +5 -16
- novel_downloader/core/parsers/sfacg.py +38 -45
- novel_downloader/core/parsers/shencou.py +215 -0
- novel_downloader/core/parsers/shuhaige.py +111 -0
- novel_downloader/core/parsers/tongrenquan.py +116 -0
- novel_downloader/core/parsers/ttkan.py +132 -0
- novel_downloader/core/parsers/wanbengo.py +191 -0
- novel_downloader/core/parsers/xiaoshuowu.py +173 -0
- novel_downloader/core/parsers/xiguashuwu.py +429 -0
- novel_downloader/core/parsers/xs63b.py +161 -0
- novel_downloader/core/parsers/xshbook.py +134 -0
- novel_downloader/core/parsers/yamibo.py +87 -131
- novel_downloader/core/parsers/yibige.py +166 -0
- novel_downloader/core/searchers/__init__.py +34 -3
- novel_downloader/core/searchers/aaatxt.py +107 -0
- novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
- novel_downloader/core/searchers/base.py +112 -36
- novel_downloader/core/searchers/dxmwx.py +105 -0
- novel_downloader/core/searchers/eightnovel.py +84 -0
- novel_downloader/core/searchers/esjzone.py +43 -25
- novel_downloader/core/searchers/hetushu.py +92 -0
- novel_downloader/core/searchers/i25zw.py +93 -0
- novel_downloader/core/searchers/ixdzs8.py +107 -0
- novel_downloader/core/searchers/jpxs123.py +107 -0
- novel_downloader/core/searchers/piaotia.py +100 -0
- novel_downloader/core/searchers/qbtr.py +106 -0
- novel_downloader/core/searchers/qianbi.py +74 -40
- novel_downloader/core/searchers/quanben5.py +144 -0
- novel_downloader/core/searchers/registry.py +24 -8
- novel_downloader/core/searchers/shuhaige.py +124 -0
- novel_downloader/core/searchers/tongrenquan.py +110 -0
- novel_downloader/core/searchers/ttkan.py +92 -0
- novel_downloader/core/searchers/xiaoshuowu.py +122 -0
- novel_downloader/core/searchers/xiguashuwu.py +95 -0
- novel_downloader/core/searchers/xs63b.py +104 -0
- novel_downloader/locales/en.json +34 -85
- novel_downloader/locales/zh.json +35 -86
- novel_downloader/models/__init__.py +21 -22
- novel_downloader/models/book.py +44 -0
- novel_downloader/models/config.py +4 -37
- novel_downloader/models/login.py +1 -1
- novel_downloader/models/search.py +5 -0
- novel_downloader/resources/config/settings.toml +8 -70
- novel_downloader/resources/json/xiguashuwu.json +718 -0
- novel_downloader/utils/__init__.py +13 -24
- novel_downloader/utils/chapter_storage.py +5 -5
- novel_downloader/utils/constants.py +4 -31
- novel_downloader/utils/cookies.py +38 -35
- novel_downloader/utils/crypto_utils/__init__.py +7 -0
- novel_downloader/utils/crypto_utils/aes_util.py +90 -0
- novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
- novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
- novel_downloader/utils/crypto_utils/rc4.py +54 -0
- novel_downloader/utils/epub/__init__.py +3 -4
- novel_downloader/utils/epub/builder.py +6 -6
- novel_downloader/utils/epub/constants.py +62 -21
- novel_downloader/utils/epub/documents.py +95 -201
- novel_downloader/utils/epub/models.py +8 -22
- novel_downloader/utils/epub/utils.py +73 -106
- novel_downloader/utils/file_utils/__init__.py +2 -23
- novel_downloader/utils/file_utils/io.py +53 -188
- novel_downloader/utils/file_utils/normalize.py +1 -7
- novel_downloader/utils/file_utils/sanitize.py +4 -15
- novel_downloader/utils/fontocr/__init__.py +5 -14
- novel_downloader/utils/fontocr/core.py +216 -0
- novel_downloader/utils/fontocr/loader.py +50 -0
- novel_downloader/utils/logger.py +81 -65
- novel_downloader/utils/network.py +17 -41
- novel_downloader/utils/state.py +4 -90
- novel_downloader/utils/text_utils/__init__.py +1 -7
- novel_downloader/utils/text_utils/diff_display.py +5 -7
- novel_downloader/utils/text_utils/text_cleaner.py +39 -30
- novel_downloader/utils/text_utils/truncate_utils.py +3 -14
- novel_downloader/utils/time_utils/__init__.py +5 -11
- novel_downloader/utils/time_utils/datetime_utils.py +20 -29
- novel_downloader/utils/time_utils/sleep_utils.py +55 -49
- novel_downloader/web/__init__.py +13 -0
- novel_downloader/web/components/__init__.py +11 -0
- novel_downloader/web/components/navigation.py +35 -0
- novel_downloader/web/main.py +66 -0
- novel_downloader/web/pages/__init__.py +17 -0
- novel_downloader/web/pages/download.py +78 -0
- novel_downloader/web/pages/progress.py +147 -0
- novel_downloader/web/pages/search.py +329 -0
- novel_downloader/web/services/__init__.py +17 -0
- novel_downloader/web/services/client_dialog.py +164 -0
- novel_downloader/web/services/cred_broker.py +113 -0
- novel_downloader/web/services/cred_models.py +35 -0
- novel_downloader/web/services/task_manager.py +264 -0
- novel_downloader-2.0.1.dist-info/METADATA +172 -0
- novel_downloader-2.0.1.dist-info/RECORD +206 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/entry_points.txt +1 -1
- novel_downloader/core/downloaders/biquge.py +0 -29
- novel_downloader/core/downloaders/esjzone.py +0 -29
- novel_downloader/core/downloaders/linovelib.py +0 -29
- novel_downloader/core/downloaders/sfacg.py +0 -29
- novel_downloader/core/downloaders/yamibo.py +0 -29
- novel_downloader/core/exporters/biquge.py +0 -22
- novel_downloader/core/exporters/esjzone.py +0 -22
- novel_downloader/core/exporters/qianbi.py +0 -22
- novel_downloader/core/exporters/sfacg.py +0 -22
- novel_downloader/core/exporters/yamibo.py +0 -22
- novel_downloader/core/fetchers/base/__init__.py +0 -14
- novel_downloader/core/fetchers/base/browser.py +0 -422
- novel_downloader/core/fetchers/biquge/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/browser.py +0 -209
- novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
- novel_downloader/core/fetchers/linovelib/browser.py +0 -198
- novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/browser.py +0 -326
- novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
- novel_downloader/core/fetchers/sfacg/browser.py +0 -194
- novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
- novel_downloader/core/fetchers/yamibo/browser.py +0 -234
- novel_downloader/core/parsers/biquge.py +0 -139
- novel_downloader/core/parsers/qidian/book_info_parser.py +0 -90
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +0 -528
- novel_downloader/core/parsers/qidian/chapter_normal.py +0 -157
- novel_downloader/core/parsers/qidian/chapter_router.py +0 -68
- novel_downloader/core/parsers/qidian/utils/helpers.py +0 -114
- novel_downloader/models/chapter.py +0 -25
- novel_downloader/models/types.py +0 -13
- novel_downloader/tui/__init__.py +0 -7
- novel_downloader/tui/app.py +0 -32
- novel_downloader/tui/main.py +0 -17
- novel_downloader/tui/screens/__init__.py +0 -14
- novel_downloader/tui/screens/home.py +0 -198
- novel_downloader/tui/screens/login.py +0 -74
- novel_downloader/tui/styles/home_layout.tcss +0 -79
- novel_downloader/tui/widgets/richlog_handler.py +0 -24
- novel_downloader/utils/cache.py +0 -24
- novel_downloader/utils/crypto_utils.py +0 -71
- novel_downloader/utils/fontocr/hash_store.py +0 -280
- novel_downloader/utils/fontocr/hash_utils.py +0 -103
- novel_downloader/utils/fontocr/model_loader.py +0 -69
- novel_downloader/utils/fontocr/ocr_v1.py +0 -315
- novel_downloader/utils/fontocr/ocr_v2.py +0 -764
- novel_downloader/utils/fontocr/ocr_v3.py +0 -744
- novel_downloader-1.5.0.dist-info/METADATA +0 -196
- novel_downloader-1.5.0.dist-info/RECORD +0 -164
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/WHEEL +0 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,133 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.biquyuedu
|
4
|
+
---------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
from typing import Any
|
9
|
+
|
10
|
+
from lxml import etree, html
|
11
|
+
|
12
|
+
from novel_downloader.core.parsers.base import BaseParser
|
13
|
+
from novel_downloader.core.parsers.registry import register_parser
|
14
|
+
from novel_downloader.models import (
|
15
|
+
BookInfoDict,
|
16
|
+
ChapterDict,
|
17
|
+
ChapterInfoDict,
|
18
|
+
VolumeInfoDict,
|
19
|
+
)
|
20
|
+
|
21
|
+
|
22
|
+
@register_parser(
    site_keys=["biquyuedu"],
)
class BiquyueduParser(BaseParser):
    """
    Parser for 精彩小说 (biquyuedu.com) book-info and chapter pages.
    """

    # Ad snippets stripped from chapter text (matched by BaseParser._is_ad_line).
    ADS: set[str] = {
        "笔趣阁",
        "请记住本书首发域名",
        "www.biquyuedu.com",
    }

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse a book-info page into structured metadata.

        :param html_list: ``[book_info_page_html]``.
        :return: book info dict, or ``None`` when *html_list* is empty.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        # --- Metadata ---
        book_name = self._first_str(tree.xpath("//div[@class='info']/h1/text()"))
        author = self._first_str(
            tree.xpath(
                "//div[@class='info']//div[@class='small'][1]//span[1]//a/text()"
            )
        )
        cover_url = self._first_str(
            tree.xpath("//div[@class='info']//div[@class='cover']//img/@src")
        )
        update_time = self._first_str(
            tree.xpath("//div[@class='info']//div[@class='small'][2]//span[1]/text()"),
            replaces=[("更新时间:", "")],
        )

        # Breadcrumb is "site / category / book"; the category doubles as a tag.
        crumbs = tree.xpath("//div[@class='path']//div[@class='p']/a/text()")
        book_type = self._first_str(crumbs[1:2])
        tags = [book_type] if book_type else []

        # The intro block carries a "简介:" prefix and a trailing "作者:" section.
        intro_text = tree.xpath(
            "string(//div[@class='info']//div[@class='intro'])"
        ).strip()
        summary = intro_text.replace("简介:", "", 1).split("作者:", 1)[0].strip()

        # --- Chapters ---
        # Keep only <dd> links that follow the <dt> labelled '全文' (the full
        # chapter list), skipping the "latest chapters" preview section.
        chapters: list[ChapterInfoDict] = [
            {
                "title": (a.get("title") or a.text_content() or "").strip(),
                "url": (a.get("href") or "").strip(),
                # ".../12345.html" -> "12345"
                "chapterId": (a.get("href") or "").rsplit("/", 1)[-1].split(".", 1)[0],
            }
            for a in tree.xpath(
                "//div[@class='listmain']//dl/dd[preceding-sibling::dt[1][contains(text(),'全文')]]/a"
            )
        ]

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "tags": tags,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter page into title + cleaned text content.

        :param html_list: ``[chapter_page_html]``.
        :param chapter_id: identifier of the chapter being parsed.
        :return: chapter dict, or ``None`` when no usable content is found.
        """
        if not html_list:
            return None
        tree = html.fromstring(html_list[0])

        # Chapter title lives in the page header, not the content container.
        title = self._first_str(tree.xpath("//div[@class='content']/h1/text()"))

        content_nodes = tree.xpath("//div[@id='content']")
        if not content_nodes:
            return None
        content_div = content_nodes[0]

        # Drop inline <script> blocks before harvesting text nodes.
        etree.strip_elements(content_div, "script", with_tail=False)
        raw_texts = content_div.xpath(".//text()[normalize-space()]")

        # Clean NBSPs, drop ad lines, and skip nodes that become empty after
        # cleaning: XPath normalize-space() does NOT treat U+00A0 as
        # whitespace, so NBSP-only text nodes survive the selector above and
        # would otherwise inject blank lines into the output.
        paragraphs = [
            cleaned
            for txt in raw_texts
            if not self._is_ad_line(txt)
            and (cleaned := txt.replace("\xa0", "").strip())
        ]

        content = "\n".join(paragraphs)
        if not content.strip():
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "biquyuedu"},
        }
@@ -0,0 +1,162 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.dxmwx
|
4
|
+
-----------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
import re
|
9
|
+
from datetime import datetime
|
10
|
+
from typing import Any
|
11
|
+
|
12
|
+
from lxml import html
|
13
|
+
|
14
|
+
from novel_downloader.core.parsers.base import BaseParser
|
15
|
+
from novel_downloader.core.parsers.registry import register_parser
|
16
|
+
from novel_downloader.models import (
|
17
|
+
BookInfoDict,
|
18
|
+
ChapterDict,
|
19
|
+
ChapterInfoDict,
|
20
|
+
VolumeInfoDict,
|
21
|
+
)
|
22
|
+
|
23
|
+
|
24
|
+
@register_parser(
    site_keys=["dxmwx"],
)
class DxmwxParser(BaseParser):
    """
    Parser for 大熊猫文学网 (dxmwx.org) book-info and chapter pages.
    """

    # Matches a plain YYYY-MM-DD date inside the "更新时间" string.
    _RE_DATE = re.compile(r"\d{4}-\d{2}-\d{2}")
    # Collapses runs of ASCII / ideographic spaces and tabs.
    _RE_SPACES = re.compile(r"[ \t\u3000]+")
    # Collapses 2+ consecutive newlines into one.
    _RE_NEWLINES = re.compile(r"\n{2,}")
    # Normalizes arbitrary whitespace runs inside a title.
    _RE_TITLE_WS = re.compile(r"\s+")

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse book metadata from the info page and the chapter catalog page.

        :param html_list: ``[info_page_html, catalog_page_html]``.
        :return: book info dict, or ``None`` when both pages are not provided.
        """
        if len(html_list) < 2:
            return None

        info_tree = html.fromstring(html_list[0])
        catalog_tree = html.fromstring(html_list[1])

        book_name = self._first_str(
            info_tree.xpath("//span[contains(@style,'font-size: 24px')]/text()")
        )
        author = self._first_str(
            info_tree.xpath(
                "//div[contains(@style,'height: 28px') and contains(., '著')]//a/text()"
            )
        )
        tags = [
            t.strip()
            for t in info_tree.xpath("//span[@class='typebut']//a/text()")
            if t.strip()
        ]

        # Only prefix the site origin for a non-empty, site-relative src;
        # otherwise an absent cover would yield the bare origin as the URL
        # (and an already-absolute src would be double-prefixed).
        cover_src = self._first_str(
            info_tree.xpath("//img[@class='imgwidth']/@src")
        )
        if cover_src and not cover_src.startswith("http"):
            cover_url = "https://www.dxmwx.org" + cover_src
        else:
            cover_url = cover_src

        raw_update = self._first_str(
            info_tree.xpath(
                "normalize-space(string(//span[starts-with(normalize-space(.), '更新时间:')]))"  # noqa: E501
            )
        )
        raw_update = raw_update.replace("更新时间:", "").strip()
        update_time = self._normalize_update_date(raw_update)

        # Summary is the first styled description container on the page.
        nodes = info_tree.xpath(
            "//div[contains(@style,'min-height') and "
            "contains(@style,'padding-left') and contains(@style,'padding-right')][1]"
        )
        summary = ""
        if nodes:
            texts = [
                t.replace("\xa0", " ").strip() for t in nodes[0].xpath(".//text()")
            ]
            lines = [t for t in texts if t]
            summary = "\n".join(lines)
            # Strip a leading colon left over from the "简介:" label.
            summary = re.sub(r"^\s*[::]\s*", "", summary)
            summary = self._clean_spaces(summary)

        chapters: list[ChapterInfoDict] = []
        for a in catalog_tree.xpath(
            "//div[contains(@style,'height:40px') and contains(@style,'border-bottom')]//a"  # noqa: E501
        ):
            href = a.get("href") or ""
            title = (a.text_content() or "").strip()
            if not href or not title:
                continue
            # "/read/57215_50197663.html" -> "50197663"
            chap_id = href.split("read/", 1)[-1].split(".html", 1)[0].split("_")[-1]
            chapters.append({"title": title, "url": href, "chapterId": chap_id})
        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "tags": tags,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter page into title + cleaned text content.

        :param html_list: ``[chapter_page_html]``.
        :param chapter_id: identifier of the chapter being parsed.
        :return: chapter dict, or ``None`` when no usable content is found.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        title = self._first_str(tree.xpath("//h1[@id='ChapterTitle']/text()"))
        title = self._RE_TITLE_WS.sub(" ", title).strip()
        if not title:
            # Fall back to a synthetic title so the chapter is still usable.
            title = f"第 {chapter_id} 章"

        paragraphs: list[str] = []
        for p in tree.xpath("//div[@id='Lab_Contents']//p"):
            text = self._clean_spaces(p.text_content())
            if not text:
                continue
            # Skip the site's injected promo lines.
            if "点这里听书" in text or "大熊猫文学" in text:
                continue
            paragraphs.append(text)

        content = "\n".join(paragraphs).strip()
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "dxmwx"},
        }

    @classmethod
    def _clean_spaces(cls, s: str) -> str:
        """Normalize NBSPs, collapse space runs and blank lines, and trim."""
        s = s.replace("\xa0", " ")
        s = cls._RE_SPACES.sub(" ", s)
        s = cls._RE_NEWLINES.sub("\n", s)
        return s.strip()

    @classmethod
    def _normalize_update_date(cls, raw: str) -> str:
        """Return a YYYY-MM-DD string, defaulting to today when unparsable."""
        if not raw:
            return datetime.now().strftime("%Y-%m-%d")
        m = cls._RE_DATE.search(raw)
        if m:
            return m.group(0)
        return datetime.now().strftime("%Y-%m-%d")
@@ -0,0 +1,224 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.eightnovel
|
4
|
+
----------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
import re
|
9
|
+
from typing import Any
|
10
|
+
|
11
|
+
from lxml import html
|
12
|
+
|
13
|
+
from novel_downloader.core.parsers.base import BaseParser
|
14
|
+
from novel_downloader.core.parsers.registry import register_parser
|
15
|
+
from novel_downloader.models import (
|
16
|
+
BookInfoDict,
|
17
|
+
ChapterDict,
|
18
|
+
ChapterInfoDict,
|
19
|
+
VolumeInfoDict,
|
20
|
+
)
|
21
|
+
|
22
|
+
|
23
|
+
@register_parser(
    site_keys=["eightnovel", "8novel"],
)
class EightnovelParser(BaseParser):
    """
    Parser for 无限轻小说 (8novel.com) book-info and chapter pages.
    """

    BASE_URL = "https://www.8novel.com"
    # Captures the string literal passed to JS ``"...".split(",")`` calls,
    # which the site uses to embed chapter ID and title arrays.
    _SPLIT_STR_PATTERN = re.compile(
        r'["\']([^"\']+)["\']\s*\.split\s*\(\s*["\']\s*,\s*["\']\s*\)', re.DOTALL
    )
    _RE_AUTHOR = re.compile(r"作者[::]?\s*")
    _RE_UPDATE = re.compile(r"更新[::]?\s*")

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse a book-info page into structured metadata.

        :param html_list: ``[book_info_page_html]``.
        :return: book info dict, or ``None`` when *html_list* is empty.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        # --- Basic metadata ---
        book_name = self._first_str(tree.xpath("//li[contains(@class,'h2')]/text()"))

        author_raw = self._first_str(
            tree.xpath("//span[contains(@class,'item-info-author')]/text()")
        )
        author = self._RE_AUTHOR.sub("", author_raw)

        cover_url = self.BASE_URL + self._first_str(
            tree.xpath("//div[contains(@class,'item-cover')]//img/@src")
        )

        update_raw = self._first_str(
            tree.xpath("//span[contains(@class,'item-info-date')]/text()")
        )
        update_time = self._RE_UPDATE.sub("", update_raw)

        # Second counter in the stats row is the word count, in 万 (10k) units.
        counts = tree.xpath(
            "//li[@class='small text-gray']//span[contains(@class,'item-info-num')]/text()"  # noqa: E501
        )
        word_count = counts[1].strip() + "萬字" if len(counts) >= 2 else ""

        tags = tree.xpath("//meta[@property='og:novel:category']/@content")

        # --- Summary ---
        summary_nodes = tree.xpath(
            "//li[contains(@class,'full_text') and contains(@class,'mt-2')]"
        )
        if summary_nodes:
            texts = [t.strip() for t in summary_nodes[0].itertext()]
            summary = "\n".join(line for line in texts if line)
        else:
            summary = ""

        # --- Chapters / Volumes ---
        volumes: list[VolumeInfoDict] = []
        for vol_div in tree.xpath("//div[contains(@class,'folder') and @pid]"):
            # Volume title; the part after '/' is a chapter count, drop it.
            h3 = vol_div.xpath(".//div[contains(@class,'vol-title')]//h3")
            vol_name = (
                h3[0].text_content().split("/")[0].strip() if h3 else "Unnamed Volume"
            )

            # Chapters
            chapters: list[ChapterInfoDict] = []
            for a in vol_div.xpath(
                ".//a[contains(@class,'episode_li') and contains(@class,'d-block')]"
            ):
                title = (a.text_content() or "").strip()
                href = a.get("href") or ""
                if not href or not title:
                    continue
                url = href if href.startswith("http") else self.BASE_URL + href
                chapter_id = href.split("?")[-1]  # "/read/3355/?270015" -> "270015"
                chapters.append({"title": title, "url": url, "chapterId": chapter_id})

            volumes.append({"volume_name": vol_name, "chapters": chapters})

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "word_count": word_count,
            "tags": tags,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter into text (and inline ``<img>`` tags for galleries).

        :param html_list: ``[catalog_page_html, chapter_fragment_html]`` —
            the first page embeds the ID/title arrays used to recover the
            chapter title; the second is the raw content fragment.
        :param chapter_id: identifier of the chapter being parsed.
        :return: chapter dict, or ``None`` when no usable content is found.
        """
        if len(html_list) < 2:
            return None

        # Title lookup is best-effort: the JS arrays may be absent/malformed.
        try:
            id_title_map = self._build_id_title_map(html_list[0])
            title = id_title_map.get(chapter_id) or ""
        except Exception:
            title = ""

        # The fragment is not a full document; wrap it to get a single root.
        wrapper = html.fromstring(f"<div>{html_list[1]}</div>")

        segments: list[str] = []

        self._append_segment(segments, wrapper.text)

        for node in wrapper:
            tag = node.tag.lower() if isinstance(node.tag, str) else ""

            # A picture-gallery block
            if tag == "div" and "content-pics" in (node.get("class") or ""):
                for img in node.xpath(".//img"):
                    src = img.get("src")
                    if not src:
                        # Guard against <img> without src (was an
                        # AttributeError on src.startswith below).
                        continue
                    full = src if not src.startswith("/") else self.BASE_URL + src
                    segments.append(f'<img src="{full}" />')
                self._append_segment(segments, node.tail)

            # Standalone <img>
            elif tag == "img":
                src = node.get("src")
                if not src:
                    continue
                full = src if not src.startswith("/") else self.BASE_URL + src
                segments.append(f'<img src="{full}" />')
                self._append_segment(segments, node.tail)

            # Line break -> text in .tail is next paragraph
            elif tag == "br":
                self._append_segment(segments, node.tail)

            # Any other element -> get its text content
            else:
                self._append_segment(segments, node.text_content())
                self._append_segment(segments, node.tail)

        # Remove final ad line if present (starts with an "8" variant).
        if segments and segments[-1] and segments[-1][0] in ("8", "⑧", "⒏"):
            segments.pop()

        content = "\n".join(segments).strip()
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "eightnovel"},
        }

    @staticmethod
    def _append_segment(segments: list[str], text: str | None) -> None:
        """
        Strip *text* and append it to *segments* when non-empty.
        """
        if not text:
            return
        cleaned = text.strip()
        if cleaned:
            segments.append(cleaned)

    @classmethod
    def _build_id_title_map(cls, html_str: str) -> dict[str, str]:
        """
        Extract two comma-split JS lists from *html_str* and zip them:

        * a numeric list of chapter IDs (one element longer than the titles),
        * a list of chapter titles.

        :raises ValueError: when either list is missing or the lengths
            do not satisfy ``len(ids) == len(titles) + 1``.
        """
        id_list = None
        title_list = None

        for content in cls._SPLIT_STR_PATTERN.findall(html_str):
            items = [s.strip() for s in content.split(",")]
            if items == [""]:
                # skip bids=""
                continue
            if all(item.isdigit() for item in items):
                id_list = items
            else:
                title_list = items

            if id_list and title_list:
                break

        if not id_list or not title_list:
            raise ValueError("Could not locate both ID and title lists")
        if len(id_list) != len(title_list) + 1:
            raise ValueError(
                "ID list must be exactly one element longer than title list"
            )

        # The trailing ID is a sentinel; pair the rest with the titles.
        return dict(zip(id_list[:-1], title_list, strict=False))