novel-downloader 1.5.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/__init__.py +1 -3
- novel_downloader/cli/clean.py +21 -88
- novel_downloader/cli/config.py +26 -21
- novel_downloader/cli/download.py +77 -64
- novel_downloader/cli/export.py +16 -20
- novel_downloader/cli/main.py +1 -1
- novel_downloader/cli/search.py +62 -65
- novel_downloader/cli/ui.py +156 -0
- novel_downloader/config/__init__.py +8 -5
- novel_downloader/config/adapter.py +65 -105
- novel_downloader/config/{loader.py → file_io.py} +53 -26
- novel_downloader/core/__init__.py +1 -0
- novel_downloader/core/archived/deqixs/fetcher.py +115 -0
- novel_downloader/core/archived/deqixs/parser.py +132 -0
- novel_downloader/core/archived/deqixs/searcher.py +89 -0
- novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
- novel_downloader/core/archived/wanbengo/searcher.py +98 -0
- novel_downloader/core/archived/xshbook/searcher.py +93 -0
- novel_downloader/core/downloaders/__init__.py +3 -24
- novel_downloader/core/downloaders/base.py +49 -23
- novel_downloader/core/downloaders/common.py +191 -137
- novel_downloader/core/downloaders/qianbi.py +187 -146
- novel_downloader/core/downloaders/qidian.py +187 -141
- novel_downloader/core/downloaders/registry.py +4 -2
- novel_downloader/core/downloaders/signals.py +46 -0
- novel_downloader/core/exporters/__init__.py +3 -20
- novel_downloader/core/exporters/base.py +33 -37
- novel_downloader/core/exporters/common/__init__.py +1 -2
- novel_downloader/core/exporters/common/epub.py +15 -10
- novel_downloader/core/exporters/common/main_exporter.py +19 -12
- novel_downloader/core/exporters/common/txt.py +14 -9
- novel_downloader/core/exporters/epub_util.py +59 -29
- novel_downloader/core/exporters/linovelib/__init__.py +1 -0
- novel_downloader/core/exporters/linovelib/epub.py +23 -25
- novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
- novel_downloader/core/exporters/linovelib/txt.py +17 -11
- novel_downloader/core/exporters/qidian.py +2 -8
- novel_downloader/core/exporters/registry.py +4 -2
- novel_downloader/core/exporters/txt_util.py +7 -7
- novel_downloader/core/fetchers/__init__.py +54 -48
- novel_downloader/core/fetchers/aaatxt.py +83 -0
- novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
- novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
- novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
- novel_downloader/core/fetchers/dxmwx.py +110 -0
- novel_downloader/core/fetchers/eightnovel.py +139 -0
- novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
- novel_downloader/core/fetchers/guidaye.py +85 -0
- novel_downloader/core/fetchers/hetushu.py +92 -0
- novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
- novel_downloader/core/fetchers/ixdzs8.py +113 -0
- novel_downloader/core/fetchers/jpxs123.py +101 -0
- novel_downloader/core/fetchers/lewenn.py +83 -0
- novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
- novel_downloader/core/fetchers/piaotia.py +105 -0
- novel_downloader/core/fetchers/qbtr.py +101 -0
- novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
- novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +46 -39
- novel_downloader/core/fetchers/quanben5.py +92 -0
- novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
- novel_downloader/core/fetchers/registry.py +5 -16
- novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
- novel_downloader/core/fetchers/shencou.py +106 -0
- novel_downloader/core/fetchers/shuhaige.py +84 -0
- novel_downloader/core/fetchers/tongrenquan.py +84 -0
- novel_downloader/core/fetchers/ttkan.py +95 -0
- novel_downloader/core/fetchers/wanbengo.py +83 -0
- novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
- novel_downloader/core/fetchers/xiguashuwu.py +177 -0
- novel_downloader/core/fetchers/xs63b.py +171 -0
- novel_downloader/core/fetchers/xshbook.py +85 -0
- novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
- novel_downloader/core/fetchers/yibige.py +114 -0
- novel_downloader/core/interfaces/__init__.py +1 -9
- novel_downloader/core/interfaces/downloader.py +6 -2
- novel_downloader/core/interfaces/exporter.py +7 -7
- novel_downloader/core/interfaces/fetcher.py +4 -17
- novel_downloader/core/interfaces/parser.py +5 -6
- novel_downloader/core/interfaces/searcher.py +9 -1
- novel_downloader/core/parsers/__init__.py +49 -12
- novel_downloader/core/parsers/aaatxt.py +132 -0
- novel_downloader/core/parsers/b520.py +116 -0
- novel_downloader/core/parsers/base.py +63 -12
- novel_downloader/core/parsers/biquyuedu.py +133 -0
- novel_downloader/core/parsers/dxmwx.py +162 -0
- novel_downloader/core/parsers/eightnovel.py +224 -0
- novel_downloader/core/parsers/esjzone.py +61 -66
- novel_downloader/core/parsers/guidaye.py +128 -0
- novel_downloader/core/parsers/hetushu.py +139 -0
- novel_downloader/core/parsers/i25zw.py +137 -0
- novel_downloader/core/parsers/ixdzs8.py +186 -0
- novel_downloader/core/parsers/jpxs123.py +137 -0
- novel_downloader/core/parsers/lewenn.py +142 -0
- novel_downloader/core/parsers/linovelib.py +48 -64
- novel_downloader/core/parsers/piaotia.py +189 -0
- novel_downloader/core/parsers/qbtr.py +136 -0
- novel_downloader/core/parsers/qianbi.py +48 -50
- novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +272 -330
- novel_downloader/core/parsers/qidian/chapter_normal.py +24 -55
- novel_downloader/core/parsers/qidian/main_parser.py +11 -38
- novel_downloader/core/parsers/qidian/utils/__init__.py +1 -0
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
- novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
- novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
- novel_downloader/core/parsers/quanben5.py +103 -0
- novel_downloader/core/parsers/registry.py +5 -16
- novel_downloader/core/parsers/sfacg.py +38 -45
- novel_downloader/core/parsers/shencou.py +215 -0
- novel_downloader/core/parsers/shuhaige.py +111 -0
- novel_downloader/core/parsers/tongrenquan.py +116 -0
- novel_downloader/core/parsers/ttkan.py +132 -0
- novel_downloader/core/parsers/wanbengo.py +191 -0
- novel_downloader/core/parsers/xiaoshuowu.py +173 -0
- novel_downloader/core/parsers/xiguashuwu.py +435 -0
- novel_downloader/core/parsers/xs63b.py +161 -0
- novel_downloader/core/parsers/xshbook.py +134 -0
- novel_downloader/core/parsers/yamibo.py +87 -131
- novel_downloader/core/parsers/yibige.py +166 -0
- novel_downloader/core/searchers/__init__.py +34 -3
- novel_downloader/core/searchers/aaatxt.py +107 -0
- novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
- novel_downloader/core/searchers/base.py +112 -36
- novel_downloader/core/searchers/dxmwx.py +105 -0
- novel_downloader/core/searchers/eightnovel.py +84 -0
- novel_downloader/core/searchers/esjzone.py +43 -25
- novel_downloader/core/searchers/hetushu.py +92 -0
- novel_downloader/core/searchers/i25zw.py +93 -0
- novel_downloader/core/searchers/ixdzs8.py +107 -0
- novel_downloader/core/searchers/jpxs123.py +107 -0
- novel_downloader/core/searchers/piaotia.py +100 -0
- novel_downloader/core/searchers/qbtr.py +106 -0
- novel_downloader/core/searchers/qianbi.py +74 -40
- novel_downloader/core/searchers/quanben5.py +144 -0
- novel_downloader/core/searchers/registry.py +24 -8
- novel_downloader/core/searchers/shuhaige.py +124 -0
- novel_downloader/core/searchers/tongrenquan.py +110 -0
- novel_downloader/core/searchers/ttkan.py +92 -0
- novel_downloader/core/searchers/xiaoshuowu.py +122 -0
- novel_downloader/core/searchers/xiguashuwu.py +95 -0
- novel_downloader/core/searchers/xs63b.py +104 -0
- novel_downloader/locales/en.json +31 -82
- novel_downloader/locales/zh.json +32 -83
- novel_downloader/models/__init__.py +21 -22
- novel_downloader/models/book.py +44 -0
- novel_downloader/models/config.py +4 -37
- novel_downloader/models/login.py +1 -1
- novel_downloader/models/search.py +5 -0
- novel_downloader/resources/config/settings.toml +8 -70
- novel_downloader/resources/json/xiguashuwu.json +718 -0
- novel_downloader/utils/__init__.py +13 -22
- novel_downloader/utils/chapter_storage.py +3 -2
- novel_downloader/utils/constants.py +4 -29
- novel_downloader/utils/cookies.py +6 -18
- novel_downloader/utils/crypto_utils/__init__.py +13 -0
- novel_downloader/utils/crypto_utils/aes_util.py +90 -0
- novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
- novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
- novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
- novel_downloader/utils/epub/__init__.py +1 -1
- novel_downloader/utils/epub/constants.py +57 -16
- novel_downloader/utils/epub/documents.py +88 -194
- novel_downloader/utils/epub/models.py +0 -14
- novel_downloader/utils/epub/utils.py +63 -96
- novel_downloader/utils/file_utils/__init__.py +2 -23
- novel_downloader/utils/file_utils/io.py +3 -113
- novel_downloader/utils/file_utils/sanitize.py +0 -4
- novel_downloader/utils/fontocr.py +207 -0
- novel_downloader/utils/logger.py +8 -16
- novel_downloader/utils/network.py +2 -2
- novel_downloader/utils/state.py +4 -90
- novel_downloader/utils/text_utils/__init__.py +1 -7
- novel_downloader/utils/text_utils/diff_display.py +5 -7
- novel_downloader/utils/time_utils/__init__.py +5 -11
- novel_downloader/utils/time_utils/datetime_utils.py +20 -29
- novel_downloader/utils/time_utils/sleep_utils.py +4 -8
- novel_downloader/web/__init__.py +13 -0
- novel_downloader/web/components/__init__.py +11 -0
- novel_downloader/web/components/navigation.py +35 -0
- novel_downloader/web/main.py +66 -0
- novel_downloader/web/pages/__init__.py +17 -0
- novel_downloader/web/pages/download.py +78 -0
- novel_downloader/web/pages/progress.py +147 -0
- novel_downloader/web/pages/search.py +329 -0
- novel_downloader/web/services/__init__.py +17 -0
- novel_downloader/web/services/client_dialog.py +164 -0
- novel_downloader/web/services/cred_broker.py +113 -0
- novel_downloader/web/services/cred_models.py +35 -0
- novel_downloader/web/services/task_manager.py +264 -0
- novel_downloader-2.0.0.dist-info/METADATA +171 -0
- novel_downloader-2.0.0.dist-info/RECORD +210 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
- novel_downloader/core/downloaders/biquge.py +0 -29
- novel_downloader/core/downloaders/esjzone.py +0 -29
- novel_downloader/core/downloaders/linovelib.py +0 -29
- novel_downloader/core/downloaders/sfacg.py +0 -29
- novel_downloader/core/downloaders/yamibo.py +0 -29
- novel_downloader/core/exporters/biquge.py +0 -22
- novel_downloader/core/exporters/esjzone.py +0 -22
- novel_downloader/core/exporters/qianbi.py +0 -22
- novel_downloader/core/exporters/sfacg.py +0 -22
- novel_downloader/core/exporters/yamibo.py +0 -22
- novel_downloader/core/fetchers/base/__init__.py +0 -14
- novel_downloader/core/fetchers/base/browser.py +0 -422
- novel_downloader/core/fetchers/biquge/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/browser.py +0 -209
- novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
- novel_downloader/core/fetchers/linovelib/browser.py +0 -198
- novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/browser.py +0 -326
- novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
- novel_downloader/core/fetchers/sfacg/browser.py +0 -194
- novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
- novel_downloader/core/fetchers/yamibo/browser.py +0 -234
- novel_downloader/core/parsers/biquge.py +0 -139
- novel_downloader/models/chapter.py +0 -25
- novel_downloader/models/types.py +0 -13
- novel_downloader/tui/__init__.py +0 -7
- novel_downloader/tui/app.py +0 -32
- novel_downloader/tui/main.py +0 -17
- novel_downloader/tui/screens/__init__.py +0 -14
- novel_downloader/tui/screens/home.py +0 -198
- novel_downloader/tui/screens/login.py +0 -74
- novel_downloader/tui/styles/home_layout.tcss +0 -79
- novel_downloader/tui/widgets/richlog_handler.py +0 -24
- novel_downloader/utils/cache.py +0 -24
- novel_downloader/utils/fontocr/__init__.py +0 -22
- novel_downloader/utils/fontocr/hash_store.py +0 -280
- novel_downloader/utils/fontocr/hash_utils.py +0 -103
- novel_downloader/utils/fontocr/model_loader.py +0 -69
- novel_downloader/utils/fontocr/ocr_v1.py +0 -315
- novel_downloader/utils/fontocr/ocr_v2.py +0 -764
- novel_downloader/utils/fontocr/ocr_v3.py +0 -744
- novel_downloader-1.5.0.dist-info/METADATA +0 -196
- novel_downloader-1.5.0.dist-info/RECORD +0 -164
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
novel_downloader/core/parsers/dxmwx.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.dxmwx
+-----------------------------------
+
+"""
+
+import re
+from datetime import datetime
+from typing import Any
+
+from lxml import html
+
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.core.parsers.registry import register_parser
+from novel_downloader.models import (
+    BookInfoDict,
+    ChapterDict,
+    ChapterInfoDict,
+    VolumeInfoDict,
+)
+
+
+@register_parser(
+    site_keys=["dxmwx"],
+)
+class DxmwxParser(BaseParser):
+    """
+    Parser for 大熊猫文学网 book pages.
+    """
+
+    _RE_DATE = re.compile(r"\d{4}-\d{2}-\d{2}")
+    _RE_SPACES = re.compile(r"[ \t\u3000]+")
+    _RE_NEWLINES = re.compile(r"\n{2,}")
+    _RE_TITLE_WS = re.compile(r"\s+")
+
+    def parse_book_info(
+        self,
+        html_list: list[str],
+        **kwargs: Any,
+    ) -> BookInfoDict | None:
+        if len(html_list) < 2:
+            return None
+
+        info_tree = html.fromstring(html_list[0])
+        catalog_tree = html.fromstring(html_list[1])
+
+        book_name = self._first_str(
+            info_tree.xpath("//span[contains(@style,'font-size: 24px')]/text()")
+        )
+        author = self._first_str(
+            info_tree.xpath(
+                "//div[contains(@style,'height: 28px') and contains(., '著')]//a/text()"
+            )
+        )
+        tags = [
+            t.strip()
+            for t in info_tree.xpath("//span[@class='typebut']//a/text()")
+            if t.strip()
+        ]
+        cover_url = "https://www.dxmwx.org" + self._first_str(
+            info_tree.xpath("//img[@class='imgwidth']/@src")
+        )
+
+        raw_update = self._first_str(
+            info_tree.xpath(
+                "normalize-space(string(//span[starts-with(normalize-space(.), '更新时间:')]))"  # noqa: E501
+            )
+        )
+        raw_update = raw_update.replace("更新时间:", "").strip()
+        update_time = self._normalize_update_date(raw_update)
+
+        nodes = info_tree.xpath(
+            "//div[contains(@style,'min-height') and "
+            "contains(@style,'padding-left') and contains(@style,'padding-right')][1]"
+        )
+        summary = ""
+        if nodes:
+            texts = [
+                t.replace("\xa0", " ").strip() for t in nodes[0].xpath(".//text()")
+            ]
+            lines = [t for t in texts if t]
+            summary = "\n".join(lines)
+            summary = re.sub(r"^\s*[::]\s*", "", summary)
+            summary = self._clean_spaces(summary)
+
+        chapters: list[ChapterInfoDict] = []
+        for a in catalog_tree.xpath(
+            "//div[contains(@style,'height:40px') and contains(@style,'border-bottom')]//a"  # noqa: E501
+        ):
+            href = a.get("href") or ""
+            title = (a.text_content() or "").strip()
+            if not href or not title:
+                continue
+            # "/read/57215_50197663.html" -> "50197663"
+            chap_id = href.split("read/", 1)[-1].split(".html", 1)[0].split("_")[-1]
+            chapters.append({"title": title, "url": href, "chapterId": chap_id})
+        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]
+
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "update_time": update_time,
+            "tags": tags,
+            "summary": summary,
+            "volumes": volumes,
+            "extra": {},
+        }
+
+    def parse_chapter(
+        self,
+        html_list: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        if not html_list:
+            return None
+
+        tree = html.fromstring(html_list[0])
+
+        title = self._first_str(tree.xpath("//h1[@id='ChapterTitle']/text()"))
+        title = self._RE_TITLE_WS.sub(" ", title).strip()
+        if not title:
+            title = f"第 {chapter_id} 章"
+
+        paragraphs: list[str] = []
+        for p in tree.xpath("//div[@id='Lab_Contents']//p"):
+            text = self._clean_spaces(p.text_content())
+            if not text:
+                continue
+            if "点这里听书" in text or "大熊猫文学" in text:
+                continue
+            paragraphs.append(text)
+
+        content = "\n".join(paragraphs).strip()
+        if not content:
+            return None
+
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {"site": "dxmwx"},
+        }
+
+    @classmethod
+    def _clean_spaces(cls, s: str) -> str:
+        s = s.replace("\xa0", " ")
+        s = cls._RE_SPACES.sub(" ", s)
+        s = cls._RE_NEWLINES.sub("\n", s)
+        return s.strip()
+
+    @classmethod
+    def _normalize_update_date(cls, raw: str) -> str:
+        """Return a YYYY-MM-DD string."""
+        if not raw:
+            return datetime.now().strftime("%Y-%m-%d")
+        m = cls._RE_DATE.search(raw)
+        if m:
+            return m.group(0)
+        return datetime.now().strftime("%Y-%m-%d")
novel_downloader/core/parsers/eightnovel.py
@@ -0,0 +1,224 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.eightnovel
+----------------------------------------
+
+"""
+
+import re
+from typing import Any
+
+from lxml import html
+
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.core.parsers.registry import register_parser
+from novel_downloader.models import (
+    BookInfoDict,
+    ChapterDict,
+    ChapterInfoDict,
+    VolumeInfoDict,
+)
+
+
+@register_parser(
+    site_keys=["eightnovel", "8novel"],
+)
+class EightnovelParser(BaseParser):
+    """
+    Parser for 无限轻小说 book pages.
+    """
+
+    BASE_URL = "https://www.8novel.com"
+    _SPLIT_STR_PATTERN = re.compile(
+        r'["\']([^"\']+)["\']\s*\.split\s*\(\s*["\']\s*,\s*["\']\s*\)', re.DOTALL
+    )
+    _RE_AUTHOR = re.compile(r"作者[::]?\s*")
+    _RE_UPDATE = re.compile(r"更新[::]?\s*")
+
+    def parse_book_info(
+        self,
+        html_list: list[str],
+        **kwargs: Any,
+    ) -> BookInfoDict | None:
+        if not html_list:
+            return None
+
+        tree = html.fromstring(html_list[0])
+
+        # --- Basic metadata ---
+        book_name = self._first_str(tree.xpath("//li[contains(@class,'h2')]/text()"))
+
+        author_raw = self._first_str(
+            tree.xpath("//span[contains(@class,'item-info-author')]/text()")
+        )
+        author = self._RE_AUTHOR.sub("", author_raw)
+
+        cover_url = self.BASE_URL + self._first_str(
+            tree.xpath("//div[contains(@class,'item-cover')]//img/@src")
+        )
+
+        update_raw = self._first_str(
+            tree.xpath("//span[contains(@class,'item-info-date')]/text()")
+        )
+        update_time = self._RE_UPDATE.sub("", update_raw)
+
+        counts = tree.xpath(
+            "//li[@class='small text-gray']//span[contains(@class,'item-info-num')]/text()"  # noqa: E501
+        )
+        word_count = counts[1].strip() + "萬字" if len(counts) >= 2 else ""
+
+        tags = tree.xpath("//meta[@property='og:novel:category']/@content")
+
+        # --- Summary ---
+        summary_nodes = tree.xpath(
+            "//li[contains(@class,'full_text') and contains(@class,'mt-2')]"
+        )
+        if summary_nodes:
+            texts = [t.strip() for t in summary_nodes[0].itertext()]
+            summary = "\n".join(line for line in texts if line)
+        else:
+            summary = ""
+
+        # --- Chapters / Volumes ---
+        volumes: list[VolumeInfoDict] = []
+        for vol_div in tree.xpath("//div[contains(@class,'folder') and @pid]"):
+            # Volume title
+            h3 = vol_div.xpath(".//div[contains(@class,'vol-title')]//h3")
+            vol_name = (
+                h3[0].text_content().split("/")[0].strip() if h3 else "Unnamed Volume"
+            )
+
+            # Chapters
+            chapters: list[ChapterInfoDict] = []
+            for a in vol_div.xpath(
+                ".//a[contains(@class,'episode_li') and contains(@class,'d-block')]"
+            ):
+                title = (a.text_content() or "").strip()
+                href = a.get("href") or ""
+                if not href or not title:
+                    continue
+                url = href if href.startswith("http") else self.BASE_URL + href
+                chapter_id = href.split("?")[-1]  # "/read/3355/?270015" -> "270015"
+                chapters.append({"title": title, "url": url, "chapterId": chapter_id})
+
+            volumes.append({"volume_name": vol_name, "chapters": chapters})
+
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "update_time": update_time,
+            "word_count": word_count,
+            "tags": tags,
+            "summary": summary,
+            "volumes": volumes,
+            "extra": {},
+        }
+
+    def parse_chapter(
+        self,
+        html_list: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        if len(html_list) < 2:
+            return None
+
+        try:
+            id_title_map = self._build_id_title_map(html_list[0])
+            title = id_title_map.get(chapter_id) or ""
+        except Exception:
+            title = ""
+
+        wrapper = html.fromstring(f"<div>{html_list[1]}</div>")
+
+        segments: list[str] = []
+
+        self._append_segment(segments, wrapper.text)
+
+        for node in wrapper:
+            tag = node.tag.lower() if isinstance(node.tag, str) else ""
+
+            # A picture-gallery block
+            if tag == "div" and "content-pics" in (node.get("class") or ""):
+                for img in node.xpath(".//img"):
+                    src = img.get("src")
+                    full = src if not src.startswith("/") else self.BASE_URL + src
+                    segments.append(f'<img src="{full}" />')
+                self._append_segment(segments, node.tail)
+
+            # Standalone <img>
+            elif tag == "img":
+                src = node.get("src")
+                if not src:
+                    continue
+                full = src if not src.startswith("/") else self.BASE_URL + src
+                segments.append(f'<img src="{full}" />')
+                self._append_segment(segments, node.tail)
+
+            # Line break -> text in .tail is next paragraph
+            elif tag == "br":
+                self._append_segment(segments, node.tail)
+
+            # Any other element -> get its text content
+            else:
+                self._append_segment(segments, node.text_content())
+                self._append_segment(segments, node.tail)
+
+        # Remove final ad line if present
+        if segments and segments[-1] and segments[-1][0] in ("8", "⑧", "⒏"):
+            segments.pop()
+
+        content = "\n".join(segments).strip()
+        if not content.strip():
+            return None
+
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {"site": "eightnovel"},
+        }
+
+    @staticmethod
+    def _append_segment(segments: list[str], text: str | None) -> None:
+        """
+        Strip, filter out the '8novel' ad, and append non-empty text to segments.
+        """
+        if not text:
+            return
+        cleaned = text.strip()
+        if cleaned:
+            segments.append(cleaned)
+
+    @classmethod
+    def _build_id_title_map(cls, html_str: str) -> dict[str, str]:
+        """
+        Extracts two comma-split lists from html_str:
+          - A numeric list of IDs (one element longer)
+          - A list of titles
+        """
+        id_list = None
+        title_list = None
+
+        for content in cls._SPLIT_STR_PATTERN.findall(html_str):
+            items = [s.strip() for s in content.split(",")]
+            if items == [""]:
+                # skip bids=""
+                continue
+            if all(item.isdigit() for item in items):
+                id_list = items
+            else:
+                title_list = items
+
+            if id_list and title_list:
+                break
+
+        if not id_list or not title_list:
+            raise ValueError("Could not locate both ID and title lists")
+        if len(id_list) != len(title_list) + 1:
+            raise ValueError(
+                "ID list must be exactly one element longer than title list"
+            )
+
+        return dict(zip(id_list[:-1], title_list, strict=False))
novel_downloader/core/parsers/esjzone.py
@@ -12,26 +12,20 @@ from lxml import html
 
 from novel_downloader.core.parsers.base import BaseParser
 from novel_downloader.core.parsers.registry import register_parser
-from novel_downloader.models import
+from novel_downloader.models import (
+    BookInfoDict,
+    ChapterDict,
+    VolumeInfoDict,
+)
 
 
 @register_parser(
     site_keys=["esjzone"],
-    backends=["session", "browser"],
 )
 class EsjzoneParser(BaseParser):
-    """
-
-
-    _BOOK_NAME_XPATH = '//h2[contains(@class, "text-normal")]/text()'
-    _AUTHOR_XPATH = '//li[strong[text()="作者:"]]/a/text()'
-    _COVER_URL_XPATH = '//div[contains(@class,"product-gallery")]//img/@src'
-    _UPDATE_TIME_XPATH = '//li[strong[text()="更新日期:"]]/text()'
-    _WORD_COUNT_XPATH = '//span[@id="txt"]/text()'
-    _TYPE_XPATH = '//li[strong[text()="類型:"]]/text()'
-    _ALT_NAME_XPATH = '//li[strong[text()="其他書名:"]]/text()'
-    _WEB_URL_XPATH = '//li[strong[text()="Web生肉:"]]/a/@href'
-    _SUMMARY_XPATH = '//div[@class="description"]/p//text()'
+    """
+    Parser for esjzone book pages.
+    """
 
     # Chapter XPaths
     _CHAPTER_TEXT_XPATH = 'string(//div[contains(@class, "forum-content")])'
@@ -40,14 +34,13 @@ class EsjzoneParser(BaseParser):
         '//i[contains(@class, "icon-clock")]/following-sibling::text()',
         '//i[contains(@class, "icon-pen-tool")]/following-sibling::text()',
     ]
-
    _CHECK_FORUM_XPATH = '//div[@class="page-title"]//ul[@class="breadcrumbs"]/li[not(@class="slash")]//text()'  # noqa: E501
 
     def parse_book_info(
         self,
         html_list: list[str],
         **kwargs: Any,
-    ) ->
+    ) -> BookInfoDict | None:
         """
         Parse a book info page and extract metadata and chapter structure.
 
@@ -58,27 +51,40 @@ class EsjzoneParser(BaseParser):
         :return: Parsed metadata and chapter structure as a dictionary.
         """
         if not html_list or self._is_forum_page(html_list):
-            return
+            return None
+
         tree = html.fromstring(html_list[0])
-
-
-
-
-        result["cover_url"] = self._get_text(tree, self._COVER_URL_XPATH)
-        result["update_time"] = self._get_text(tree, self._UPDATE_TIME_XPATH)
-        result["word_count"] = self._get_text(
-            tree, self._WORD_COUNT_XPATH, clean_comma=True
+
+        # --- Basic metadata ---
+        book_name = self._first_str(
+            tree.xpath('//h2[contains(@class,"text-normal")]/text()')
         )
-
-
-
-
+        author = self._first_str(tree.xpath('//li[strong[text()="作者:"]]/a/text()'))
+        cover_url = self._first_str(
+            tree.xpath('//div[contains(@class,"product-gallery")]//img/@src')
+        )
+        update_time = self._first_str(
+            tree.xpath('//li[strong[text()="更新日期:"]]/text()')
+        )  # noqa: E501
+        word_count = self._first_str(
+            tree.xpath('//span[@id="txt"]/text()'), replaces=[(",", "")]
+        )
+        book_type = self._first_str(tree.xpath('//li[strong[text()="類型:"]]/text()'))
+        alt_name = self._first_str(
+            tree.xpath('//li[strong[text()="其他書名:"]]/text()')
+        )  # noqa: E501
+        web_url = self._first_str(tree.xpath('//li[strong[text()="Web生肉:"]]/a/@href'))
+
+        # Summary paragraphs
         paras = tree.xpath('//div[@class="description"]/p')
         texts = [p.xpath("string()").strip() for p in paras]
-
+        summary = "\n".join(t for t in texts if t)
 
-
-
+        current_vol: VolumeInfoDict = {
+            "volume_name": "單卷",
+            "chapters": [],
+        }
+        volumes: list[VolumeInfoDict] = [current_vol]
 
         def _is_garbage_title(name: str) -> bool:
             stripped = name.strip()
@@ -89,25 +95,18 @@ class EsjzoneParser(BaseParser):
             if _is_garbage_title(name):
                 return
             name = name.strip() or "未命名卷"
-            if
+            if current_vol and current_vol["volume_name"] == name:
                 return
             current_vol = {"volume_name": name, "chapters": []}
             volumes.append(current_vol)
 
-        _start_volume("單卷")
-
-        # nodes = tree.xpath('//div[@id="chapterList"]/details') + tree.xpath(
-        #     '//div[@id="chapterList"]/*[not(self::details)]'
-        # )
         nodes = tree.xpath('//div[@id="chapterList"]/*')
-
        for node in nodes:
             tag = node.tag.lower()
 
             if tag == "details":
                 # ---- DETAILS-based layout ----
-
-                vol_name = summary.text if summary is not None else "未命名卷"
+                vol_name = node.xpath("string(./summary)").strip() or "未命名卷"
                 _start_volume(vol_name)
 
                 # all chapters inside this details
@@ -116,7 +115,11 @@ class EsjzoneParser(BaseParser):
                     href = a.get("href", "")
                     chap_id = href.rstrip("/").split("/")[-1].split(".", 1)[0]
                     current_vol["chapters"].append(
-                        {
+                        {
+                            "title": title,
+                            "url": href,
+                            "chapterId": chap_id,
+                        }
                     )
 
             elif (
@@ -139,9 +142,21 @@ class EsjzoneParser(BaseParser):
                     {"title": title, "url": href, "chapterId": chap_id}
                 )
         volumes = [vol for vol in volumes if vol["chapters"]]
-        result["volumes"] = volumes
 
-        return
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "update_time": update_time,
+            "summary": summary,
+            "tags": [book_type],
+            "word_count": word_count,
+            "volumes": volumes,
+            "extra": {
+                "alt_name": alt_name,
+                "web_url": web_url,
+            },
+        }
 
     def parse_chapter(
         self,
@@ -149,16 +164,9 @@ class EsjzoneParser(BaseParser):
         chapter_id: str,
         **kwargs: Any,
     ) -> ChapterDict | None:
-        """
-        Parse a single chapter page and extract clean text or simplified HTML.
-
-        :param html_list: Raw HTML of the chapter page.
-        :param chapter_id: Identifier of the chapter being parsed.
-        :return: Cleaned chapter content as plain text or minimal HTML.
-        """
         if not html_list or self._is_forum_page(html_list):
             return None
-        tree = html.fromstring(html_list[0]
+        tree = html.fromstring(html_list[0])
 
         content_lines: list[str] = []
         content_nodes = tree.xpath(self._CHAPTER_CONTENT_NODES_XPATH)
@@ -178,7 +186,7 @@ class EsjzoneParser(BaseParser):
             content_lines.append(f'<img src="{src}" />')
 
         content = (
-            "\n
+            "\n".join(content_lines).strip()
             if content_lines
             else tree.xpath(self._CHAPTER_TEXT_XPATH).strip()
         )
@@ -216,16 +224,3 @@ class EsjzoneParser(BaseParser):
         breadcrumb: list[str] = tree.xpath(self._CHECK_FORUM_XPATH)
         breadcrumb = [s.strip() for s in breadcrumb if s.strip()]
         return breadcrumb == ["Home", "論壇"]
-
-    @staticmethod
-    def _get_text(
-        tree: html.HtmlElement,
-        xpath: str,
-        join: bool = False,
-        clean_comma: bool = False,
-    ) -> str:
-        data = tree.xpath(xpath)
-        if not data:
-            return ""
-        text = "\n".join(data) if join else data[0].strip()
-        return text.replace(",", "") if clean_comma else text