novel-downloader 1.5.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/__init__.py +1 -3
- novel_downloader/cli/clean.py +21 -88
- novel_downloader/cli/config.py +26 -21
- novel_downloader/cli/download.py +77 -64
- novel_downloader/cli/export.py +16 -20
- novel_downloader/cli/main.py +1 -1
- novel_downloader/cli/search.py +62 -65
- novel_downloader/cli/ui.py +156 -0
- novel_downloader/config/__init__.py +8 -5
- novel_downloader/config/adapter.py +65 -105
- novel_downloader/config/{loader.py → file_io.py} +53 -26
- novel_downloader/core/__init__.py +1 -0
- novel_downloader/core/archived/deqixs/fetcher.py +115 -0
- novel_downloader/core/archived/deqixs/parser.py +132 -0
- novel_downloader/core/archived/deqixs/searcher.py +89 -0
- novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
- novel_downloader/core/archived/wanbengo/searcher.py +98 -0
- novel_downloader/core/archived/xshbook/searcher.py +93 -0
- novel_downloader/core/downloaders/__init__.py +3 -24
- novel_downloader/core/downloaders/base.py +49 -23
- novel_downloader/core/downloaders/common.py +191 -137
- novel_downloader/core/downloaders/qianbi.py +187 -146
- novel_downloader/core/downloaders/qidian.py +187 -141
- novel_downloader/core/downloaders/registry.py +4 -2
- novel_downloader/core/downloaders/signals.py +46 -0
- novel_downloader/core/exporters/__init__.py +3 -20
- novel_downloader/core/exporters/base.py +33 -37
- novel_downloader/core/exporters/common/__init__.py +1 -2
- novel_downloader/core/exporters/common/epub.py +15 -10
- novel_downloader/core/exporters/common/main_exporter.py +19 -12
- novel_downloader/core/exporters/common/txt.py +14 -9
- novel_downloader/core/exporters/epub_util.py +59 -29
- novel_downloader/core/exporters/linovelib/__init__.py +1 -0
- novel_downloader/core/exporters/linovelib/epub.py +23 -25
- novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
- novel_downloader/core/exporters/linovelib/txt.py +17 -11
- novel_downloader/core/exporters/qidian.py +2 -8
- novel_downloader/core/exporters/registry.py +4 -2
- novel_downloader/core/exporters/txt_util.py +7 -7
- novel_downloader/core/fetchers/__init__.py +54 -48
- novel_downloader/core/fetchers/aaatxt.py +83 -0
- novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
- novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
- novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
- novel_downloader/core/fetchers/dxmwx.py +110 -0
- novel_downloader/core/fetchers/eightnovel.py +139 -0
- novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
- novel_downloader/core/fetchers/guidaye.py +85 -0
- novel_downloader/core/fetchers/hetushu.py +92 -0
- novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
- novel_downloader/core/fetchers/ixdzs8.py +113 -0
- novel_downloader/core/fetchers/jpxs123.py +101 -0
- novel_downloader/core/fetchers/lewenn.py +83 -0
- novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
- novel_downloader/core/fetchers/piaotia.py +105 -0
- novel_downloader/core/fetchers/qbtr.py +101 -0
- novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
- novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +46 -39
- novel_downloader/core/fetchers/quanben5.py +92 -0
- novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
- novel_downloader/core/fetchers/registry.py +5 -16
- novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
- novel_downloader/core/fetchers/shencou.py +106 -0
- novel_downloader/core/fetchers/shuhaige.py +84 -0
- novel_downloader/core/fetchers/tongrenquan.py +84 -0
- novel_downloader/core/fetchers/ttkan.py +95 -0
- novel_downloader/core/fetchers/wanbengo.py +83 -0
- novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
- novel_downloader/core/fetchers/xiguashuwu.py +177 -0
- novel_downloader/core/fetchers/xs63b.py +171 -0
- novel_downloader/core/fetchers/xshbook.py +85 -0
- novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
- novel_downloader/core/fetchers/yibige.py +114 -0
- novel_downloader/core/interfaces/__init__.py +1 -9
- novel_downloader/core/interfaces/downloader.py +6 -2
- novel_downloader/core/interfaces/exporter.py +7 -7
- novel_downloader/core/interfaces/fetcher.py +4 -17
- novel_downloader/core/interfaces/parser.py +5 -6
- novel_downloader/core/interfaces/searcher.py +9 -1
- novel_downloader/core/parsers/__init__.py +49 -12
- novel_downloader/core/parsers/aaatxt.py +132 -0
- novel_downloader/core/parsers/b520.py +116 -0
- novel_downloader/core/parsers/base.py +63 -12
- novel_downloader/core/parsers/biquyuedu.py +133 -0
- novel_downloader/core/parsers/dxmwx.py +162 -0
- novel_downloader/core/parsers/eightnovel.py +224 -0
- novel_downloader/core/parsers/esjzone.py +61 -66
- novel_downloader/core/parsers/guidaye.py +128 -0
- novel_downloader/core/parsers/hetushu.py +139 -0
- novel_downloader/core/parsers/i25zw.py +137 -0
- novel_downloader/core/parsers/ixdzs8.py +186 -0
- novel_downloader/core/parsers/jpxs123.py +137 -0
- novel_downloader/core/parsers/lewenn.py +142 -0
- novel_downloader/core/parsers/linovelib.py +48 -64
- novel_downloader/core/parsers/piaotia.py +189 -0
- novel_downloader/core/parsers/qbtr.py +136 -0
- novel_downloader/core/parsers/qianbi.py +48 -50
- novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +272 -330
- novel_downloader/core/parsers/qidian/chapter_normal.py +24 -55
- novel_downloader/core/parsers/qidian/main_parser.py +11 -38
- novel_downloader/core/parsers/qidian/utils/__init__.py +1 -0
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
- novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
- novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
- novel_downloader/core/parsers/quanben5.py +103 -0
- novel_downloader/core/parsers/registry.py +5 -16
- novel_downloader/core/parsers/sfacg.py +38 -45
- novel_downloader/core/parsers/shencou.py +215 -0
- novel_downloader/core/parsers/shuhaige.py +111 -0
- novel_downloader/core/parsers/tongrenquan.py +116 -0
- novel_downloader/core/parsers/ttkan.py +132 -0
- novel_downloader/core/parsers/wanbengo.py +191 -0
- novel_downloader/core/parsers/xiaoshuowu.py +173 -0
- novel_downloader/core/parsers/xiguashuwu.py +435 -0
- novel_downloader/core/parsers/xs63b.py +161 -0
- novel_downloader/core/parsers/xshbook.py +134 -0
- novel_downloader/core/parsers/yamibo.py +87 -131
- novel_downloader/core/parsers/yibige.py +166 -0
- novel_downloader/core/searchers/__init__.py +34 -3
- novel_downloader/core/searchers/aaatxt.py +107 -0
- novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
- novel_downloader/core/searchers/base.py +112 -36
- novel_downloader/core/searchers/dxmwx.py +105 -0
- novel_downloader/core/searchers/eightnovel.py +84 -0
- novel_downloader/core/searchers/esjzone.py +43 -25
- novel_downloader/core/searchers/hetushu.py +92 -0
- novel_downloader/core/searchers/i25zw.py +93 -0
- novel_downloader/core/searchers/ixdzs8.py +107 -0
- novel_downloader/core/searchers/jpxs123.py +107 -0
- novel_downloader/core/searchers/piaotia.py +100 -0
- novel_downloader/core/searchers/qbtr.py +106 -0
- novel_downloader/core/searchers/qianbi.py +74 -40
- novel_downloader/core/searchers/quanben5.py +144 -0
- novel_downloader/core/searchers/registry.py +24 -8
- novel_downloader/core/searchers/shuhaige.py +124 -0
- novel_downloader/core/searchers/tongrenquan.py +110 -0
- novel_downloader/core/searchers/ttkan.py +92 -0
- novel_downloader/core/searchers/xiaoshuowu.py +122 -0
- novel_downloader/core/searchers/xiguashuwu.py +95 -0
- novel_downloader/core/searchers/xs63b.py +104 -0
- novel_downloader/locales/en.json +31 -82
- novel_downloader/locales/zh.json +32 -83
- novel_downloader/models/__init__.py +21 -22
- novel_downloader/models/book.py +44 -0
- novel_downloader/models/config.py +4 -37
- novel_downloader/models/login.py +1 -1
- novel_downloader/models/search.py +5 -0
- novel_downloader/resources/config/settings.toml +8 -70
- novel_downloader/resources/json/xiguashuwu.json +718 -0
- novel_downloader/utils/__init__.py +13 -22
- novel_downloader/utils/chapter_storage.py +3 -2
- novel_downloader/utils/constants.py +4 -29
- novel_downloader/utils/cookies.py +6 -18
- novel_downloader/utils/crypto_utils/__init__.py +13 -0
- novel_downloader/utils/crypto_utils/aes_util.py +90 -0
- novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
- novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
- novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
- novel_downloader/utils/epub/__init__.py +1 -1
- novel_downloader/utils/epub/constants.py +57 -16
- novel_downloader/utils/epub/documents.py +88 -194
- novel_downloader/utils/epub/models.py +0 -14
- novel_downloader/utils/epub/utils.py +63 -96
- novel_downloader/utils/file_utils/__init__.py +2 -23
- novel_downloader/utils/file_utils/io.py +3 -113
- novel_downloader/utils/file_utils/sanitize.py +0 -4
- novel_downloader/utils/fontocr.py +207 -0
- novel_downloader/utils/logger.py +8 -16
- novel_downloader/utils/network.py +2 -2
- novel_downloader/utils/state.py +4 -90
- novel_downloader/utils/text_utils/__init__.py +1 -7
- novel_downloader/utils/text_utils/diff_display.py +5 -7
- novel_downloader/utils/time_utils/__init__.py +5 -11
- novel_downloader/utils/time_utils/datetime_utils.py +20 -29
- novel_downloader/utils/time_utils/sleep_utils.py +4 -8
- novel_downloader/web/__init__.py +13 -0
- novel_downloader/web/components/__init__.py +11 -0
- novel_downloader/web/components/navigation.py +35 -0
- novel_downloader/web/main.py +66 -0
- novel_downloader/web/pages/__init__.py +17 -0
- novel_downloader/web/pages/download.py +78 -0
- novel_downloader/web/pages/progress.py +147 -0
- novel_downloader/web/pages/search.py +329 -0
- novel_downloader/web/services/__init__.py +17 -0
- novel_downloader/web/services/client_dialog.py +164 -0
- novel_downloader/web/services/cred_broker.py +113 -0
- novel_downloader/web/services/cred_models.py +35 -0
- novel_downloader/web/services/task_manager.py +264 -0
- novel_downloader-2.0.0.dist-info/METADATA +171 -0
- novel_downloader-2.0.0.dist-info/RECORD +210 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
- novel_downloader/core/downloaders/biquge.py +0 -29
- novel_downloader/core/downloaders/esjzone.py +0 -29
- novel_downloader/core/downloaders/linovelib.py +0 -29
- novel_downloader/core/downloaders/sfacg.py +0 -29
- novel_downloader/core/downloaders/yamibo.py +0 -29
- novel_downloader/core/exporters/biquge.py +0 -22
- novel_downloader/core/exporters/esjzone.py +0 -22
- novel_downloader/core/exporters/qianbi.py +0 -22
- novel_downloader/core/exporters/sfacg.py +0 -22
- novel_downloader/core/exporters/yamibo.py +0 -22
- novel_downloader/core/fetchers/base/__init__.py +0 -14
- novel_downloader/core/fetchers/base/browser.py +0 -422
- novel_downloader/core/fetchers/biquge/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/browser.py +0 -209
- novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
- novel_downloader/core/fetchers/linovelib/browser.py +0 -198
- novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/browser.py +0 -326
- novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
- novel_downloader/core/fetchers/sfacg/browser.py +0 -194
- novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
- novel_downloader/core/fetchers/yamibo/browser.py +0 -234
- novel_downloader/core/parsers/biquge.py +0 -139
- novel_downloader/models/chapter.py +0 -25
- novel_downloader/models/types.py +0 -13
- novel_downloader/tui/__init__.py +0 -7
- novel_downloader/tui/app.py +0 -32
- novel_downloader/tui/main.py +0 -17
- novel_downloader/tui/screens/__init__.py +0 -14
- novel_downloader/tui/screens/home.py +0 -198
- novel_downloader/tui/screens/login.py +0 -74
- novel_downloader/tui/styles/home_layout.tcss +0 -79
- novel_downloader/tui/widgets/richlog_handler.py +0 -24
- novel_downloader/utils/cache.py +0 -24
- novel_downloader/utils/fontocr/__init__.py +0 -22
- novel_downloader/utils/fontocr/hash_store.py +0 -280
- novel_downloader/utils/fontocr/hash_utils.py +0 -103
- novel_downloader/utils/fontocr/model_loader.py +0 -69
- novel_downloader/utils/fontocr/ocr_v1.py +0 -315
- novel_downloader/utils/fontocr/ocr_v2.py +0 -764
- novel_downloader/utils/fontocr/ocr_v3.py +0 -744
- novel_downloader-1.5.0.dist-info/METADATA +0 -196
- novel_downloader-1.5.0.dist-info/RECORD +0 -164
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/novel_downloader/core/parsers/jpxs123.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.jpxs123
+-------------------------------------
+
+"""
+
+from typing import Any
+
+from lxml import html
+
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.core.parsers.registry import register_parser
+from novel_downloader.models import (
+    BookInfoDict,
+    ChapterDict,
+    ChapterInfoDict,
+    VolumeInfoDict,
+)
+
+
+@register_parser(
+    site_keys=["jpxs123"],
+)
+class Jpxs123Parser(BaseParser):
+    """
+    Parser for 精品小说网 book pages.
+    """
+
+    BASE_URL = "https://www.jpxs123.com"
+
+    def parse_book_info(
+        self,
+        html_list: list[str],
+        **kwargs: Any,
+    ) -> BookInfoDict | None:
+        if not html_list:
+            return None
+
+        # Parse the main info page
+        tree = html.fromstring(html_list[0])
+        # Book name
+        book_name = self._first_str(tree.xpath('//div[@class="infos"]/h1/text()'))
+        # Tags: the second breadcrumb (e.g., "同人小说")
+        tag = self._first_str(
+            tree.xpath('//div[contains(@class,"menNav")]/a[2]/text()')
+        )
+        tags = [tag] if tag else []
+
+        author = self._first_str(tree.xpath('//div[@class="date"]/span[1]//a/text()'))
+        update_time = self._first_str(
+            tree.xpath('//div[@class="date"]/span[2]/text()'), replaces=[("时间:", "")]
+        )
+        cover_rel = self._first_str(tree.xpath('//div[@class="pic"]/img/@src'))
+        cover_url = (
+            f"{self.BASE_URL}{cover_rel}"
+            if cover_rel and not cover_rel.startswith("http")
+            else cover_rel
+        )
+
+        # Summary from the <p> inside infos
+        paras = tree.xpath('//div[@class="infos"]/p//text()')
+        summary = "\n".join(p.strip() for p in paras if p.strip())
+
+        # Chapters from the book_list
+        chapters: list[ChapterInfoDict] = []
+        for a in tree.xpath('//div[contains(@class,"book_list")]//li/a'):
+            url = a.get("href", "").strip()
+            title = a.text_content().strip()
+            # General regex: /{category}/{bookId}/{chapterId}.html
+            cid = url.split("/")[-1].split(".")[0]
+            chapters.append({"title": title, "url": url, "chapterId": cid})
+
+        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]
+
+        # Parse the download page (second HTML)
+        download_url = ""
+        if len(html_list) > 1 and html_list[1]:
+            dtree = html.fromstring(html_list[1])
+            a = dtree.xpath('//a[@id="dowloadnUrl"]')
+            if a:
+                link = a[0].get("link") or a[0].get("href") or ""
+                download_url = self._fix_download_link(link)
+
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "update_time": update_time,
+            "tags": tags,
+            "summary": summary,
+            "volumes": volumes,
+            "extra": {"download_url": download_url},
+        }
+
+    def parse_chapter(
+        self,
+        html_list: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        if not html_list:
+            return None
+
+        tree = html.fromstring(html_list[0])
+
+        raw_title = self._first_str(
+            tree.xpath('//div[contains(@class,"read_chapterName")]//h1/text()')
+        )
+
+        crumbs = tree.xpath('//div[contains(@class,"readTop")]//a/text()')
+        book_name = crumbs[-1].strip() if crumbs else ""
+
+        title = raw_title.replace(book_name, "").strip()
+
+        paragraphs = tree.xpath('//div[contains(@class,"read_chapterDetail")]/p')
+        texts = []
+        for p in paragraphs:
+            txt = p.text_content().strip()
+            if txt:
+                texts.append(txt)
+
+        content = "\n".join(texts)
+        if not content:
+            return None
+
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {"site": "jpxs123"},
+        }
+
+    @classmethod
+    def _fix_download_link(cls, link: str) -> str:
+        true_link = link.replace("xs../", "/e/DownSys/")
+        return f"{cls.BASE_URL}{true_link}"
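As orientation to the parser contract above: `parse_book_info` takes the info page first and the optional download page second. Below is a hedged sketch driving `Jpxs123Parser` with a minimal HTML fragment shaped after the XPaths in the diff; the fragment is invented for illustration, and the no-argument constructor is an assumption (the real `BaseParser` may require a parser config).

```python
# Hedged usage sketch -- the HTML fragment is invented to match the XPaths
# above; Jpxs123Parser() with no arguments is an assumption about BaseParser.
from novel_downloader.core.parsers.jpxs123 import Jpxs123Parser

INFO_HTML = """
<div class="menNav"><a href="/">首页</a><a href="/tongren/">同人小说</a></div>
<div class="pic"><img src="/files/cover.jpg"/></div>
<div class="infos">
  <h1>示例书名</h1>
  <div class="date"><span>作者:<a href="#">某作者</a></span><span>时间:2023-01-01</span></div>
  <p>第一段简介。</p>
</div>
<div class="book_list"><ul><li><a href="/tongren/1/2.html">第一章</a></li></ul></div>
"""

parser = Jpxs123Parser()  # assumed: constructible without arguments
info = parser.parse_book_info([INFO_HTML])
if info:
    # With only one page supplied, extra["download_url"] stays "".
    print(info["book_name"], info["update_time"])          # 示例书名 2023-01-01
    print(info["volumes"][0]["chapters"][0]["chapterId"])  # 2
```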
--- /dev/null
+++ b/novel_downloader/core/parsers/lewenn.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.lewenn
+------------------------------------
+
+"""
+
+from typing import Any
+
+from lxml import html
+
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.core.parsers.registry import register_parser
+from novel_downloader.models import (
+    BookInfoDict,
+    ChapterDict,
+    ChapterInfoDict,
+    VolumeInfoDict,
+)
+
+
+@register_parser(
+    site_keys=["lewenn", "lewen"],
+)
+class LewennParser(BaseParser):
+    """
+    Parser for 乐文小说网 book pages.
+    """
+
+    BASE_URL = "https://www.lewenn.net"
+
+    ADS: set[str] = {
+        "app2",
+        "read2",
+        "chaptererror",
+        "记住乐文小说网",
+        "lewenn.net",
+    }
+
+    def parse_book_info(
+        self,
+        html_list: list[str],
+        **kwargs: Any,
+    ) -> BookInfoDict | None:
+        if not html_list:
+            return None
+
+        tree = html.fromstring(html_list[0])
+
+        # --- Metadata ---
+        book_name = self._first_str(tree.xpath('//div[@id="info"]/h1/text()'))
+        author = self._first_str(
+            tree.xpath('//div[@id="info"]/p[1]/text()'),
+            replaces=[(chr(0xA0), ""), ("作者:", "")],
+        )
+        serial_status = self._first_str(
+            tree.xpath('//div[@id="info"]/p[2]/text()'),
+            replaces=[(chr(0xA0), ""), ("状态:", "")],
+        )
+        update_time = self._first_str(
+            tree.xpath('//div[@id="info"]/p[3]/text()'),
+            replaces=[("最后更新:", "")],
+        )
+
+        cover_src = self._first_str(tree.xpath('//div[@id="sidebar"]//img/@src'))
+        cover_url = (
+            cover_src if cover_src.startswith("http") else f"{self.BASE_URL}{cover_src}"
+        )
+
+        summary_lines = tree.xpath('//div[@id="intro"]/p//text()')
+        summary = "\n".join(line.strip() for line in summary_lines).strip()
+
+        # --- Volumes & Chapters ---
+        chapters: list[ChapterInfoDict] = []
+        for dt in tree.xpath('//div[@class="listmain"]/dl/dt'):
+            title_text = dt.text_content().strip()
+            if "正文" in title_text:
+                # collect its <dd> siblings
+                sib = dt.getnext()
+                while sib is not None and sib.tag == "dd":
+                    a = sib.xpath(".//a")[0]
+                    chap_title = a.text_content().strip()
+                    href = a.get("href")
+                    url = href if href.startswith("http") else f"{self.BASE_URL}{href}"
+                    chap_id = url.rstrip(".html").split("/")[-1]
+                    chapters.append(
+                        {"title": chap_title, "url": url, "chapterId": chap_id}
+                    )
+                    sib = sib.getnext()
+                break
+
+        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]
+
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "update_time": update_time,
+            "serial_status": serial_status,
+            "summary": summary,
+            "volumes": volumes,
+            "extra": {},
+        }
+
+    def parse_chapter(
+        self,
+        html_list: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        if not html_list:
+            return None
+
+        tree = html.fromstring(html_list[0])
+
+        title = self._first_str(tree.xpath('//div[@class="content"]/h1/text()'))
+
+        nodes = tree.xpath('//div[@id="content" and contains(@class,"showtxt")]')
+        if not nodes:
+            return None
+        content_div = nodes[0]
+
+        raw_lines = [ln.strip() for ln in content_div.xpath(".//text()")]
+
+        lines: list[str] = []
+        for ln in raw_lines:
+            if not ln or self._is_ad_line(ln):
+                continue
+            # if ln.startswith("(") and ln.endswith(")"):
+            #     continue
+            lines.append(ln.replace(chr(0xA0), ""))
+
+        content = "\n".join(lines)
+        if not content.strip():
+            return None
+
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {"site": "lewenn"},
+        }
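`parse_chapter` above filters each text line through `self._is_ad_line`, which is defined in `BaseParser` (`core/parsers/base.py`, +63 -12 in this release) and not shown in this diff. A minimal sketch of the plausible semantics, assuming simple substring matching against the class-level `ADS` set:

```python
# Assumption: _is_ad_line drops any line containing one of the ADS markers.
# BaseParser's real implementation is not part of this diff and may differ
# (e.g. regex-based matching or extra normalization).
ADS: set[str] = {"app2", "read2", "chaptererror", "记住乐文小说网", "lewenn.net"}

def _is_ad_line(line: str) -> bool:
    return any(marker in line for marker in ADS)

assert _is_ad_line("记住乐文小说网,让阅读更简单") is True
assert _is_ad_line("第一章 开端") is False
```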
--- a/novel_downloader/core/parsers/linovelib.py
+++ b/novel_downloader/core/parsers/linovelib.py
@@ -7,23 +7,28 @@ novel_downloader.core.parsers.linovelib
 
 import json
 from itertools import islice
-from pathlib import PurePosixPath
 from typing import Any
 
 from lxml import html
 
 from novel_downloader.core.parsers.base import BaseParser
 from novel_downloader.core.parsers.registry import register_parser
-from novel_downloader.models import
+from novel_downloader.models import (
+    BookInfoDict,
+    ChapterDict,
+    ChapterInfoDict,
+    VolumeInfoDict,
+)
 from novel_downloader.utils.constants import LINOVELIB_FONT_MAP_PATH
 
 
 @register_parser(
     site_keys=["linovelib"],
-    backends=["session", "browser"],
 )
 class LinovelibParser(BaseParser):
-    """
+    """
+    Parser for 哔哩轻小说 book pages.
+    """
 
     # Book info XPaths
     _BOOK_NAME_XPATH = '//div[@class="book-info"]/h1[@class="book-name"]/text()'
@@ -51,68 +56,69 @@ class LinovelibParser(BaseParser):
         self,
         html_list: list[str],
         **kwargs: Any,
-    ) ->
-        """
-        Parse a book info page and extract metadata and chapter structure.
-
-        :param html_list: Raw HTML of the book info page.
-        :return: Parsed metadata and chapter structure as a dictionary.
-        """
+    ) -> BookInfoDict | None:
         if not html_list:
-            return
-
-
-
-
-
-
-
-            info_tree, self._UPDATE_TIME_XPATH, replace=("最后更新:", "")
+            return None
+        tree = html.fromstring(html_list[0])
+
+        book_name = self._first_str(tree.xpath(self._BOOK_NAME_XPATH))
+        author = self._first_str(tree.xpath(self._AUTHOR_XPATH))
+        cover_url = self._first_str(tree.xpath(self._COVER_URL_XPATH))
+        update_time = self._first_str(
+            tree.xpath(self._UPDATE_TIME_XPATH), replaces=[("最后更新:", "")]
         )
-
-
-
+        serial_status = self._first_str(tree.xpath(self._SERIAL_STATUS_XPATH))
+        word_count = self._first_str(
+            tree.xpath(self._WORD_COUNT_XPATH), replaces=[("最后更新:", "")]
        )
 
-
+        summary = self._extract_intro(tree, self._SUMMARY_XPATH)
 
         vol_pages = html_list[1:]
-        volumes: list[
+        volumes: list[VolumeInfoDict] = []
         for vol_page in vol_pages:
             vol_tree = html.fromstring(vol_page)
-            volume_cover = self.
-            volume_name = self.
-
-            vol_tree
+            volume_cover = self._first_str(vol_tree.xpath(self._COVER_URL_XPATH))
+            volume_name = self._first_str(vol_tree.xpath(self._BOOK_NAME_XPATH))
+            vol_update_time = self._first_str(
+                vol_tree.xpath(self._UPDATE_TIME_XPATH), replaces=[("最后更新:", "")]
             )
-
-            vol_tree
+            vol_word_count = self._first_str(
+                vol_tree.xpath(self._WORD_COUNT_XPATH), replaces=[("字数:", "")]
             )
             volume_intro = self._extract_intro(vol_tree, self._SUMMARY_XPATH)
 
-            chapters = []
+            chapters: list[ChapterInfoDict] = []
             chapter_elements = vol_tree.xpath(self._CHAPTERS_XPATH)
             for a in chapter_elements:
                 title = a.text.strip()
                 url = a.attrib.get("href", "").strip()
-
-
-
-                )
+                # '/novel/4668/276082.html' -> '276082'
+                cid = url.split("/")[-1].split(".")[0]
+                chapters.append({"title": title, "url": url, "chapterId": cid})
 
             volumes.append(
                 {
                     "volume_name": volume_name,
                     "volume_cover": volume_cover,
-                    "update_time":
-                    "word_count":
+                    "update_time": vol_update_time,
+                    "word_count": vol_word_count,
                     "volume_intro": volume_intro,
                     "chapters": chapters,
                 }
             )
-        result["volumes"] = volumes
 
-        return
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "serial_status": serial_status,
+            "word_count": word_count,
+            "summary": summary,
+            "update_time": update_time,
+            "volumes": volumes,
+            "extra": {},
+        }
 
     def parse_chapter(
         self,
@@ -120,13 +126,6 @@ class LinovelibParser(BaseParser):
         chapter_id: str,
         **kwargs: Any,
     ) -> ChapterDict | None:
-        """
-        Parse chapter pages and extract clean text or simplified HTML.
-
-        :param html_list: Raw HTML of the chapter page.
-        :param chapter_id: Identifier of the chapter being parsed.
-        :return: Cleaned chapter content as plain text or minimal HTML.
-        """
         if not html_list:
             return None
         title_text: str = ""
@@ -170,25 +169,10 @@ class LinovelibParser(BaseParser):
         return {
             "id": chapter_id,
             "title": title_text,
-            "content": "\n
+            "content": "\n".join(contents),
             "extra": {"site": "linovelib"},
         }
 
-    def _safe_xpath(
-        self,
-        tree: html.HtmlElement,
-        path: str,
-        replace: tuple[str, str] | None = None,
-    ) -> str:
-        result = tree.xpath(path)
-        if not result:
-            return ""
-        value: str = result[0].strip()
-        if replace:
-            old, new = replace
-            value = value.replace(old, new)
-        return value
-
     @staticmethod
     def _extract_intro(tree: html.HtmlElement, xpath: str) -> str:
         paragraphs = tree.xpath(xpath.replace("//text()", ""))
@@ -197,7 +181,7 @@ class LinovelibParser(BaseParser):
             text_segments = p.xpath(".//text()")
             cleaned = [seg.strip() for seg in text_segments if seg.strip()]
             lines.append("\n".join(cleaned))
-        return "\n
+        return "\n".join(lines)
 
     @staticmethod
     def _is_encrypted(html: str) -> bool:
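The move from the deleted `_safe_xpath` helper to `_first_str` is the core of this refactor: a single optional `replace` pair becomes a list of `replaces` pairs, applied to the first XPath hit. `_first_str` itself lives in the reworked `BaseParser` and is not shown in this diff; a minimal re-implementation inferred from its call sites, for reference:

```python
# Inferred from call sites in this diff; the real BaseParser._first_str may
# differ in signature or edge-case handling.
def _first_str(
    results: list[str],
    replaces: list[tuple[str, str]] | None = None,
) -> str:
    value = results[0].strip() if results else ""
    for old, new in replaces or []:
        value = value.replace(old, new)
    return value

print(_first_str(["  最后更新:2024-05-01  "], replaces=[("最后更新:", "")]))  # 2024-05-01
print(_first_str([]))  # "" -- missing nodes degrade to an empty string
```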
--- /dev/null
+++ b/novel_downloader/core/parsers/piaotia.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.piaotia
+-------------------------------------
+
+"""
+
+import re
+from typing import Any
+
+from lxml import html
+
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.core.parsers.registry import register_parser
+from novel_downloader.models import (
+    BookInfoDict,
+    ChapterDict,
+    ChapterInfoDict,
+    VolumeInfoDict,
+)
+
+
+@register_parser(
+    site_keys=["piaotia"],
+)
+class PiaotiaParser(BaseParser):
+    """
+    Parser for 飘天文学网 book pages.
+    """
+
+    _RE_DEVICE_DIV = re.compile(
+        r'<div\s+id=[\'"“”]?device[\'"“”]?[^>]*>',
+        flags=re.IGNORECASE,
+    )
+
+    def parse_book_info(
+        self,
+        html_list: list[str],
+        **kwargs: Any,
+    ) -> BookInfoDict | None:
+        if len(html_list) < 2:
+            return None
+
+        # Parse trees
+        info_tree = html.fromstring(html_list[0])
+        catalog_tree = html.fromstring(html_list[1])
+
+        book_name = self._first_str(info_tree.xpath("//span[@style]//h1/text()"))
+        author = self._first_str(
+            info_tree.xpath(
+                '//td[contains(text(),"作") and contains(text(),"者")]/text()'
+            ),
+            replaces=[(chr(0xA0), ""), (" ", ""), ("作者:", "")],
+        )
+
+        # Category as tag
+        category = self._first_str(
+            info_tree.xpath(
+                '//td[contains(text(),"类") and contains(text(),"别")]/text()'
+            ),
+            replaces=[(chr(0xA0), ""), (" ", ""), ("类别:", "")],
+        )
+        tags = [category] if category else []
+
+        word_count = self._first_str(
+            info_tree.xpath('//td[contains(text(),"全文长度")]/text()'),
+            replaces=[(chr(0xA0), ""), (" ", ""), ("全文长度:", "")],
+        )
+
+        update_time = self._first_str(
+            info_tree.xpath('//td[contains(text(),"最后更新")]/text()'),
+            replaces=[(chr(0xA0), ""), (" ", ""), ("最后更新:", "")],
+        )
+
+        serial_status = self._first_str(
+            info_tree.xpath('//td[contains(text(),"文章状态")]/text()'),
+            replaces=[(chr(0xA0), ""), (" ", ""), ("文章状态:", "")],
+        )
+
+        cover_url = self._first_str(info_tree.xpath('//td[@width="80%"]//img/@src'))
+
+        # Summary
+        summary_divs = info_tree.xpath('//td[@width="80%"]/div')
+        if summary_divs:
+            raw = str(summary_divs[0].text_content())
+            summary = raw.split("内容简介:")[-1].strip()
+        else:
+            summary = ""
+
+        # Chapters (single volume)
+        chapters: list[ChapterInfoDict] = []
+        for a in catalog_tree.xpath('//div[@class="centent"]//ul/li/a'):
+            title = (a.text or "").strip()
+            url = a.get("href", "").strip()
+            chapter_id = url.split(".")[0]
+            chapters.append({"title": title, "url": url, "chapterId": chapter_id})
+
+        # Single volume
+        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]
+
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "update_time": update_time,
+            "summary": summary,
+            "volumes": volumes,
+            "tags": tags,
+            "word_count": word_count,
+            "serial_status": serial_status,
+            "extra": {},
+        }
+
+    def parse_chapter(
+        self,
+        html_list: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        """
+        Parse chapter page and extract the content of one chapter.
+
+        p.s. The markup is a mess:
+        1. `<head>` has no matching `</head>`; likewise `</body>` has no matching `<body>`
+        2. Parts of the HTML are written directly by JS, e.g.:
+           `document.write("<div id=\"main\" class=\"colors1 sidebar\">");`
+        3. Some div id/style attributes use non-standard curly quotes, e.g.:
+           `<div id=”device” style=”background-color...”>`,
+           and that div has no matching `</div>`
+
+        :param html_list: The HTML list of the chapter pages.
+        :param chapter_id: Identifier of the chapter being parsed.
+        :return: The chapter's data.
+        """
+        if not html_list:
+            return None
+
+        raw = self._RE_DEVICE_DIV.sub("", html_list[0])
+        raw = raw.replace(
+            '<script language="javascript">GetMode();</script>',
+            '<div id="main" class="colors1 sidebar">',
+        ).replace(
+            '<script language="javascript">GetFont();</script>',
+            '<div id="content">',
+        )
+
+        doc = html.fromstring(raw)
+        container = doc.xpath('//div[@id="content"]')
+        root = container[0] if container else doc
+
+        # Title comes straight from the <h1>
+        title = ""
+        h1 = root.find(".//h1")
+        if h1 is not None:
+            full = h1.text_content().strip()
+            a_txt = h1.xpath("./a/text()")
+            title = full.replace(a_txt[0].strip(), "").strip() if a_txt else full
+
+        # Walk the <br> siblings after the script-generated table for the body
+        table = root.xpath('.//table[@align="center" and @border]')
+        if not table:
+            return None
+        node = table[0].getnext()
+
+        lines: list[str] = []
+        while node is not None:
+            # stop at the next table or any bottom-link nav div
+            if (node.tag == "table" and node.get("border")) or (
+                node.tag == "div" and node.get("class", "").endswith("link")
+            ):
+                break
+
+            if node.tag == "br":
+                txt = (node.tail or "").replace("\xa0", " ").strip()
+                if txt:
+                    lines.append(txt)
+
+            node = node.getnext()
+
+        content = "\n".join(lines).strip()
+        if not content:
+            return None
+
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {"site": "piaotia"},
+        }
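The pre-parse cleanup in `parse_chapter` is the non-obvious part of this file: the curly-quoted, unclosed `device` div is stripped and the two `GetMode()`/`GetFont()` script tags are rewritten into the divs they would have `document.write()`-emitted, so lxml can then locate `div#content`. A self-contained demonstration of just that step (the sample input is invented to match the patterns the regex and replacements target):

```python
import re

_RE_DEVICE_DIV = re.compile(
    r'<div\s+id=[\'"“”]?device[\'"“”]?[^>]*>',
    flags=re.IGNORECASE,
)

raw = (
    '<div id=”device” style=”background-color:#eee”>'
    '<script language="javascript">GetMode();</script>'
    '<script language="javascript">GetFont();</script>'
    "正文第一段…"
)
raw = _RE_DEVICE_DIV.sub("", raw)  # drop the unclosed, curly-quoted div
raw = raw.replace(
    '<script language="javascript">GetMode();</script>',
    '<div id="main" class="colors1 sidebar">',
).replace(
    '<script language="javascript">GetFont();</script>',
    '<div id="content">',
)
print(raw)
# -> <div id="main" class="colors1 sidebar"><div id="content">正文第一段…
```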