novel-downloader 1.4.5__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/__init__.py +2 -4
- novel_downloader/cli/clean.py +21 -88
- novel_downloader/cli/config.py +27 -104
- novel_downloader/cli/download.py +78 -66
- novel_downloader/cli/export.py +20 -21
- novel_downloader/cli/main.py +3 -1
- novel_downloader/cli/search.py +120 -0
- novel_downloader/cli/ui.py +156 -0
- novel_downloader/config/__init__.py +10 -14
- novel_downloader/config/adapter.py +195 -99
- novel_downloader/config/{loader.py → file_io.py} +53 -27
- novel_downloader/core/__init__.py +14 -13
- novel_downloader/core/archived/deqixs/fetcher.py +115 -0
- novel_downloader/core/archived/deqixs/parser.py +132 -0
- novel_downloader/core/archived/deqixs/searcher.py +89 -0
- novel_downloader/core/archived/qidian/searcher.py +79 -0
- novel_downloader/core/archived/wanbengo/searcher.py +98 -0
- novel_downloader/core/archived/xshbook/searcher.py +93 -0
- novel_downloader/core/downloaders/__init__.py +8 -30
- novel_downloader/core/downloaders/base.py +182 -30
- novel_downloader/core/downloaders/common.py +217 -384
- novel_downloader/core/downloaders/qianbi.py +332 -4
- novel_downloader/core/downloaders/qidian.py +250 -290
- novel_downloader/core/downloaders/registry.py +69 -0
- novel_downloader/core/downloaders/signals.py +46 -0
- novel_downloader/core/exporters/__init__.py +8 -26
- novel_downloader/core/exporters/base.py +107 -31
- novel_downloader/core/exporters/common/__init__.py +3 -4
- novel_downloader/core/exporters/common/epub.py +92 -171
- novel_downloader/core/exporters/common/main_exporter.py +14 -67
- novel_downloader/core/exporters/common/txt.py +90 -86
- novel_downloader/core/exporters/epub_util.py +184 -1327
- novel_downloader/core/exporters/linovelib/__init__.py +3 -2
- novel_downloader/core/exporters/linovelib/epub.py +165 -222
- novel_downloader/core/exporters/linovelib/main_exporter.py +10 -71
- novel_downloader/core/exporters/linovelib/txt.py +76 -66
- novel_downloader/core/exporters/qidian.py +15 -11
- novel_downloader/core/exporters/registry.py +55 -0
- novel_downloader/core/exporters/txt_util.py +67 -0
- novel_downloader/core/fetchers/__init__.py +57 -56
- novel_downloader/core/fetchers/aaatxt.py +83 -0
- novel_downloader/core/fetchers/{biquge/session.py → b520.py} +10 -10
- novel_downloader/core/fetchers/{base/session.py → base.py} +63 -47
- novel_downloader/core/fetchers/biquyuedu.py +83 -0
- novel_downloader/core/fetchers/dxmwx.py +110 -0
- novel_downloader/core/fetchers/eightnovel.py +139 -0
- novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +23 -11
- novel_downloader/core/fetchers/guidaye.py +85 -0
- novel_downloader/core/fetchers/hetushu.py +92 -0
- novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +22 -26
- novel_downloader/core/fetchers/ixdzs8.py +113 -0
- novel_downloader/core/fetchers/jpxs123.py +101 -0
- novel_downloader/core/fetchers/{biquge/browser.py → lewenn.py} +15 -15
- novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +16 -12
- novel_downloader/core/fetchers/piaotia.py +105 -0
- novel_downloader/core/fetchers/qbtr.py +101 -0
- novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +9 -9
- novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +55 -40
- novel_downloader/core/fetchers/quanben5.py +92 -0
- novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
- novel_downloader/core/fetchers/registry.py +60 -0
- novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +11 -9
- novel_downloader/core/fetchers/shencou.py +106 -0
- novel_downloader/core/fetchers/{common/browser.py → shuhaige.py} +24 -19
- novel_downloader/core/fetchers/tongrenquan.py +84 -0
- novel_downloader/core/fetchers/ttkan.py +95 -0
- novel_downloader/core/fetchers/{common/session.py → wanbengo.py} +21 -17
- novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
- novel_downloader/core/fetchers/xiguashuwu.py +177 -0
- novel_downloader/core/fetchers/xs63b.py +171 -0
- novel_downloader/core/fetchers/xshbook.py +85 -0
- novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +23 -11
- novel_downloader/core/fetchers/yibige.py +114 -0
- novel_downloader/core/interfaces/__init__.py +8 -14
- novel_downloader/core/interfaces/downloader.py +6 -2
- novel_downloader/core/interfaces/exporter.py +7 -7
- novel_downloader/core/interfaces/fetcher.py +4 -17
- novel_downloader/core/interfaces/parser.py +5 -6
- novel_downloader/core/interfaces/searcher.py +26 -0
- novel_downloader/core/parsers/__init__.py +58 -22
- novel_downloader/core/parsers/aaatxt.py +132 -0
- novel_downloader/core/parsers/b520.py +116 -0
- novel_downloader/core/parsers/base.py +63 -12
- novel_downloader/core/parsers/biquyuedu.py +133 -0
- novel_downloader/core/parsers/dxmwx.py +162 -0
- novel_downloader/core/parsers/eightnovel.py +224 -0
- novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +67 -67
- novel_downloader/core/parsers/guidaye.py +128 -0
- novel_downloader/core/parsers/hetushu.py +139 -0
- novel_downloader/core/parsers/i25zw.py +137 -0
- novel_downloader/core/parsers/ixdzs8.py +186 -0
- novel_downloader/core/parsers/jpxs123.py +137 -0
- novel_downloader/core/parsers/lewenn.py +142 -0
- novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +54 -65
- novel_downloader/core/parsers/piaotia.py +189 -0
- novel_downloader/core/parsers/qbtr.py +136 -0
- novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +54 -51
- novel_downloader/core/parsers/qidian/__init__.py +2 -2
- novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +290 -346
- novel_downloader/core/parsers/qidian/chapter_normal.py +25 -56
- novel_downloader/core/parsers/qidian/main_parser.py +19 -57
- novel_downloader/core/parsers/qidian/utils/__init__.py +12 -11
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +6 -7
- novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
- novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
- novel_downloader/core/parsers/quanben5.py +103 -0
- novel_downloader/core/parsers/registry.py +57 -0
- novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +46 -48
- novel_downloader/core/parsers/shencou.py +215 -0
- novel_downloader/core/parsers/shuhaige.py +111 -0
- novel_downloader/core/parsers/tongrenquan.py +116 -0
- novel_downloader/core/parsers/ttkan.py +132 -0
- novel_downloader/core/parsers/wanbengo.py +191 -0
- novel_downloader/core/parsers/xiaoshuowu.py +173 -0
- novel_downloader/core/parsers/xiguashuwu.py +435 -0
- novel_downloader/core/parsers/xs63b.py +161 -0
- novel_downloader/core/parsers/xshbook.py +134 -0
- novel_downloader/core/parsers/yamibo.py +155 -0
- novel_downloader/core/parsers/yibige.py +166 -0
- novel_downloader/core/searchers/__init__.py +51 -0
- novel_downloader/core/searchers/aaatxt.py +107 -0
- novel_downloader/core/searchers/b520.py +84 -0
- novel_downloader/core/searchers/base.py +168 -0
- novel_downloader/core/searchers/dxmwx.py +105 -0
- novel_downloader/core/searchers/eightnovel.py +84 -0
- novel_downloader/core/searchers/esjzone.py +102 -0
- novel_downloader/core/searchers/hetushu.py +92 -0
- novel_downloader/core/searchers/i25zw.py +93 -0
- novel_downloader/core/searchers/ixdzs8.py +107 -0
- novel_downloader/core/searchers/jpxs123.py +107 -0
- novel_downloader/core/searchers/piaotia.py +100 -0
- novel_downloader/core/searchers/qbtr.py +106 -0
- novel_downloader/core/searchers/qianbi.py +165 -0
- novel_downloader/core/searchers/quanben5.py +144 -0
- novel_downloader/core/searchers/registry.py +79 -0
- novel_downloader/core/searchers/shuhaige.py +124 -0
- novel_downloader/core/searchers/tongrenquan.py +110 -0
- novel_downloader/core/searchers/ttkan.py +92 -0
- novel_downloader/core/searchers/xiaoshuowu.py +122 -0
- novel_downloader/core/searchers/xiguashuwu.py +95 -0
- novel_downloader/core/searchers/xs63b.py +104 -0
- novel_downloader/locales/en.json +36 -79
- novel_downloader/locales/zh.json +37 -80
- novel_downloader/models/__init__.py +23 -50
- novel_downloader/models/book.py +44 -0
- novel_downloader/models/config.py +16 -43
- novel_downloader/models/login.py +1 -1
- novel_downloader/models/search.py +21 -0
- novel_downloader/resources/config/settings.toml +39 -74
- novel_downloader/resources/css_styles/intro.css +83 -0
- novel_downloader/resources/css_styles/main.css +30 -89
- novel_downloader/resources/json/xiguashuwu.json +718 -0
- novel_downloader/utils/__init__.py +43 -0
- novel_downloader/utils/chapter_storage.py +247 -226
- novel_downloader/utils/constants.py +5 -50
- novel_downloader/utils/cookies.py +6 -18
- novel_downloader/utils/crypto_utils/__init__.py +13 -0
- novel_downloader/utils/crypto_utils/aes_util.py +90 -0
- novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
- novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
- novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
- novel_downloader/utils/epub/__init__.py +34 -0
- novel_downloader/utils/epub/builder.py +377 -0
- novel_downloader/utils/epub/constants.py +118 -0
- novel_downloader/utils/epub/documents.py +297 -0
- novel_downloader/utils/epub/models.py +120 -0
- novel_downloader/utils/epub/utils.py +179 -0
- novel_downloader/utils/file_utils/__init__.py +5 -30
- novel_downloader/utils/file_utils/io.py +9 -150
- novel_downloader/utils/file_utils/normalize.py +2 -2
- novel_downloader/utils/file_utils/sanitize.py +2 -7
- novel_downloader/utils/fontocr.py +207 -0
- novel_downloader/utils/i18n.py +2 -0
- novel_downloader/utils/logger.py +10 -16
- novel_downloader/utils/network.py +111 -252
- novel_downloader/utils/state.py +5 -90
- novel_downloader/utils/text_utils/__init__.py +16 -21
- novel_downloader/utils/text_utils/diff_display.py +6 -9
- novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
- novel_downloader/utils/text_utils/text_cleaner.py +179 -0
- novel_downloader/utils/text_utils/truncate_utils.py +62 -0
- novel_downloader/utils/time_utils/__init__.py +6 -12
- novel_downloader/utils/time_utils/datetime_utils.py +23 -33
- novel_downloader/utils/time_utils/sleep_utils.py +5 -10
- novel_downloader/web/__init__.py +13 -0
- novel_downloader/web/components/__init__.py +11 -0
- novel_downloader/web/components/navigation.py +35 -0
- novel_downloader/web/main.py +66 -0
- novel_downloader/web/pages/__init__.py +17 -0
- novel_downloader/web/pages/download.py +78 -0
- novel_downloader/web/pages/progress.py +147 -0
- novel_downloader/web/pages/search.py +329 -0
- novel_downloader/web/services/__init__.py +17 -0
- novel_downloader/web/services/client_dialog.py +164 -0
- novel_downloader/web/services/cred_broker.py +113 -0
- novel_downloader/web/services/cred_models.py +35 -0
- novel_downloader/web/services/task_manager.py +264 -0
- novel_downloader-2.0.0.dist-info/METADATA +171 -0
- novel_downloader-2.0.0.dist-info/RECORD +210 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
- novel_downloader/config/site_rules.py +0 -94
- novel_downloader/core/downloaders/biquge.py +0 -25
- novel_downloader/core/downloaders/esjzone.py +0 -25
- novel_downloader/core/downloaders/linovelib.py +0 -25
- novel_downloader/core/downloaders/sfacg.py +0 -25
- novel_downloader/core/downloaders/yamibo.py +0 -25
- novel_downloader/core/exporters/biquge.py +0 -25
- novel_downloader/core/exporters/esjzone.py +0 -25
- novel_downloader/core/exporters/qianbi.py +0 -25
- novel_downloader/core/exporters/sfacg.py +0 -25
- novel_downloader/core/exporters/yamibo.py +0 -25
- novel_downloader/core/factory/__init__.py +0 -20
- novel_downloader/core/factory/downloader.py +0 -73
- novel_downloader/core/factory/exporter.py +0 -58
- novel_downloader/core/factory/fetcher.py +0 -96
- novel_downloader/core/factory/parser.py +0 -86
- novel_downloader/core/fetchers/base/__init__.py +0 -14
- novel_downloader/core/fetchers/base/browser.py +0 -403
- novel_downloader/core/fetchers/biquge/__init__.py +0 -14
- novel_downloader/core/fetchers/common/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/browser.py +0 -204
- novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
- novel_downloader/core/fetchers/linovelib/browser.py +0 -193
- novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/browser.py +0 -318
- novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
- novel_downloader/core/fetchers/sfacg/browser.py +0 -189
- novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
- novel_downloader/core/fetchers/yamibo/browser.py +0 -229
- novel_downloader/core/parsers/biquge/__init__.py +0 -10
- novel_downloader/core/parsers/biquge/main_parser.py +0 -134
- novel_downloader/core/parsers/common/__init__.py +0 -13
- novel_downloader/core/parsers/common/helper.py +0 -323
- novel_downloader/core/parsers/common/main_parser.py +0 -106
- novel_downloader/core/parsers/esjzone/__init__.py +0 -10
- novel_downloader/core/parsers/linovelib/__init__.py +0 -10
- novel_downloader/core/parsers/qianbi/__init__.py +0 -10
- novel_downloader/core/parsers/sfacg/__init__.py +0 -10
- novel_downloader/core/parsers/yamibo/__init__.py +0 -10
- novel_downloader/core/parsers/yamibo/main_parser.py +0 -194
- novel_downloader/models/browser.py +0 -21
- novel_downloader/models/chapter.py +0 -25
- novel_downloader/models/site_rules.py +0 -99
- novel_downloader/models/tasks.py +0 -33
- novel_downloader/models/types.py +0 -15
- novel_downloader/resources/css_styles/volume-intro.css +0 -56
- novel_downloader/resources/json/replace_word_map.json +0 -4
- novel_downloader/resources/text/blacklist.txt +0 -22
- novel_downloader/tui/__init__.py +0 -7
- novel_downloader/tui/app.py +0 -32
- novel_downloader/tui/main.py +0 -17
- novel_downloader/tui/screens/__init__.py +0 -14
- novel_downloader/tui/screens/home.py +0 -198
- novel_downloader/tui/screens/login.py +0 -74
- novel_downloader/tui/styles/home_layout.tcss +0 -79
- novel_downloader/tui/widgets/richlog_handler.py +0 -24
- novel_downloader/utils/cache.py +0 -24
- novel_downloader/utils/fontocr/__init__.py +0 -22
- novel_downloader/utils/fontocr/model_loader.py +0 -69
- novel_downloader/utils/fontocr/ocr_v1.py +0 -303
- novel_downloader/utils/fontocr/ocr_v2.py +0 -752
- novel_downloader/utils/hash_store.py +0 -279
- novel_downloader/utils/hash_utils.py +0 -103
- novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
- novel_downloader/utils/text_utils/font_mapping.py +0 -28
- novel_downloader/utils/text_utils/text_cleaning.py +0 -107
- novel_downloader-1.4.5.dist-info/METADATA +0 -196
- novel_downloader-1.4.5.dist-info/RECORD +0 -165
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,189 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.piaotia
|
4
|
+
-------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
import re
|
9
|
+
from typing import Any
|
10
|
+
|
11
|
+
from lxml import html
|
12
|
+
|
13
|
+
from novel_downloader.core.parsers.base import BaseParser
|
14
|
+
from novel_downloader.core.parsers.registry import register_parser
|
15
|
+
from novel_downloader.models import (
|
16
|
+
BookInfoDict,
|
17
|
+
ChapterDict,
|
18
|
+
ChapterInfoDict,
|
19
|
+
VolumeInfoDict,
|
20
|
+
)
|
21
|
+
|
22
|
+
|
23
|
+
@register_parser(
    site_keys=["piaotia"],
)
class PiaotiaParser(BaseParser):
    """
    Parser for 飘天文学网 book pages.
    """

    # Opening ``<div id="device" ...>`` tag; the quotes around the id may be
    # straight, curly, or missing (see the notes in ``parse_chapter``).
    _RE_DEVICE_DIV = re.compile(
        r'<div\s+id=[\'"“”]?device[\'"“”]?[^>]*>',
        flags=re.IGNORECASE,
    )

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse the book info page and the catalog page into book metadata.

        :param html_list: ``[info_page_html, catalog_page_html]``.
        :return: Book metadata with a single volume holding every chapter,
                 or ``None`` when fewer than two pages are supplied.
        """
        if len(html_list) < 2:
            return None

        info_tree = html.fromstring(html_list[0])
        catalog_tree = html.fromstring(html_list[1])

        book_name = self._first_str(info_tree.xpath("//span[@style]//h1/text()"))
        author = self._labelled_cell(
            info_tree,
            '//td[contains(text(),"作") and contains(text(),"者")]/text()',
            "作者:",
        )

        # The category doubles as the book's only tag.
        category = self._labelled_cell(
            info_tree,
            '//td[contains(text(),"类") and contains(text(),"别")]/text()',
            "类别:",
        )
        tags = [category] if category else []

        word_count = self._labelled_cell(
            info_tree, '//td[contains(text(),"全文长度")]/text()', "全文长度:"
        )
        update_time = self._labelled_cell(
            info_tree, '//td[contains(text(),"最后更新")]/text()', "最后更新:"
        )
        serial_status = self._labelled_cell(
            info_tree, '//td[contains(text(),"文章状态")]/text()', "文章状态:"
        )

        cover_url = self._first_str(info_tree.xpath('//td[@width="80%"]//img/@src'))

        # Summary: everything after the "内容简介:" marker in the first div.
        summary_divs = info_tree.xpath('//td[@width="80%"]/div')
        if summary_divs:
            raw = str(summary_divs[0].text_content())
            summary = raw.split("内容简介:")[-1].strip()
        else:
            summary = ""

        # Chapters (the site exposes a flat list, so one volume is used).
        chapters: list[ChapterInfoDict] = []
        for a in catalog_tree.xpath('//div[@class="centent"]//ul/li/a'):
            url = a.get("href", "").strip()
            if not url:
                # An anchor without a target cannot be fetched later.
                continue
            title = (a.text or "").strip()
            # Use the file name only, so "123.html" and a path-qualified or
            # absolute ".../123.html" both yield "123".
            chapter_id = url.rsplit("/", 1)[-1].split(".")[0]
            chapters.append({"title": title, "url": url, "chapterId": chapter_id})

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "summary": summary,
            "volumes": volumes,
            "tags": tags,
            "word_count": word_count,
            "serial_status": serial_status,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse chapter page and extract the content of one chapter.

        p.s. the page structure is messy:
          1. `<head>` has no matching `</head>`; likewise `</body>` has no
             matching `<body>`.
          2. Part of the HTML is written directly by JS, for example:
             `document.write("<div id=\"main\" class=\"colors1 sidebar\">");`
          3. Some div ``id`` / ``style`` attributes are wrapped in
             non-standard curly quotes, for example:
             `<div id=”device” style=”background-color...”>`,
             and that div has no matching `</div>` either.

        :param html_list: The HTML list of the chapter pages.
        :param chapter_id: Identifier of the chapter being parsed.
        :return: The chapter's data, or ``None`` when no page is supplied,
                 the leading table is missing, or no text is found.
        """
        if not html_list:
            return None

        # Drop the unclosed "device" div and put back the two wrapper divs
        # that the page would otherwise create through inline scripts.
        raw = self._RE_DEVICE_DIV.sub("", html_list[0])
        raw = raw.replace(
            '<script language="javascript">GetMode();</script>',
            '<div id="main" class="colors1 sidebar">',
        ).replace(
            '<script language="javascript">GetFont();</script>',
            '<div id="content">',
        )

        doc = html.fromstring(raw)
        container = doc.xpath('//div[@id="content"]')
        root = container[0] if container else doc

        # Title comes straight from the <h1>, minus the text of its <a> child.
        title = ""
        h1 = root.find(".//h1")
        if h1 is not None:
            full = h1.text_content().strip()
            a_txt = h1.xpath("./a/text()")
            if a_txt:
                # Remove only the first occurrence: the chapter title itself
                # may legitimately repeat the linked text.
                title = full.replace(a_txt[0].strip(), "", 1).strip()
            else:
                title = full

        # The body is the text trailing each <br> that follows the first
        # centred, bordered table.
        table = root.xpath('.//table[@align="center" and @border]')
        if not table:
            return None
        node = table[0].getnext()

        lines: list[str] = []
        while node is not None:
            # Stop at the next bordered table or any bottom-link nav div.
            if (node.tag == "table" and node.get("border")) or (
                node.tag == "div" and node.get("class", "").endswith("link")
            ):
                break

            if node.tag == "br":
                txt = (node.tail or "").replace("\xa0", " ").strip()
                if txt:
                    lines.append(txt)

            node = node.getnext()

        content = "\n".join(lines).strip()
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "piaotia"},
        }

    def _labelled_cell(self, tree: Any, xpath_expr: str, label: str) -> str:
        """
        Read one "label:value" table cell from the info page.

        The XPath result is handed to the inherited ``_first_str`` helper
        (defined outside this class) together with replacement pairs for
        the no-break space, the plain space, and the label itself.

        :param tree: Parsed info page.
        :param xpath_expr: XPath selecting the text of the labelled cell.
        :param label: Label prefix passed as a replacement, e.g. "作者:".
        :return: Whatever ``_first_str`` returns for those arguments.
        """
        return self._first_str(
            tree.xpath(xpath_expr),
            replaces=[(chr(0xA0), ""), (" ", ""), (label, "")],
        )
|
@@ -0,0 +1,136 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.qbtr
|
4
|
+
----------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
import re
|
9
|
+
from typing import Any
|
10
|
+
|
11
|
+
from lxml import html
|
12
|
+
|
13
|
+
from novel_downloader.core.parsers.base import BaseParser
|
14
|
+
from novel_downloader.core.parsers.registry import register_parser
|
15
|
+
from novel_downloader.models import (
|
16
|
+
BookInfoDict,
|
17
|
+
ChapterDict,
|
18
|
+
ChapterInfoDict,
|
19
|
+
VolumeInfoDict,
|
20
|
+
)
|
21
|
+
|
22
|
+
|
23
|
+
@register_parser(
    site_keys=["qbtr"],
)
class QbtrParser(BaseParser):
    """
    Parser for 全本同人小说 book pages.
    """

    BASE_URL = "https://www.qbtr.cc"

    # Compiled once instead of on every call / loop iteration.
    # NOTE(review): the colon class previously listed ":" twice, which looks
    # like a full-width colon lost in transit; both forms are accepted here.
    _RE_AUTHOR = re.compile(r"作者[::]\s*([^日]+)")
    _RE_DATE = re.compile(r"日期[::]\s*([\d-]+)")
    # General shape: /{category}/{bookId}/{chapterId}.html
    _RE_CHAPTER_URL = re.compile(r"^/[^/]+/\d+/(\d+)\.html$")

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse the book info page (and optional download page) into metadata.

        :param html_list: ``[info_page_html]`` or
                          ``[info_page_html, download_page_html]``.
        :return: Book metadata with a single volume holding every chapter,
                 or ``None`` when no page is supplied.
        """
        if not html_list:
            return None

        # Parse the main info page
        tree = html.fromstring(html_list[0])

        book_name = self._first_str(tree.xpath('//div[@class="infos"]/h1/text()'))

        # Tags: the second breadcrumb (e.g., "同人小说")
        tag = self._first_str(
            tree.xpath('//div[contains(@class,"menNav")]/a[2]/text()')
        )
        tags = [tag] if tag else []

        # Author & update_time from the date div; a page without that div
        # yields empty strings rather than an IndexError.
        date_div = tree.xpath('//div[@class="date"]')
        if date_div:
            date_text = html.tostring(date_div[0], encoding="unicode", method="text")
        else:
            date_text = ""
        author_match = self._RE_AUTHOR.search(date_text)
        author = author_match.group(1).strip() if author_match else ""
        date_match = self._RE_DATE.search(date_text)
        update_time = date_match.group(1) if date_match else ""

        # Summary from the <p> inside infos
        paras = tree.xpath('//div[@class="infos"]/p//text()')
        summary = "\n".join(p.strip() for p in paras if p.strip())

        # Chapters from the book_list
        chapters: list[ChapterInfoDict] = []
        for a in tree.xpath('//div[contains(@class,"book_list")]//li/a'):
            url = a.get("href", "").strip()
            title = a.text_content().strip()
            m = self._RE_CHAPTER_URL.search(url)
            cid = m.group(1) if m else ""
            chapters.append({"title": title, "url": url, "chapterId": cid})

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        # Parse the download page (second HTML)
        download_url = ""
        if len(html_list) > 1 and html_list[1]:
            dtree = html.fromstring(html_list[1])
            anchors = dtree.xpath('//a[@id="dowloadnUrl"]')
            if anchors:
                link = anchors[0].get("link") or anchors[0].get("href") or ""
                if link:
                    # Without this check an empty link would turn into the
                    # bare site root, which is not a download URL.
                    download_url = self._fix_download_link(link)

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": "",
            "update_time": update_time,
            "tags": tags,
            "summary": summary,
            "volumes": volumes,
            "extra": {"download_url": download_url},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter page and extract the content of one chapter.

        :param html_list: The HTML list of the chapter pages.
        :param chapter_id: Identifier of the chapter being parsed.
        :return: The chapter's data, or ``None`` when no page is supplied
                 or the page holds no paragraph text.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        raw_title = self._first_str(
            tree.xpath('//div[contains(@class,"read_chapterName")]//h1/text()')
        )

        # The last breadcrumb link is treated as the book name.
        crumbs = tree.xpath('//div[contains(@class,"readTop")]//a/text()')
        book_name = crumbs[-1].strip() if crumbs else ""

        # Strip the book name from the heading once only, so a chapter title
        # that itself mentions the book name is left intact.
        title = raw_title.replace(book_name, "", 1).strip()

        paragraphs = tree.xpath('//div[contains(@class,"read_chapterDetail")]/p')
        texts = [txt for p in paragraphs if (txt := p.text_content().strip())]

        content = "\n".join(texts)
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "qbtr"},
        }

    @classmethod
    def _fix_download_link(cls, link: str) -> str:
        """
        Turn the raw link attribute of the download anchor into a full URL.

        :param link: Raw link; a ``qb../`` segment is rewritten to
                     ``/e/DownSys/``.
        :return: ``BASE_URL`` followed by the rewritten link.
        """
        true_link = link.replace("qb../", "/e/DownSys/")
        return f"{cls.BASE_URL}{true_link}"
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
2
|
"""
|
3
|
-
novel_downloader.core.parsers.qianbi
|
4
|
-
|
3
|
+
novel_downloader.core.parsers.qianbi
|
4
|
+
------------------------------------
|
5
5
|
|
6
6
|
"""
|
7
7
|
|
@@ -11,59 +11,61 @@ from typing import Any
|
|
11
11
|
from lxml import html
|
12
12
|
|
13
13
|
from novel_downloader.core.parsers.base import BaseParser
|
14
|
-
from novel_downloader.
|
14
|
+
from novel_downloader.core.parsers.registry import register_parser
|
15
|
+
from novel_downloader.models import (
|
16
|
+
BookInfoDict,
|
17
|
+
ChapterDict,
|
18
|
+
VolumeInfoDict,
|
19
|
+
)
|
15
20
|
|
16
21
|
|
22
|
+
@register_parser(
|
23
|
+
site_keys=["qianbi"],
|
24
|
+
)
|
17
25
|
class QianbiParser(BaseParser):
|
18
|
-
"""
|
26
|
+
"""
|
27
|
+
Parser for 铅笔小说 book pages.
|
28
|
+
"""
|
19
29
|
|
20
30
|
def parse_book_info(
|
21
31
|
self,
|
22
32
|
html_list: list[str],
|
23
33
|
**kwargs: Any,
|
24
|
-
) ->
|
25
|
-
"""
|
26
|
-
Parse a book info page and extract metadata and chapter structure.
|
27
|
-
|
28
|
-
:param html_list: Raw HTML of the book info pages.
|
29
|
-
:return: Parsed metadata and chapter structure as a dictionary.
|
30
|
-
"""
|
34
|
+
) -> BookInfoDict | None:
|
31
35
|
if len(html_list) < 2:
|
32
|
-
return
|
36
|
+
return None
|
33
37
|
|
34
38
|
info_tree = html.fromstring(html_list[0])
|
35
39
|
catalog_tree = html.fromstring(html_list[1])
|
36
|
-
result: dict[str, Any] = {}
|
37
|
-
|
38
|
-
title = info_tree.xpath('//h1[@class="page-title"]/text()')
|
39
|
-
result["book_name"] = title[0].strip() if title else ""
|
40
|
-
|
41
|
-
author = info_tree.xpath('//a[contains(@href,"/author/")]/@title')
|
42
|
-
result["author"] = author[0].strip() if author else ""
|
43
40
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
41
|
+
book_name = self._first_str(info_tree.xpath('//h1[@class="page-title"]/text()'))
|
42
|
+
author = self._first_str(
|
43
|
+
info_tree.xpath('//a[contains(@href,"/author/")]/@title')
|
44
|
+
)
|
45
|
+
cover_url = self._first_str(
|
46
|
+
info_tree.xpath('//div[@class="novel-cover"]//img/@data-src')
|
47
|
+
)
|
48
|
+
serial_status = self._first_str(
|
49
|
+
info_tree.xpath(
|
50
|
+
'//a[@class="tag-link" and (text()="完结" or text()="连载")]/text()'
|
51
|
+
)
|
52
|
+
)
|
53
|
+
word_count = self._first_str(
|
54
|
+
info_tree.xpath('//span[contains(text(), "字")]/text()')
|
49
55
|
)
|
50
|
-
result["serial_status"] = status[0] if status else ""
|
51
|
-
|
52
|
-
word_count_raw = info_tree.xpath('//span[contains(text(), "万字")]/text()')
|
53
|
-
result["word_count"] = word_count_raw[0].strip() if word_count_raw else ""
|
54
56
|
|
55
57
|
summary_node = info_tree.xpath(
|
56
58
|
'//div[@class="novel-info-item novel-info-content"]/span'
|
57
59
|
)
|
58
60
|
if summary_node and summary_node[0] is not None:
|
59
|
-
|
61
|
+
summary = str(summary_node[0].text_content()).strip()
|
60
62
|
else:
|
61
|
-
|
63
|
+
summary = ""
|
62
64
|
|
63
|
-
|
65
|
+
update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
64
66
|
|
65
|
-
volumes: list[
|
66
|
-
current_volume = None
|
67
|
+
volumes: list[VolumeInfoDict] = []
|
68
|
+
current_volume: VolumeInfoDict | None = None
|
67
69
|
|
68
70
|
for elem in catalog_tree.xpath('//div[@class="box"]/*'):
|
69
71
|
class_attr = elem.get("class", "")
|
@@ -99,9 +101,17 @@ class QianbiParser(BaseParser):
|
|
99
101
|
if current_volume:
|
100
102
|
volumes.append(current_volume)
|
101
103
|
|
102
|
-
|
103
|
-
|
104
|
-
|
104
|
+
return {
|
105
|
+
"book_name": book_name,
|
106
|
+
"author": author,
|
107
|
+
"cover_url": cover_url,
|
108
|
+
"update_time": update_time,
|
109
|
+
"word_count": word_count,
|
110
|
+
"serial_status": serial_status,
|
111
|
+
"summary": summary,
|
112
|
+
"volumes": volumes,
|
113
|
+
"extra": {},
|
114
|
+
}
|
105
115
|
|
106
116
|
def parse_chapter(
|
107
117
|
self,
|
@@ -109,31 +119,24 @@ class QianbiParser(BaseParser):
|
|
109
119
|
chapter_id: str,
|
110
120
|
**kwargs: Any,
|
111
121
|
) -> ChapterDict | None:
|
112
|
-
"""
|
113
|
-
Parse a single chapter page and extract clean text or simplified HTML.
|
114
|
-
|
115
|
-
:param html_list: Raw HTML of the chapter page.
|
116
|
-
:param chapter_id: Identifier of the chapter being parsed.
|
117
|
-
:return: Cleaned chapter content as plain text or minimal HTML.
|
118
|
-
"""
|
119
122
|
if not html_list:
|
120
123
|
return None
|
121
124
|
tree = html.fromstring(html_list[0])
|
122
125
|
|
126
|
+
# Content paragraphs
|
123
127
|
paras = tree.xpath('//div[@class="article-content"]/p/text()')
|
124
|
-
content_text = "\n
|
128
|
+
content_text = "\n".join(p.strip() for p in paras if p.strip())
|
125
129
|
if not content_text:
|
126
130
|
return None
|
127
131
|
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
volume = tree.xpath('//h3[@class="text-muted"]/text()')
|
132
|
-
volume_text = volume[0].strip() if volume else ""
|
132
|
+
title_text = self._first_str(tree.xpath('//h1[@class="article-title"]/text()'))
|
133
|
+
volume_text = self._first_str(tree.xpath('//h3[@class="text-muted"]/text()'))
|
133
134
|
|
134
|
-
next_href =
|
135
|
+
next_href = self._first_str(
|
136
|
+
tree.xpath('//div[@class="footer"]/a[@class="f-right"]/@href')
|
137
|
+
)
|
135
138
|
next_chapter_id = (
|
136
|
-
next_href
|
139
|
+
next_href.split("/")[-1].replace(".html", "") if next_href else ""
|
137
140
|
)
|
138
141
|
|
139
142
|
return {
|
@@ -12,10 +12,11 @@ time, status, word count, summary, and volume-chapter structure.
|
|
12
12
|
import logging
|
13
13
|
import re
|
14
14
|
from datetime import datetime
|
15
|
-
from typing import Any
|
16
15
|
|
17
16
|
from lxml import html
|
18
17
|
|
18
|
+
from novel_downloader.models import BookInfoDict, ChapterInfoDict, VolumeInfoDict
|
19
|
+
|
19
20
|
logger = logging.getLogger(__name__)
|
20
21
|
|
21
22
|
|
@@ -23,7 +24,7 @@ def _chapter_url_to_id(url: str) -> str:
|
|
23
24
|
return url.rstrip("/").split("/")[-1]
|
24
25
|
|
25
26
|
|
26
|
-
def parse_book_info(html_str: str) ->
|
27
|
+
def parse_book_info(html_str: str) -> BookInfoDict | None:
    """
    Extract metadata: title, author, cover_url, update_time, status,
    word_count, summary, and volumes with chapters.

    :param html_str: Raw HTML of the book info page.
    :return: A dict containing book metadata, or ``None`` when
             ``html_str`` is empty.
    """
    if not html_str or not html_str.strip():
        # lxml refuses to parse an empty document; report "no data" instead.
        return None

    doc = html.fromstring(html_str)

    book_name = doc.xpath('string(//h1[@id="bookName"])').strip()

    author = doc.xpath('string(//a[@class="writer-name"])').strip()

    # The cover URL is derived from the book id; without the id there is
    # nothing to build, so the field is left empty instead of raising.
    book_ids = doc.xpath('//a[@id="bookImg"]/@data-bid')
    if book_ids:
        cover_url = (
            f"https://bookcover.yuewen.com/qdbimg/349573/{book_ids[0]}/600.webp"
        )
    else:
        cover_url = ""

    # Keep the page's timestamp only when it is a full
    # "YYYY-MM-DD HH:MM:SS" value; otherwise fall back to the current time.
    ut = doc.xpath('string(//span[@class="update-time"])')
    ut = ut.replace("更新时间:", "").strip()
    if re.match(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$", ut):
        update_time = ut
    else:
        update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    serial_status = doc.xpath('string(//p[@class="book-attribute"]/span[1])').strip()

    tags_elem = doc.xpath('//p[contains(@class,"all-label")]//a/text()')
    tags = [t.strip() for t in tags_elem if t.strip()]

    word_count = doc.xpath('string(//p[@class="count"]/em[1])').strip()

    summary_brief = doc.xpath('string(//p[@class="intro"])').strip()

    raw = doc.xpath('//p[@id="book-intro-detail"]//text()')
    summary = "\n".join(line.strip() for line in raw if line.strip())

    volumes: list[VolumeInfoDict] = []
    for vol in doc.xpath('//div[@id="allCatalog"]//div[@class="catalog-volume"]'):
        vol_name = vol.xpath('string(.//h3[@class="volume-name"])').strip()
        # Keep only the part before the first middle dot (U+00B7).
        vol_name = vol_name.split(chr(183))[0].strip()
        chapters: list[ChapterInfoDict] = []
        for li in vol.xpath('.//ul[contains(@class,"volume-chapters")]/li'):
            anchors = li.xpath('.//a[@class="chapter-name"]')
            if not anchors:
                # A list item without a chapter link carries no chapter.
                continue
            a = anchors[0]
            url = a.get("href") or ""
            if not url:
                logger.debug("Skipping chapter link without href in %r", vol_name)
                continue
            title = (a.text or "").strip()
            chapters.append(
                {"title": title, "url": url, "chapterId": _chapter_url_to_id(url)}
            )
        volumes.append({"volume_name": vol_name, "chapters": chapters})

    return {
        "book_name": book_name,
        "author": author,
        "cover_url": cover_url,
        "update_time": update_time,
        "word_count": word_count,
        "serial_status": serial_status,
        "tags": tags,
        "summary_brief": summary_brief,
        "summary": summary,
        "volumes": volumes,
        "extra": {},
    }
|