novel-downloader 1.5.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/__init__.py +1 -3
- novel_downloader/cli/clean.py +21 -88
- novel_downloader/cli/config.py +26 -21
- novel_downloader/cli/download.py +77 -64
- novel_downloader/cli/export.py +16 -20
- novel_downloader/cli/main.py +1 -1
- novel_downloader/cli/search.py +62 -65
- novel_downloader/cli/ui.py +156 -0
- novel_downloader/config/__init__.py +8 -5
- novel_downloader/config/adapter.py +65 -105
- novel_downloader/config/{loader.py → file_io.py} +53 -26
- novel_downloader/core/__init__.py +1 -0
- novel_downloader/core/archived/deqixs/fetcher.py +115 -0
- novel_downloader/core/archived/deqixs/parser.py +132 -0
- novel_downloader/core/archived/deqixs/searcher.py +89 -0
- novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
- novel_downloader/core/archived/wanbengo/searcher.py +98 -0
- novel_downloader/core/archived/xshbook/searcher.py +93 -0
- novel_downloader/core/downloaders/__init__.py +3 -24
- novel_downloader/core/downloaders/base.py +49 -23
- novel_downloader/core/downloaders/common.py +191 -137
- novel_downloader/core/downloaders/qianbi.py +187 -146
- novel_downloader/core/downloaders/qidian.py +187 -141
- novel_downloader/core/downloaders/registry.py +4 -2
- novel_downloader/core/downloaders/signals.py +46 -0
- novel_downloader/core/exporters/__init__.py +3 -20
- novel_downloader/core/exporters/base.py +33 -37
- novel_downloader/core/exporters/common/__init__.py +1 -2
- novel_downloader/core/exporters/common/epub.py +15 -10
- novel_downloader/core/exporters/common/main_exporter.py +19 -12
- novel_downloader/core/exporters/common/txt.py +14 -9
- novel_downloader/core/exporters/epub_util.py +59 -29
- novel_downloader/core/exporters/linovelib/__init__.py +1 -0
- novel_downloader/core/exporters/linovelib/epub.py +23 -25
- novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
- novel_downloader/core/exporters/linovelib/txt.py +17 -11
- novel_downloader/core/exporters/qidian.py +2 -8
- novel_downloader/core/exporters/registry.py +4 -2
- novel_downloader/core/exporters/txt_util.py +7 -7
- novel_downloader/core/fetchers/__init__.py +54 -48
- novel_downloader/core/fetchers/aaatxt.py +83 -0
- novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
- novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
- novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
- novel_downloader/core/fetchers/dxmwx.py +110 -0
- novel_downloader/core/fetchers/eightnovel.py +139 -0
- novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
- novel_downloader/core/fetchers/guidaye.py +85 -0
- novel_downloader/core/fetchers/hetushu.py +92 -0
- novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
- novel_downloader/core/fetchers/ixdzs8.py +113 -0
- novel_downloader/core/fetchers/jpxs123.py +101 -0
- novel_downloader/core/fetchers/lewenn.py +83 -0
- novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
- novel_downloader/core/fetchers/piaotia.py +105 -0
- novel_downloader/core/fetchers/qbtr.py +101 -0
- novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
- novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +46 -39
- novel_downloader/core/fetchers/quanben5.py +92 -0
- novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
- novel_downloader/core/fetchers/registry.py +5 -16
- novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
- novel_downloader/core/fetchers/shencou.py +106 -0
- novel_downloader/core/fetchers/shuhaige.py +84 -0
- novel_downloader/core/fetchers/tongrenquan.py +84 -0
- novel_downloader/core/fetchers/ttkan.py +95 -0
- novel_downloader/core/fetchers/wanbengo.py +83 -0
- novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
- novel_downloader/core/fetchers/xiguashuwu.py +177 -0
- novel_downloader/core/fetchers/xs63b.py +171 -0
- novel_downloader/core/fetchers/xshbook.py +85 -0
- novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
- novel_downloader/core/fetchers/yibige.py +114 -0
- novel_downloader/core/interfaces/__init__.py +1 -9
- novel_downloader/core/interfaces/downloader.py +6 -2
- novel_downloader/core/interfaces/exporter.py +7 -7
- novel_downloader/core/interfaces/fetcher.py +4 -17
- novel_downloader/core/interfaces/parser.py +5 -6
- novel_downloader/core/interfaces/searcher.py +9 -1
- novel_downloader/core/parsers/__init__.py +49 -12
- novel_downloader/core/parsers/aaatxt.py +132 -0
- novel_downloader/core/parsers/b520.py +116 -0
- novel_downloader/core/parsers/base.py +63 -12
- novel_downloader/core/parsers/biquyuedu.py +133 -0
- novel_downloader/core/parsers/dxmwx.py +162 -0
- novel_downloader/core/parsers/eightnovel.py +224 -0
- novel_downloader/core/parsers/esjzone.py +61 -66
- novel_downloader/core/parsers/guidaye.py +128 -0
- novel_downloader/core/parsers/hetushu.py +139 -0
- novel_downloader/core/parsers/i25zw.py +137 -0
- novel_downloader/core/parsers/ixdzs8.py +186 -0
- novel_downloader/core/parsers/jpxs123.py +137 -0
- novel_downloader/core/parsers/lewenn.py +142 -0
- novel_downloader/core/parsers/linovelib.py +48 -64
- novel_downloader/core/parsers/piaotia.py +189 -0
- novel_downloader/core/parsers/qbtr.py +136 -0
- novel_downloader/core/parsers/qianbi.py +48 -50
- novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +272 -330
- novel_downloader/core/parsers/qidian/chapter_normal.py +24 -55
- novel_downloader/core/parsers/qidian/main_parser.py +11 -38
- novel_downloader/core/parsers/qidian/utils/__init__.py +1 -0
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
- novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
- novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
- novel_downloader/core/parsers/quanben5.py +103 -0
- novel_downloader/core/parsers/registry.py +5 -16
- novel_downloader/core/parsers/sfacg.py +38 -45
- novel_downloader/core/parsers/shencou.py +215 -0
- novel_downloader/core/parsers/shuhaige.py +111 -0
- novel_downloader/core/parsers/tongrenquan.py +116 -0
- novel_downloader/core/parsers/ttkan.py +132 -0
- novel_downloader/core/parsers/wanbengo.py +191 -0
- novel_downloader/core/parsers/xiaoshuowu.py +173 -0
- novel_downloader/core/parsers/xiguashuwu.py +435 -0
- novel_downloader/core/parsers/xs63b.py +161 -0
- novel_downloader/core/parsers/xshbook.py +134 -0
- novel_downloader/core/parsers/yamibo.py +87 -131
- novel_downloader/core/parsers/yibige.py +166 -0
- novel_downloader/core/searchers/__init__.py +34 -3
- novel_downloader/core/searchers/aaatxt.py +107 -0
- novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
- novel_downloader/core/searchers/base.py +112 -36
- novel_downloader/core/searchers/dxmwx.py +105 -0
- novel_downloader/core/searchers/eightnovel.py +84 -0
- novel_downloader/core/searchers/esjzone.py +43 -25
- novel_downloader/core/searchers/hetushu.py +92 -0
- novel_downloader/core/searchers/i25zw.py +93 -0
- novel_downloader/core/searchers/ixdzs8.py +107 -0
- novel_downloader/core/searchers/jpxs123.py +107 -0
- novel_downloader/core/searchers/piaotia.py +100 -0
- novel_downloader/core/searchers/qbtr.py +106 -0
- novel_downloader/core/searchers/qianbi.py +74 -40
- novel_downloader/core/searchers/quanben5.py +144 -0
- novel_downloader/core/searchers/registry.py +24 -8
- novel_downloader/core/searchers/shuhaige.py +124 -0
- novel_downloader/core/searchers/tongrenquan.py +110 -0
- novel_downloader/core/searchers/ttkan.py +92 -0
- novel_downloader/core/searchers/xiaoshuowu.py +122 -0
- novel_downloader/core/searchers/xiguashuwu.py +95 -0
- novel_downloader/core/searchers/xs63b.py +104 -0
- novel_downloader/locales/en.json +31 -82
- novel_downloader/locales/zh.json +32 -83
- novel_downloader/models/__init__.py +21 -22
- novel_downloader/models/book.py +44 -0
- novel_downloader/models/config.py +4 -37
- novel_downloader/models/login.py +1 -1
- novel_downloader/models/search.py +5 -0
- novel_downloader/resources/config/settings.toml +8 -70
- novel_downloader/resources/json/xiguashuwu.json +718 -0
- novel_downloader/utils/__init__.py +13 -22
- novel_downloader/utils/chapter_storage.py +3 -2
- novel_downloader/utils/constants.py +4 -29
- novel_downloader/utils/cookies.py +6 -18
- novel_downloader/utils/crypto_utils/__init__.py +13 -0
- novel_downloader/utils/crypto_utils/aes_util.py +90 -0
- novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
- novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
- novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
- novel_downloader/utils/epub/__init__.py +1 -1
- novel_downloader/utils/epub/constants.py +57 -16
- novel_downloader/utils/epub/documents.py +88 -194
- novel_downloader/utils/epub/models.py +0 -14
- novel_downloader/utils/epub/utils.py +63 -96
- novel_downloader/utils/file_utils/__init__.py +2 -23
- novel_downloader/utils/file_utils/io.py +3 -113
- novel_downloader/utils/file_utils/sanitize.py +0 -4
- novel_downloader/utils/fontocr.py +207 -0
- novel_downloader/utils/logger.py +8 -16
- novel_downloader/utils/network.py +2 -2
- novel_downloader/utils/state.py +4 -90
- novel_downloader/utils/text_utils/__init__.py +1 -7
- novel_downloader/utils/text_utils/diff_display.py +5 -7
- novel_downloader/utils/time_utils/__init__.py +5 -11
- novel_downloader/utils/time_utils/datetime_utils.py +20 -29
- novel_downloader/utils/time_utils/sleep_utils.py +4 -8
- novel_downloader/web/__init__.py +13 -0
- novel_downloader/web/components/__init__.py +11 -0
- novel_downloader/web/components/navigation.py +35 -0
- novel_downloader/web/main.py +66 -0
- novel_downloader/web/pages/__init__.py +17 -0
- novel_downloader/web/pages/download.py +78 -0
- novel_downloader/web/pages/progress.py +147 -0
- novel_downloader/web/pages/search.py +329 -0
- novel_downloader/web/services/__init__.py +17 -0
- novel_downloader/web/services/client_dialog.py +164 -0
- novel_downloader/web/services/cred_broker.py +113 -0
- novel_downloader/web/services/cred_models.py +35 -0
- novel_downloader/web/services/task_manager.py +264 -0
- novel_downloader-2.0.0.dist-info/METADATA +171 -0
- novel_downloader-2.0.0.dist-info/RECORD +210 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
- novel_downloader/core/downloaders/biquge.py +0 -29
- novel_downloader/core/downloaders/esjzone.py +0 -29
- novel_downloader/core/downloaders/linovelib.py +0 -29
- novel_downloader/core/downloaders/sfacg.py +0 -29
- novel_downloader/core/downloaders/yamibo.py +0 -29
- novel_downloader/core/exporters/biquge.py +0 -22
- novel_downloader/core/exporters/esjzone.py +0 -22
- novel_downloader/core/exporters/qianbi.py +0 -22
- novel_downloader/core/exporters/sfacg.py +0 -22
- novel_downloader/core/exporters/yamibo.py +0 -22
- novel_downloader/core/fetchers/base/__init__.py +0 -14
- novel_downloader/core/fetchers/base/browser.py +0 -422
- novel_downloader/core/fetchers/biquge/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/browser.py +0 -209
- novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
- novel_downloader/core/fetchers/linovelib/browser.py +0 -198
- novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/browser.py +0 -326
- novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
- novel_downloader/core/fetchers/sfacg/browser.py +0 -194
- novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
- novel_downloader/core/fetchers/yamibo/browser.py +0 -234
- novel_downloader/core/parsers/biquge.py +0 -139
- novel_downloader/models/chapter.py +0 -25
- novel_downloader/models/types.py +0 -13
- novel_downloader/tui/__init__.py +0 -7
- novel_downloader/tui/app.py +0 -32
- novel_downloader/tui/main.py +0 -17
- novel_downloader/tui/screens/__init__.py +0 -14
- novel_downloader/tui/screens/home.py +0 -198
- novel_downloader/tui/screens/login.py +0 -74
- novel_downloader/tui/styles/home_layout.tcss +0 -79
- novel_downloader/tui/widgets/richlog_handler.py +0 -24
- novel_downloader/utils/cache.py +0 -24
- novel_downloader/utils/fontocr/__init__.py +0 -22
- novel_downloader/utils/fontocr/hash_store.py +0 -280
- novel_downloader/utils/fontocr/hash_utils.py +0 -103
- novel_downloader/utils/fontocr/model_loader.py +0 -69
- novel_downloader/utils/fontocr/ocr_v1.py +0 -315
- novel_downloader/utils/fontocr/ocr_v2.py +0 -764
- novel_downloader/utils/fontocr/ocr_v3.py +0 -744
- novel_downloader-1.5.0.dist-info/METADATA +0 -196
- novel_downloader-1.5.0.dist-info/RECORD +0 -164
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,435 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.xiguashuwu
|
4
|
+
----------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
import base64
|
9
|
+
import hashlib
|
10
|
+
import json
|
11
|
+
import logging
|
12
|
+
import re
|
13
|
+
import urllib.parse
|
14
|
+
from typing import Any
|
15
|
+
|
16
|
+
import requests
|
17
|
+
from lxml import html
|
18
|
+
|
19
|
+
from novel_downloader.core.parsers.base import BaseParser
|
20
|
+
from novel_downloader.core.parsers.registry import register_parser
|
21
|
+
from novel_downloader.models import (
|
22
|
+
BookInfoDict,
|
23
|
+
ChapterDict,
|
24
|
+
ChapterInfoDict,
|
25
|
+
VolumeInfoDict,
|
26
|
+
)
|
27
|
+
from novel_downloader.utils.constants import (
|
28
|
+
DEFAULT_USER_HEADERS,
|
29
|
+
XIGUASHUWU_FONT_MAP_PATH,
|
30
|
+
)
|
31
|
+
from novel_downloader.utils.crypto_utils.aes_util import aes_cbc_decrypt
|
32
|
+
|
33
|
+
logger = logging.getLogger(__name__)
|
34
|
+
|
35
|
+
|
36
|
+
@register_parser(
    site_keys=["xiguashuwu"],
)
class XiguashuwuParser(BaseParser):
    """
    Parser for 西瓜书屋 book pages.

    The site serves a chapter across several pages with increasing
    obfuscation:

    * page 1: plain HTML text,
    * page 2: paragraph order shuffled client-side and some characters
      replaced by single-glyph images,
    * page 3+: AES-CBC-encrypted HTML payload (see ``_decrypt_d``).
    """

    # Site root; used to absolutize relative hrefs and image sources.
    BASE_URL = "https://www.xiguashuwu.com"
    # Minimum OCR confidence required to accept a recognized glyph.
    _CONF_THRESHOLD = 0.60
    # Bundled JSON mapping of glyph-image filename -> character.
    _FONT_MAP: dict[str, str] = json.loads(
        XIGUASHUWU_FONT_MAP_PATH.read_text(encoding="utf-8")
    )
    # Class-wide cache of OCR results, keyed by full image URL.
    _GLYPH_CACHE: dict[str, str] = {}

    # Matches `var codeurl="7";` in inline JS (page-2 reordering modulus).
    _CODEURL_PATTERN = re.compile(
        r"""var\s+codeurl\s*=\s*['"]?(\d+)['"]?;?""", re.IGNORECASE
    )

    # Matches `var nrid="FGQSWYBCK";` (id of the page-2 content container).
    _NRID_PATTERN = re.compile(
        r"""var\s+nrid\s*=\s*['"]?([A-Za-z0-9]+)['"]?;?""", re.IGNORECASE
    )

    # Matches `let newcon=decodeURIComponent("...");` (page-3+ ciphertext).
    _NEWCON_PATTERN = re.compile(
        r"""let\s+newcon\s*=\s*decodeURIComponent\(\s*['"](.+?)['"]\s*\);?""",
        re.IGNORECASE,
    )

    # Matches the 32-hex-char key in `d(newcon, "...");`.
    _D_CALL_PATTERN = re.compile(
        r"""d\(\s*[^,]+,\s*['"]([0-9A-Fa-f]{32})['"]\s*\);?""", re.IGNORECASE
    )

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse a book info page and extract metadata and chapter structure.

        :param html_list: Raw HTML of the book info page
            (``html_list[0]``) followed by any number of catalog pages.
        :return: Parsed metadata and chapter structure as a dictionary,
            or ``None`` when ``html_list`` is empty.
        """
        if not html_list:
            return None
        info_tree = html.fromstring(html_list[0])

        book_name = self._first_str(info_tree.xpath('//p[@class="title"]/text()'))

        author = self._first_str(info_tree.xpath('//p[@class="author"]//a/text()'))

        # Cover images are lazy-loaded: prefer the `_src` attribute, fall
        # back to the plain `src`.
        cover_rel = info_tree.xpath(
            '//div[@class="BGsectionOne-top-left"]//img/@_src'
        ) or info_tree.xpath('//div[@class="BGsectionOne-top-left"]//img/@src')
        cover_url = self.BASE_URL + self._first_str(cover_rel)

        tags = [
            self._first_str(info_tree.xpath('//p[@class="category"]/span[1]/a/text()'))
        ]

        update_time = self._first_str(info_tree.xpath('//p[@class="time"]/span/text()'))

        paras = info_tree.xpath('//section[@id="intro"]//p')
        summary = "\n".join(p.xpath("string()").strip() for p in paras).strip()

        # Every remaining entry of html_list is one catalog page; collect
        # the chapter links listed under the "正文" section heading.
        chapters: list[ChapterInfoDict] = []
        for catalog_html in html_list[1:]:
            cat_tree = html.fromstring(catalog_html)
            links = cat_tree.xpath(
                '//section[contains(@class,"BCsectionTwo")]'
                '[.//h3[text()="正文"]]//ol//li/a'
            )
            for a in links:
                title = a.xpath("string()").strip()
                href = a.get("href", "").strip()
                # chapterId is filename sans extension
                chapter_id = href.rsplit("/", 1)[-1].split(".", 1)[0]
                chapters.append(
                    ChapterInfoDict(
                        title=title,
                        url=self.BASE_URL + href,
                        chapterId=chapter_id,
                    )
                )

        # The site exposes no volume structure; wrap all chapters in a
        # single synthetic "正文" volume.
        volumes: list[VolumeInfoDict] = [
            VolumeInfoDict(volume_name="正文", chapters=chapters)
        ]

        return BookInfoDict(
            book_name=book_name,
            author=author,
            cover_url=cover_url,
            update_time=update_time,
            tags=tags,
            summary=summary,
            volumes=volumes,
            extra={},
        )

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse chapter pages and extract clean text or simplified HTML.

        Page 1 is plain text, page 2 is JS-shuffled, and pages 3+ are
        AES-encrypted; each is routed to its dedicated helper.

        :param html_list: Raw HTML of the chapter pages, in reading order.
        :param chapter_id: Identifier of the chapter being parsed.
        :return: Cleaned chapter content as plain text or minimal HTML,
            or ``None`` when nothing could be extracted.
        """
        if not html_list:
            return None

        title_text = ""
        paragraphs: list[str] = []

        for page_idx, html_str in enumerate(html_list, start=1):
            if page_idx == 1:
                tree = html.fromstring(html_str)
                # The title only appears on the first page.
                title_text = self._extract_chapter_title(tree)
                paragraphs.extend(self._parse_chapter_page1(tree))
            elif page_idx == 2:
                paragraphs.extend(self._parse_chapter_page2(html_str))
            else:
                paragraphs.extend(self._parse_chapter_page3plus(html_str))

        content = "\n".join(paragraphs).strip()
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title_text,
            "content": content,
            "extra": {"site": "xiguashuwu"},
        }

    @classmethod
    def _parse_chapter_page1(cls, tree: html.HtmlElement) -> list[str]:
        """
        Parse page 1 of the chapter: plain text, no encryption or obfuscation.

        This method extracts all visible text from the element with id="C0NTENT",
        removes known ad sections

        :param tree: Parsed HTML element tree of the chapter page.
        :return: List of text lines in reading order (empty on failure).
        """
        try:
            # note: 'C0NTENT' contains a zero, not the letter 'O'
            content_div = tree.xpath('//*[@id="C0NTENT"]')
            if not content_div:
                return []
            content_div = content_div[0]

            # Remove advertisement or irrelevant sections
            for ad in content_div.xpath('.//div[@class="s_m"]'):
                ad.getparent().remove(ad)

            lines = content_div.xpath(".//text()")
            return [line.strip() for line in lines if line.strip()]
        except Exception as e:
            logger.warning("Failed to parse chapter page 1: %s", e)
            return []

    def _parse_chapter_page2(self, html_str: str) -> list[str]:
        """
        Parse page 2 of the chapter: content order shuffled by JavaScript,
        and text replaced with images.

        The shuffle metadata lives in the `client` meta tag plus the
        `codeurl` / `nrid` inline JS variables.

        :param html_str: Raw HTML string of the chapter page.
        :return: List of text lines extracted in correct reading order
            (empty on failure).
        """
        try:
            tree = html.fromstring(html_str)
            # Extract ordering metadata
            order_raw = self._parse_client_meta(tree)
            codeurl = self._parse_codeurl(html_str)
            nrid = self._parse_nrid(html_str)
            order_list = self._restore_order(order_raw, codeurl)

            # Extract paragraphs in raw order
            content_divs = tree.xpath(f'//*[@id="{nrid}"]')
            if not content_divs:
                return []
            paragraphs = self._rebuild_paragraphs(content_divs[0])

            # Reorder paragraphs
            reordered: list[str] = []
            for idx in order_list:
                # Silently skip out-of-range indices from malformed metadata.
                if 0 <= idx < len(paragraphs):
                    reordered.append(paragraphs[idx])
            return reordered
        except Exception as e:
            logger.warning("Failed to parse chapter page 2: %s", e)
            return []

    def _parse_chapter_page3plus(self, html_str: str) -> list[str]:
        """
        Parse pages 3 and beyond of the chapter: AES-encrypted text
        replaced with images.

        The ciphertext and key are scraped from the page's inline
        `newcon` / `d(...)` JavaScript and decrypted via `_decrypt_d`.

        :param html_str: Raw HTML string of the chapter page.
        :return: List of decrypted text lines in reading order
            (empty on failure).
        """
        try:
            newcon = self._parse_newcon(html_str)
            d_key = self._parse_d_key(html_str)
            full_html = self._decrypt_d(newcon, d_key)
            tree = html.fromstring(full_html)
            paragraphs = self._rebuild_paragraphs(tree)
            return paragraphs
        except Exception as e:
            logger.warning("Failed to parse chapter page 3+: %s", e)
            return []

    @classmethod
    def _extract_chapter_title(cls, tree: html.HtmlElement) -> str:
        """
        Extract the chapter title from the HTML tree.

        The title is expected to be located inside:
            <h1 id="chapterTitle">...</h1>

        :param tree: Parsed HTML element tree of the chapter page.
        :return: Chapter title as a string, or an empty string if not found.
        """
        return cls._first_str(tree.xpath('//h1[@id="chapterTitle"]/text()'))

    def _char_from_img(self, url: str) -> str:
        """
        Given an <img> src URL, return the mapped character if this image
        represents a single glyph.

        Resolution order: bundled font map (by filename), then the OCR
        cache, then live OCR. Falls back to the literal ``<img>`` tag so
        no content is lost when the glyph cannot be decoded.
        """
        fname = url.split("/")[-1].split("?", 1)[0]
        char = self._FONT_MAP.get(fname)
        if char:
            return char
        if url in self._GLYPH_CACHE:
            return self._GLYPH_CACHE[url]
        # NOTE(review): _decode_font is presumably a BaseParser-provided
        # config flag enabling OCR of unknown glyphs - confirm in base class.
        if self._decode_font:
            char = self._recognize_glyph_from_url(url)
            if char:
                self._GLYPH_CACHE[url] = char
                return char
        return f'<img src="{url}" />'

    @classmethod
    def _recognize_glyph_from_url(cls, url: str) -> str | None:
        """
        Download the glyph image at `url` and run the font OCR on it.

        :param url: Fully-qualified <img src="..."> URL to a single-glyph image.
        :return: The recognized character (top-1) if OCR succeeds, otherwise None.
        """
        try:
            # Imports are deferred so the optional OCR stack (numpy, PIL,
            # fontocr) is only required when glyph decoding is actually used.
            import io

            import numpy as np
            from PIL import Image

            from novel_downloader.utils.fontocr import get_font_ocr

            resp = requests.get(url, headers=DEFAULT_USER_HEADERS, timeout=15)
            resp.raise_for_status()

            im = Image.open(io.BytesIO(resp.content)).convert("RGB")
            img_np = np.asarray(im)

            ocr = get_font_ocr(batch_size=1)
            char, score = ocr.predict([img_np], top_k=1)[0][0]

            # Reject low-confidence predictions instead of guessing.
            return char if score >= cls._CONF_THRESHOLD else None

        except ImportError:
            logger.warning("[Parser] FontOCR not available, font decoding will skip")
        except Exception as e:
            logger.warning("[Parser] Failed to ocr glyph image %s: %s", url, e)
        return None

    @classmethod
    def _parse_codeurl(cls, text: str) -> int:
        """
        Extract the integer from `var codeurl="7";`.

        Raises ValueError if not found.
        """
        m = cls._CODEURL_PATTERN.search(text)
        if not m:
            raise ValueError("codeurl not found")
        return int(m.group(1))

    @classmethod
    def _parse_nrid(cls, text: str) -> str:
        """
        Extract the string from `var nrid="FGQSWYBCK";`.

        Raises ValueError if not found.
        """
        m = cls._NRID_PATTERN.search(text)
        if not m:
            raise ValueError("nrid not found")
        return m.group(1)

    @classmethod
    def _parse_newcon(cls, text: str) -> str:
        """
        Extract and decode the percent-encoded argument of
        `let newcon=decodeURIComponent("...");`.

        Raises ValueError if not found.
        """
        m = cls._NEWCON_PATTERN.search(text)
        if not m:
            raise ValueError("newcon not found")
        return urllib.parse.unquote(m.group(1))

    @classmethod
    def _parse_d_key(cls, text: str) -> str:
        """
        Extract the second argument (the hex key) from `d(newcon, "...");`.

        Raises ValueError if not found.
        """
        m = cls._D_CALL_PATTERN.search(text)
        if not m:
            raise ValueError("d() call with key not found")
        return m.group(1)

    @classmethod
    def _parse_client_meta(cls, tree: html.HtmlElement) -> str:
        """
        Given an lxml.html tree, return the `content` of
        <meta name="client" content="..."/> in <head>.

        Raises ValueError if missing.
        """
        vals = tree.xpath("//head/meta[@name='client']/@content")
        if not vals:
            raise ValueError("client meta not found")
        return str(vals[0])

    @staticmethod
    def _restore_order(raw_b64: str, code: int) -> list[int]:
        """
        Recover the paragraph display order from the base64 `client` meta.

        The decoded payload is a sequence of numeric fragments separated
        by uppercase-letter runs ending in '%'. Fragment i gives the
        display position of raw paragraph i, offset by ``(i + 1) % code``
        (mirrors the site's client-side ``UpWz`` routine).

        :param raw_b64: Base64 payload from the ``client`` meta tag.
        :param code: Integer parsed from ``var codeurl=...``.
        :return: ``order`` such that ``order[k]`` is the raw index of the
            paragraph shown at position ``k``.
        """
        decoded = base64.b64decode(raw_b64).decode("utf-8")
        fragments = re.split(r"[A-Z]+%", decoded)

        order = [0] * len(fragments)
        for i, m in enumerate(fragments):
            # UpWz logic: k = ceil(parseInt(m) - ceil((i+1) % codeurl))
            k = int(m) - ((i + 1) % code)
            order[k] = i
        return order

    @staticmethod
    def _decrypt_d(a: str, b: str) -> str:
        """
        Decrypt the site's ``d(ciphertext, key)`` AES payload.

        Mirrors the page's JavaScript `d()` routine: the MD5 hex digest of
        *b* is split into the IV (first 16 hex chars) and the AES key
        (last 16 hex chars); *a* is the base64-encoded ciphertext.

        :param a: Base64-encoded ciphertext.
        :param b: 32-hex-char key material scraped from the `d(...)` call.
        :return: Decrypted HTML as a string.
        """
        digest = hashlib.md5(b.encode("utf-8")).hexdigest()  # 32 hex chars

        iv = digest[:16].encode("utf-8")
        key = digest[16:].encode("utf-8")

        ct = base64.b64decode(a)
        # block_size=32 matches the site's JS implementation of d().
        plaintext = aes_cbc_decrypt(key, iv, ct, block_size=32)

        return plaintext.decode("utf-8")

    def _rebuild_paragraphs(self, content_div: html.HtmlElement) -> list[str]:
        """
        Given a content container element, reconstruct each paragraph by
        interleaving normal text nodes and <img>-based glyphs.

        Uses `_char_from_img` to map image glyphs to characters.

        Empty paragraphs are deliberately kept so that page-2 index-based
        reordering stays aligned with the raw paragraph positions.

        :param content_div: The HTML element containing <p> paragraphs.
        :return: List of reconstructed paragraph strings.
        """
        paragraphs: list[str] = []
        for p in content_div.xpath(".//p"):
            parts: list[str] = []

            # Leading text before any children
            if p.text and p.text.strip():
                parts.append(p.text.strip())

            for child in p:
                # NOTE(review): lxml comment/PI nodes have a non-string
                # .tag; if such nodes appear inside <p>, .lower() raises
                # and the caller's except swallows the page - confirm.
                tag = child.tag.lower()
                if tag == "img":
                    src = (child.get("src") or "").strip()
                    full = src if src.startswith("http") else self.BASE_URL + src
                    parts.append(self._char_from_img(full))
                # Append any tail text after this child
                if child.tail and child.tail.strip():
                    parts.append(child.tail.strip())

            paragraph = "".join(parts).strip()
            paragraphs.append(paragraph)
        return paragraphs
@@ -0,0 +1,161 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.xs63b
|
4
|
+
-----------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
import re
|
9
|
+
from typing import Any
|
10
|
+
|
11
|
+
from lxml import html
|
12
|
+
|
13
|
+
from novel_downloader.core.parsers.base import BaseParser
|
14
|
+
from novel_downloader.core.parsers.registry import register_parser
|
15
|
+
from novel_downloader.models import (
|
16
|
+
BookInfoDict,
|
17
|
+
ChapterDict,
|
18
|
+
ChapterInfoDict,
|
19
|
+
VolumeInfoDict,
|
20
|
+
)
|
21
|
+
|
22
|
+
|
23
|
+
@register_parser(
    site_keys=["xs63b"],
)
class Xs63bParser(BaseParser):
    """
    Parser for 小说路上 (xs63b) book and chapter pages.
    """

    TITLE_SELECTOR = "//div[@class='block_txt2']//h2/text()"
    AUTHOR_SELECTOR = "//p[contains(., '作者')]/a/text()"
    TYPE_SELECTOR = "//p[contains(., '分类')]/a/text()"
    STATUS_SELECTOR = "//p[contains(., '状态')]/text()"
    UPDATE_SELECTOR = "//p[contains(., '更新')]/text()"
    COVER_SELECTOR = "//div[@class='block_img2']//img/@src"
    SUMMARY_SELECTOR = (
        "//div[@class='intro' and contains(., '小说简介')]"
        "/following-sibling::div[@class='intro_info'][1]"
    )
    CATALOG_ANCHORS = (
        "//h2[contains(., '正文')]/following-sibling::div[@class='book_list'][1]//a"
    )

    CHAPTER_TITLE_SELECTOR = "//h1[@id='_52mb_h1']/text()"
    CHAPTER_PARAGRAPHS = "//div[@id='nr1']//p"

    _RE_STRIP_DIV = re.compile(r"^<div[^>]*>|</div>$", re.I)
    _RE_STRIP_JIANJIE = re.compile(r"^\s*简介\s*[::]\s*", re.I)
    _RE_SPACES = re.compile(r"[ \t]+")

    ADS = {"如章节缺失", "本章未完", "下一页继续阅读", "xs63b.com"}

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Extract book metadata from the info page and the chapter list
        from the catalog page.

        :param html_list: ``[info_page_html, catalog_page_html]``.
        :return: Book info dictionary, or ``None`` when a page is missing.
        """
        if len(html_list) < 2:
            return None

        doc = html.fromstring(html_list[0])
        toc = html.fromstring(html_list[1])

        book_name = self._first_str(doc.xpath(self.TITLE_SELECTOR))
        author = self._first_str(doc.xpath(self.AUTHOR_SELECTOR))
        category = self._first_str(doc.xpath(self.TYPE_SELECTOR))

        serial_status = self._norm_space(
            self._first_str(
                doc.xpath(self.STATUS_SELECTOR),
                replaces=[("状态:", "")],
            )
        )

        update_time = self._first_str(
            doc.xpath(self.UPDATE_SELECTOR),
            replaces=[("更新:", "")],
        )
        cover_url = self._first_str(doc.xpath(self.COVER_SELECTOR))

        # Summary: keep only the text before the first <br>, strip the
        # leading "简介:" label, then cut at "{author}的作品集".
        summary = ""
        intro_nodes = doc.xpath(self.SUMMARY_SELECTOR)
        if intro_nodes:
            raw = html.tostring(intro_nodes[0], method="html", encoding="unicode")
            raw = self._RE_STRIP_DIV.sub("", raw).strip()
            head, _, _ = raw.partition("<br")
            plain = html.fromstring(f"<div>{head}</div>").text_content()
            plain = self._RE_STRIP_JIANJIE.sub("", plain).strip()
            if author:
                plain = plain.split(f"{author}的作品集")[0].strip()
            summary = plain

        chapters: list[ChapterInfoDict] = []
        for anchor in toc.xpath(self.CATALOG_ANCHORS):
            url = anchor.get("href") or ""
            chap_title = (anchor.text_content() or "").strip()
            if not (url and chap_title):
                continue
            # 'https://www.xs63b.com/xuanhuan/wanyuzhiwang/29546477.html' -> '29546477'
            cid = url.rsplit("/", 1)[-1].split(".")[0]
            chapters.append({"title": chap_title, "url": url, "chapterId": cid})

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "serial_status": serial_status,
            "summary": summary,
            "tags": [category] if category else [],
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Join the chapter body across all of its pages, skipping pager
        widgets, ad lines, and collapsing whitespace.

        :param html_list: Raw HTML of every page of the chapter, in order.
        :param chapter_id: Identifier of the chapter being parsed.
        :return: Chapter dict, or ``None`` when no usable text remains.
        """
        if not html_list:
            return None

        title = ""
        lines: list[str] = []

        for page_html in html_list:
            page = html.fromstring(page_html)

            if not title:
                heading = self._first_str(page.xpath(self.CHAPTER_TITLE_SELECTOR))
                # Keep everything before the final space (presumably drops
                # a trailing page-counter suffix from the heading).
                if " " in heading:
                    title = heading.rsplit(" ", 1)[0].strip()
                else:
                    title = heading

            for para in page.xpath(self.CHAPTER_PARAGRAPHS):
                klass = para.get("class") or ""
                para_id = para.get("id") or ""
                # Skip pager controls and the content-tip banner.
                if "hid-pages" in klass or "pages" in klass or "contentTip" in para_id:
                    continue

                text = (para.text_content() or "").replace("\xa0", " ")
                text = self._RE_SPACES.sub(" ", text).strip()
                if text and not self._is_ad_line(text):
                    lines.append(text)

        content = "\n".join(lines).strip()
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "xs63b"},
        }