novel-downloader 1.3.3__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/clean.py +97 -78
- novel_downloader/cli/config.py +177 -0
- novel_downloader/cli/download.py +132 -87
- novel_downloader/cli/export.py +77 -0
- novel_downloader/cli/main.py +21 -28
- novel_downloader/config/__init__.py +1 -25
- novel_downloader/config/adapter.py +32 -31
- novel_downloader/config/loader.py +3 -3
- novel_downloader/config/site_rules.py +1 -2
- novel_downloader/core/__init__.py +3 -6
- novel_downloader/core/downloaders/__init__.py +10 -13
- novel_downloader/core/downloaders/base.py +233 -0
- novel_downloader/core/downloaders/biquge.py +27 -0
- novel_downloader/core/downloaders/common.py +414 -0
- novel_downloader/core/downloaders/esjzone.py +27 -0
- novel_downloader/core/downloaders/linovelib.py +27 -0
- novel_downloader/core/downloaders/qianbi.py +27 -0
- novel_downloader/core/downloaders/qidian.py +352 -0
- novel_downloader/core/downloaders/sfacg.py +27 -0
- novel_downloader/core/downloaders/yamibo.py +27 -0
- novel_downloader/core/exporters/__init__.py +37 -0
- novel_downloader/core/{savers → exporters}/base.py +73 -39
- novel_downloader/core/exporters/biquge.py +25 -0
- novel_downloader/core/exporters/common/__init__.py +12 -0
- novel_downloader/core/{savers → exporters}/common/epub.py +22 -22
- novel_downloader/core/{savers/common/main_saver.py → exporters/common/main_exporter.py} +35 -40
- novel_downloader/core/{savers → exporters}/common/txt.py +20 -23
- novel_downloader/core/{savers → exporters}/epub_utils/__init__.py +8 -3
- novel_downloader/core/{savers → exporters}/epub_utils/css_builder.py +2 -2
- novel_downloader/core/{savers → exporters}/epub_utils/image_loader.py +46 -4
- novel_downloader/core/{savers → exporters}/epub_utils/initializer.py +6 -4
- novel_downloader/core/{savers → exporters}/epub_utils/text_to_html.py +3 -3
- novel_downloader/core/{savers → exporters}/epub_utils/volume_intro.py +2 -2
- novel_downloader/core/exporters/esjzone.py +25 -0
- novel_downloader/core/exporters/linovelib/__init__.py +10 -0
- novel_downloader/core/exporters/linovelib/epub.py +449 -0
- novel_downloader/core/exporters/linovelib/main_exporter.py +127 -0
- novel_downloader/core/exporters/linovelib/txt.py +129 -0
- novel_downloader/core/exporters/qianbi.py +25 -0
- novel_downloader/core/{savers → exporters}/qidian.py +8 -8
- novel_downloader/core/exporters/sfacg.py +25 -0
- novel_downloader/core/exporters/yamibo.py +25 -0
- novel_downloader/core/factory/__init__.py +5 -17
- novel_downloader/core/factory/downloader.py +24 -126
- novel_downloader/core/factory/exporter.py +58 -0
- novel_downloader/core/factory/fetcher.py +96 -0
- novel_downloader/core/factory/parser.py +17 -12
- novel_downloader/core/{requesters → fetchers}/__init__.py +22 -15
- novel_downloader/core/{requesters → fetchers}/base/__init__.py +2 -4
- novel_downloader/core/fetchers/base/browser.py +383 -0
- novel_downloader/core/fetchers/base/rate_limiter.py +86 -0
- novel_downloader/core/fetchers/base/session.py +419 -0
- novel_downloader/core/fetchers/biquge/__init__.py +14 -0
- novel_downloader/core/{requesters/biquge/async_session.py → fetchers/biquge/browser.py} +18 -6
- novel_downloader/core/{requesters → fetchers}/biquge/session.py +23 -30
- novel_downloader/core/fetchers/common/__init__.py +14 -0
- novel_downloader/core/fetchers/common/browser.py +79 -0
- novel_downloader/core/{requesters/common/async_session.py → fetchers/common/session.py} +8 -25
- novel_downloader/core/fetchers/esjzone/__init__.py +14 -0
- novel_downloader/core/fetchers/esjzone/browser.py +202 -0
- novel_downloader/core/{requesters/esjzone/async_session.py → fetchers/esjzone/session.py} +62 -42
- novel_downloader/core/fetchers/linovelib/__init__.py +14 -0
- novel_downloader/core/fetchers/linovelib/browser.py +193 -0
- novel_downloader/core/fetchers/linovelib/session.py +193 -0
- novel_downloader/core/fetchers/qianbi/__init__.py +14 -0
- novel_downloader/core/{requesters/qianbi/session.py → fetchers/qianbi/browser.py} +30 -48
- novel_downloader/core/{requesters/qianbi/async_session.py → fetchers/qianbi/session.py} +18 -6
- novel_downloader/core/fetchers/qidian/__init__.py +14 -0
- novel_downloader/core/fetchers/qidian/browser.py +266 -0
- novel_downloader/core/fetchers/qidian/session.py +326 -0
- novel_downloader/core/fetchers/sfacg/__init__.py +14 -0
- novel_downloader/core/fetchers/sfacg/browser.py +189 -0
- novel_downloader/core/{requesters/sfacg/async_session.py → fetchers/sfacg/session.py} +43 -73
- novel_downloader/core/fetchers/yamibo/__init__.py +14 -0
- novel_downloader/core/fetchers/yamibo/browser.py +229 -0
- novel_downloader/core/{requesters/yamibo/async_session.py → fetchers/yamibo/session.py} +62 -44
- novel_downloader/core/interfaces/__init__.py +8 -12
- novel_downloader/core/interfaces/downloader.py +54 -0
- novel_downloader/core/interfaces/{saver.py → exporter.py} +12 -12
- novel_downloader/core/interfaces/fetcher.py +162 -0
- novel_downloader/core/interfaces/parser.py +6 -7
- novel_downloader/core/parsers/__init__.py +5 -6
- novel_downloader/core/parsers/base.py +9 -13
- novel_downloader/core/parsers/biquge/main_parser.py +12 -13
- novel_downloader/core/parsers/common/helper.py +3 -3
- novel_downloader/core/parsers/common/main_parser.py +39 -34
- novel_downloader/core/parsers/esjzone/main_parser.py +20 -14
- novel_downloader/core/parsers/linovelib/__init__.py +10 -0
- novel_downloader/core/parsers/linovelib/main_parser.py +210 -0
- novel_downloader/core/parsers/qianbi/main_parser.py +21 -15
- novel_downloader/core/parsers/qidian/__init__.py +2 -11
- novel_downloader/core/parsers/qidian/book_info_parser.py +113 -0
- novel_downloader/core/parsers/qidian/{browser/chapter_encrypted.py → chapter_encrypted.py} +162 -135
- novel_downloader/core/parsers/qidian/chapter_normal.py +150 -0
- novel_downloader/core/parsers/qidian/{session/chapter_router.py → chapter_router.py} +15 -15
- novel_downloader/core/parsers/qidian/{browser/main_parser.py → main_parser.py} +49 -40
- novel_downloader/core/parsers/qidian/utils/__init__.py +27 -0
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +145 -0
- novel_downloader/core/parsers/qidian/{shared → utils}/helpers.py +41 -68
- novel_downloader/core/parsers/qidian/{session → utils}/node_decryptor.py +64 -50
- novel_downloader/core/parsers/sfacg/main_parser.py +12 -12
- novel_downloader/core/parsers/yamibo/main_parser.py +10 -10
- novel_downloader/locales/en.json +18 -2
- novel_downloader/locales/zh.json +18 -2
- novel_downloader/models/__init__.py +64 -0
- novel_downloader/models/browser.py +21 -0
- novel_downloader/models/chapter.py +25 -0
- novel_downloader/models/config.py +100 -0
- novel_downloader/models/login.py +20 -0
- novel_downloader/models/site_rules.py +99 -0
- novel_downloader/models/tasks.py +33 -0
- novel_downloader/models/types.py +15 -0
- novel_downloader/resources/config/settings.toml +31 -25
- novel_downloader/resources/json/linovelib_font_map.json +3573 -0
- novel_downloader/tui/__init__.py +7 -0
- novel_downloader/tui/app.py +32 -0
- novel_downloader/tui/main.py +17 -0
- novel_downloader/tui/screens/__init__.py +14 -0
- novel_downloader/tui/screens/home.py +191 -0
- novel_downloader/tui/screens/login.py +74 -0
- novel_downloader/tui/styles/home_layout.tcss +79 -0
- novel_downloader/tui/widgets/richlog_handler.py +24 -0
- novel_downloader/utils/__init__.py +6 -0
- novel_downloader/utils/chapter_storage.py +25 -38
- novel_downloader/utils/constants.py +11 -5
- novel_downloader/utils/cookies.py +66 -0
- novel_downloader/utils/crypto_utils.py +1 -74
- novel_downloader/utils/fontocr/ocr_v1.py +2 -1
- novel_downloader/utils/fontocr/ocr_v2.py +2 -2
- novel_downloader/utils/hash_store.py +10 -18
- novel_downloader/utils/hash_utils.py +3 -2
- novel_downloader/utils/logger.py +2 -3
- novel_downloader/utils/network.py +2 -1
- novel_downloader/utils/text_utils/chapter_formatting.py +6 -1
- novel_downloader/utils/text_utils/font_mapping.py +1 -1
- novel_downloader/utils/text_utils/text_cleaning.py +1 -1
- novel_downloader/utils/time_utils/datetime_utils.py +3 -3
- novel_downloader/utils/time_utils/sleep_utils.py +1 -1
- {novel_downloader-1.3.3.dist-info → novel_downloader-1.4.1.dist-info}/METADATA +69 -35
- novel_downloader-1.4.1.dist-info/RECORD +170 -0
- {novel_downloader-1.3.3.dist-info → novel_downloader-1.4.1.dist-info}/WHEEL +1 -1
- {novel_downloader-1.3.3.dist-info → novel_downloader-1.4.1.dist-info}/entry_points.txt +1 -0
- novel_downloader/cli/interactive.py +0 -66
- novel_downloader/cli/settings.py +0 -177
- novel_downloader/config/models.py +0 -187
- novel_downloader/core/downloaders/base/__init__.py +0 -14
- novel_downloader/core/downloaders/base/base_async.py +0 -153
- novel_downloader/core/downloaders/base/base_sync.py +0 -208
- novel_downloader/core/downloaders/biquge/__init__.py +0 -14
- novel_downloader/core/downloaders/biquge/biquge_async.py +0 -27
- novel_downloader/core/downloaders/biquge/biquge_sync.py +0 -27
- novel_downloader/core/downloaders/common/__init__.py +0 -14
- novel_downloader/core/downloaders/common/common_async.py +0 -210
- novel_downloader/core/downloaders/common/common_sync.py +0 -202
- novel_downloader/core/downloaders/esjzone/__init__.py +0 -14
- novel_downloader/core/downloaders/esjzone/esjzone_async.py +0 -27
- novel_downloader/core/downloaders/esjzone/esjzone_sync.py +0 -27
- novel_downloader/core/downloaders/qianbi/__init__.py +0 -14
- novel_downloader/core/downloaders/qianbi/qianbi_async.py +0 -27
- novel_downloader/core/downloaders/qianbi/qianbi_sync.py +0 -27
- novel_downloader/core/downloaders/qidian/__init__.py +0 -10
- novel_downloader/core/downloaders/qidian/qidian_sync.py +0 -219
- novel_downloader/core/downloaders/sfacg/__init__.py +0 -14
- novel_downloader/core/downloaders/sfacg/sfacg_async.py +0 -27
- novel_downloader/core/downloaders/sfacg/sfacg_sync.py +0 -27
- novel_downloader/core/downloaders/yamibo/__init__.py +0 -14
- novel_downloader/core/downloaders/yamibo/yamibo_async.py +0 -27
- novel_downloader/core/downloaders/yamibo/yamibo_sync.py +0 -27
- novel_downloader/core/factory/requester.py +0 -144
- novel_downloader/core/factory/saver.py +0 -56
- novel_downloader/core/interfaces/async_downloader.py +0 -36
- novel_downloader/core/interfaces/async_requester.py +0 -84
- novel_downloader/core/interfaces/sync_downloader.py +0 -36
- novel_downloader/core/interfaces/sync_requester.py +0 -82
- novel_downloader/core/parsers/qidian/browser/__init__.py +0 -12
- novel_downloader/core/parsers/qidian/browser/chapter_normal.py +0 -93
- novel_downloader/core/parsers/qidian/browser/chapter_router.py +0 -71
- novel_downloader/core/parsers/qidian/session/__init__.py +0 -12
- novel_downloader/core/parsers/qidian/session/chapter_encrypted.py +0 -443
- novel_downloader/core/parsers/qidian/session/chapter_normal.py +0 -115
- novel_downloader/core/parsers/qidian/session/main_parser.py +0 -128
- novel_downloader/core/parsers/qidian/shared/__init__.py +0 -37
- novel_downloader/core/parsers/qidian/shared/book_info_parser.py +0 -150
- novel_downloader/core/requesters/base/async_session.py +0 -410
- novel_downloader/core/requesters/base/browser.py +0 -337
- novel_downloader/core/requesters/base/session.py +0 -378
- novel_downloader/core/requesters/biquge/__init__.py +0 -14
- novel_downloader/core/requesters/common/__init__.py +0 -17
- novel_downloader/core/requesters/common/session.py +0 -113
- novel_downloader/core/requesters/esjzone/__init__.py +0 -13
- novel_downloader/core/requesters/esjzone/session.py +0 -235
- novel_downloader/core/requesters/qianbi/__init__.py +0 -13
- novel_downloader/core/requesters/qidian/__init__.py +0 -21
- novel_downloader/core/requesters/qidian/broswer.py +0 -307
- novel_downloader/core/requesters/qidian/session.py +0 -290
- novel_downloader/core/requesters/sfacg/__init__.py +0 -13
- novel_downloader/core/requesters/sfacg/session.py +0 -242
- novel_downloader/core/requesters/yamibo/__init__.py +0 -13
- novel_downloader/core/requesters/yamibo/session.py +0 -237
- novel_downloader/core/savers/__init__.py +0 -34
- novel_downloader/core/savers/biquge.py +0 -25
- novel_downloader/core/savers/common/__init__.py +0 -12
- novel_downloader/core/savers/esjzone.py +0 -25
- novel_downloader/core/savers/qianbi.py +0 -25
- novel_downloader/core/savers/sfacg.py +0 -25
- novel_downloader/core/savers/yamibo.py +0 -25
- novel_downloader/resources/config/rules.toml +0 -196
- novel_downloader-1.3.3.dist-info/RECORD +0 -166
- {novel_downloader-1.3.3.dist-info → novel_downloader-1.4.1.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.3.3.dist-info → novel_downloader-1.4.1.dist-info}/top_level.txt +0 -0
@@ -5,13 +5,13 @@ novel_downloader.core.parsers.esjzone.main_parser
|
|
5
5
|
|
6
6
|
"""
|
7
7
|
|
8
|
+
import re
|
8
9
|
from typing import Any
|
9
10
|
|
10
|
-
from lxml import
|
11
|
-
from lxml.etree import _Element
|
11
|
+
from lxml import html
|
12
12
|
|
13
13
|
from novel_downloader.core.parsers.base import BaseParser
|
14
|
-
from novel_downloader.
|
14
|
+
from novel_downloader.models import ChapterDict
|
15
15
|
|
16
16
|
|
17
17
|
class EsjzoneParser(BaseParser):
|
@@ -40,7 +40,7 @@ class EsjzoneParser(BaseParser):
|
|
40
40
|
|
41
41
|
def parse_book_info(
|
42
42
|
self,
|
43
|
-
|
43
|
+
html_list: list[str],
|
44
44
|
**kwargs: Any,
|
45
45
|
) -> dict[str, Any]:
|
46
46
|
"""
|
@@ -49,12 +49,12 @@ class EsjzoneParser(BaseParser):
|
|
49
49
|
注: 由于网站使用了多种不同的分卷格式, 已经尝试兼容常见情况,
|
50
50
|
但仍可能存在未覆盖的 cases
|
51
51
|
|
52
|
-
:param
|
52
|
+
:param html_list: Raw HTML of the book info page.
|
53
53
|
:return: Parsed metadata and chapter structure as a dictionary.
|
54
54
|
"""
|
55
|
-
if not
|
55
|
+
if not html_list or self._is_forum_page(html_list):
|
56
56
|
return {}
|
57
|
-
tree =
|
57
|
+
tree = html.fromstring(html_list[0])
|
58
58
|
result: dict[str, Any] = {}
|
59
59
|
|
60
60
|
result["book_name"] = self._get_text(tree, self._BOOK_NAME_XPATH)
|
@@ -75,8 +75,14 @@ class EsjzoneParser(BaseParser):
|
|
75
75
|
volumes: list[dict[str, Any]] = []
|
76
76
|
current_vol: dict[str, Any] = {}
|
77
77
|
|
78
|
+
def _is_garbage_title(name: str) -> bool:
|
79
|
+
stripped = name.strip()
|
80
|
+
return not stripped or bool(re.fullmatch(r"[\W_]+", stripped))
|
81
|
+
|
78
82
|
def _start_volume(name: str) -> None:
|
79
83
|
nonlocal current_vol
|
84
|
+
if _is_garbage_title(name):
|
85
|
+
return
|
80
86
|
name = name.strip() or "未命名卷"
|
81
87
|
if name == "未命名卷" and current_vol is not None:
|
82
88
|
return
|
@@ -94,7 +100,7 @@ class EsjzoneParser(BaseParser):
|
|
94
100
|
tag = node.tag.lower()
|
95
101
|
|
96
102
|
if tag == "details":
|
97
|
-
# ---- DETAILS
|
103
|
+
# ---- DETAILS-based layout ----
|
98
104
|
summary = node.find("summary")
|
99
105
|
vol_name = summary.text if summary is not None else "未命名卷"
|
100
106
|
_start_volume(vol_name)
|
@@ -134,20 +140,20 @@ class EsjzoneParser(BaseParser):
|
|
134
140
|
|
135
141
|
def parse_chapter(
|
136
142
|
self,
|
137
|
-
|
143
|
+
html_list: list[str],
|
138
144
|
chapter_id: str,
|
139
145
|
**kwargs: Any,
|
140
146
|
) -> ChapterDict | None:
|
141
147
|
"""
|
142
148
|
Parse a single chapter page and extract clean text or simplified HTML.
|
143
149
|
|
144
|
-
:param
|
150
|
+
:param html_list: Raw HTML of the chapter page.
|
145
151
|
:param chapter_id: Identifier of the chapter being parsed.
|
146
152
|
:return: Cleaned chapter content as plain text or minimal HTML.
|
147
153
|
"""
|
148
|
-
if not
|
154
|
+
if not html_list or self._is_forum_page(html_list):
|
149
155
|
return None
|
150
|
-
tree =
|
156
|
+
tree = html.fromstring(html_list[0], parser=None)
|
151
157
|
|
152
158
|
content_lines: list[str] = []
|
153
159
|
content_nodes = tree.xpath(self._CHAPTER_CONTENT_NODES_XPATH)
|
@@ -198,7 +204,7 @@ class EsjzoneParser(BaseParser):
|
|
198
204
|
if not html_str:
|
199
205
|
return False
|
200
206
|
|
201
|
-
tree =
|
207
|
+
tree = html.fromstring(html_str[0])
|
202
208
|
page_title = tree.xpath('string(//div[@class="page-title"]//h1)').strip()
|
203
209
|
if page_title != "論壇":
|
204
210
|
return False
|
@@ -208,7 +214,7 @@ class EsjzoneParser(BaseParser):
|
|
208
214
|
|
209
215
|
@staticmethod
|
210
216
|
def _get_text(
|
211
|
-
tree:
|
217
|
+
tree: html.HtmlElement,
|
212
218
|
xpath: str,
|
213
219
|
join: bool = False,
|
214
220
|
clean_comma: bool = False,
|
@@ -0,0 +1,210 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.linovelib.main_parser
|
4
|
+
---------------------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
import json
|
9
|
+
from itertools import islice
|
10
|
+
from pathlib import PurePosixPath
|
11
|
+
from typing import Any
|
12
|
+
|
13
|
+
from lxml import html
|
14
|
+
|
15
|
+
from novel_downloader.core.parsers.base import BaseParser
|
16
|
+
from novel_downloader.models import ChapterDict
|
17
|
+
from novel_downloader.utils.constants import LINOVELIB_FONT_MAP_PATH
|
18
|
+
|
19
|
+
|
20
|
+
class LinovelibParser(BaseParser):
    """
    Parser for linovelib book info, volume and chapter pages.

    ``parse_book_info`` reads the book metadata from the first page it is
    given and one volume (with its chapter list) from every following page.
    ``parse_chapter`` concatenates the paragraphs and images of every page
    of a chapter and passes one paragraph per page through ``_FONT_MAP``
    when the page looks font-obfuscated (see ``_is_encrypted``).
    """

    # Book info XPaths
    _BOOK_NAME_XPATH = '//div[@class="book-info"]/h1[@class="book-name"]/text()'
    _AUTHOR_XPATH = '//div[@class="au-name"]/a[1]/text()'
    _COVER_URL_XPATH = '//div[contains(@class, "book-img")]//img/@src'
    _UPDATE_TIME_XPATH = (
        '//div[@class="nums"]/span[contains(text(), "最后更新")]/text()'  # noqa: E501
    )
    _SERIAL_STATUS_XPATH = '//div[@class="book-label"]/a[@class="state"]/text()'
    _WORD_COUNT_XPATH = '//div[@class="nums"]/span[contains(text(), "字数")]/text()'
    _SUMMARY_XPATH = '//div[contains(@class, "book-dec")]/p//text()'

    _CHAPTERS_XPATH = '//div[@class="book-new-chapter"]/div[contains(@class, "tit")]/a'

    # Chapter XPaths
    _CHAPTER_TITLE_XPATH = "//div[@id='mlfy_main_text']/h1/text()"
    _CHAPTER_CONTENT_NODES_XPATH = "//div[@id='TextContent']/*[self::p or self::img]"

    # Loaded once, when the class body is executed.
    # NOTE: do not modify the first 3500 entries of the JSON file unless
    # necessary -- ``_BLANK_SET`` below is built from exactly those entries.
    _FONT_MAP: dict[str, str] = json.loads(
        LINOVELIB_FONT_MAP_PATH.read_text(encoding="utf-8")
    )
    # Values of the first 3500 map entries; characters found in this set are
    # dropped by ``_apply_font_map`` instead of being translated.
    _BLANK_SET: set[str] = set(islice(_FONT_MAP.values(), 3500))

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> dict[str, Any]:
        """
        Parse a book info page and extract metadata and chapter structure.

        :param html_list: Raw HTML pages. The first page is parsed as the
                          book info page; every following page is parsed as
                          one volume page.
        :return: Parsed metadata and chapter structure as a dictionary,
                 or an empty dict when ``html_list`` is empty.
        """
        if not html_list:
            return {}
        info_tree = html.fromstring(html_list[0])
        result: dict[str, Any] = {}

        result["book_name"] = self._safe_xpath(info_tree, self._BOOK_NAME_XPATH)
        result["author"] = self._safe_xpath(info_tree, self._AUTHOR_XPATH)
        result["cover_url"] = self._safe_xpath(info_tree, self._COVER_URL_XPATH)
        result["update_time"] = self._safe_xpath(
            info_tree, self._UPDATE_TIME_XPATH, replace=("最后更新:", "")
        )
        result["serial_status"] = self._safe_xpath(info_tree, self._SERIAL_STATUS_XPATH)
        result["word_count"] = self._safe_xpath(
            info_tree, self._WORD_COUNT_XPATH, replace=("字数:", "")
        )

        result["summary"] = self._extract_intro(info_tree, self._SUMMARY_XPATH)

        volumes: list[dict[str, Any]] = []
        for vol_page in html_list[1:]:
            vol_tree = html.fromstring(vol_page)
            # Volume pages are queried with the same XPaths as the book page.
            volume_cover = self._safe_xpath(vol_tree, self._COVER_URL_XPATH)
            volume_name = self._safe_xpath(vol_tree, self._BOOK_NAME_XPATH)
            update_time = self._safe_xpath(
                vol_tree, self._UPDATE_TIME_XPATH, replace=("最后更新:", "")
            )
            word_count = self._safe_xpath(
                vol_tree, self._WORD_COUNT_XPATH, replace=("字数:", "")
            )
            volume_intro = self._extract_intro(vol_tree, self._SUMMARY_XPATH)

            chapters = []
            for a in vol_tree.xpath(self._CHAPTERS_XPATH):
                # ``a.text`` is None when the anchor has no leading text
                # (e.g. it starts with a child element); fall back to "".
                title = (a.text or "").strip()
                url = a.attrib.get("href", "").strip()
                # The chapter id is the last path component without suffix.
                chap_path = PurePosixPath(url.rstrip("/"))
                chapters.append(
                    {"title": title, "url": url, "chapterId": chap_path.stem}
                )

            volumes.append(
                {
                    "volume_name": volume_name,
                    "volume_cover": volume_cover,
                    "update_time": update_time,
                    "word_count": word_count,
                    "volume_intro": volume_intro,
                    "chapters": chapters,
                }
            )
        result["volumes"] = volumes

        return result

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse chapter pages and extract clean text or simplified HTML.

        :param html_list: Raw HTML of the chapter page(s); the content of all
                          pages is concatenated in order.
        :param chapter_id: Identifier of the chapter being parsed.
        :return: Chapter dict with ``id``, ``title``, ``content`` and
                 ``extra``, or ``None`` when ``html_list`` is empty.
        """
        if not html_list:
            return None
        title_text: str = ""
        contents: list[str] = []
        for curr_html in html_list:
            is_encrypted = self._is_encrypted(curr_html)
            tree = html.fromstring(curr_html)

            # The title is taken from the first page that provides one.
            if not title_text:
                titles = tree.xpath(self._CHAPTER_TITLE_XPATH)
                if titles:
                    title_text = titles[0].strip()

            content_container = tree.xpath("//div[@id='TextContent']")
            if not content_container:
                continue
            container = content_container[0]
            nodes = container.xpath("./p | ./img")
            total_p = len(container.xpath("./p"))
            p_counter = 0

            for node in nodes:
                tag = node.tag.lower()
                if tag == "p":
                    raw_text = "".join(node.xpath(".//text()")).strip()
                    if not raw_text:
                        continue

                    # Only the paragraph at index ``total_p - 2`` is decoded.
                    # NOTE(review): p_counter counts non-empty paragraphs
                    # while total_p counts all <p> children, so empty
                    # paragraphs shift which one is decoded -- confirm this
                    # is intended.
                    if is_encrypted and p_counter == total_p - 2:
                        raw_text = self._apply_font_map(raw_text)

                    contents.append(raw_text)
                    p_counter += 1

                elif tag == "img":
                    # Prefer the lazy-load attribute over ``src``.
                    src = node.get("data-src") or node.get("src", "")
                    src = src.strip()
                    if src:
                        contents.append(f'<img src="{src}" />')
        return {
            "id": chapter_id,
            "title": title_text,
            "content": "\n\n".join(contents),
            "extra": {"site": "linovelib"},
        }

    def _safe_xpath(
        self,
        tree: html.HtmlElement,
        path: str,
        replace: tuple[str, str] | None = None,
    ) -> str:
        """
        Return the stripped first result of ``path``, or "" when it is empty.

        :param tree: Element to evaluate the XPath on.
        :param path: XPath expression expected to yield strings.
        :param replace: Optional ``(old, new)`` pair applied to the value
                        with ``str.replace`` after stripping.
        """
        result = tree.xpath(path)
        if not result:
            return ""
        value: str = result[0].strip()
        if replace:
            old, new = replace
            value = value.replace(old, new)
        return value

    @staticmethod
    def _extract_intro(tree: html.HtmlElement, xpath: str) -> str:
        """
        Join the text of every element matched by ``xpath`` into one string.

        A trailing ``//text()`` step is removed from ``xpath`` so that the
        elements themselves are selected. Text segments inside one element
        are joined with a newline, elements with a blank line.
        """
        paragraphs = tree.xpath(xpath.replace("//text()", ""))
        lines = []
        for p in paragraphs:
            text_segments = p.xpath(".//text()")
            cleaned = [seg.strip() for seg in text_segments if seg.strip()]
            lines.append("\n".join(cleaned))
        return "\n\n".join(lines)

    @staticmethod
    def _is_encrypted(html: str) -> bool:
        """
        Determine whether HTML content likely uses encrypted or obfuscated fonts.

        The check is a plain substring test for ``"CSSStyleSheet"``.
        """
        return "CSSStyleSheet" in html

    @classmethod
    def _apply_font_map(cls, text: str) -> str:
        """
        Apply font mapping to the input text,
        skipping characters in blank set.

        Characters without a mapping are kept unchanged.
        """
        return "".join(cls._FONT_MAP.get(c, c) for c in text if c not in cls._BLANK_SET)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
2
|
"""
|
3
|
-
novel_downloader.core.parsers.
|
3
|
+
novel_downloader.core.parsers.qianbi.main_parser
|
4
4
|
------------------------------------------------
|
5
5
|
|
6
6
|
"""
|
@@ -8,10 +8,10 @@ novel_downloader.core.parsers.biquge.main_parser
|
|
8
8
|
from datetime import datetime
|
9
9
|
from typing import Any
|
10
10
|
|
11
|
-
from lxml import
|
11
|
+
from lxml import html
|
12
12
|
|
13
13
|
from novel_downloader.core.parsers.base import BaseParser
|
14
|
-
from novel_downloader.
|
14
|
+
from novel_downloader.models import ChapterDict
|
15
15
|
|
16
16
|
|
17
17
|
class QianbiParser(BaseParser):
|
@@ -19,20 +19,20 @@ class QianbiParser(BaseParser):
|
|
19
19
|
|
20
20
|
def parse_book_info(
|
21
21
|
self,
|
22
|
-
|
22
|
+
html_list: list[str],
|
23
23
|
**kwargs: Any,
|
24
24
|
) -> dict[str, Any]:
|
25
25
|
"""
|
26
26
|
Parse a book info page and extract metadata and chapter structure.
|
27
27
|
|
28
|
-
:param
|
28
|
+
:param html_list: Raw HTML of the book info pages.
|
29
29
|
:return: Parsed metadata and chapter structure as a dictionary.
|
30
30
|
"""
|
31
|
-
if len(
|
31
|
+
if len(html_list) < 2:
|
32
32
|
return {}
|
33
33
|
|
34
|
-
info_tree =
|
35
|
-
catalog_tree =
|
34
|
+
info_tree = html.fromstring(html_list[0])
|
35
|
+
catalog_tree = html.fromstring(html_list[1])
|
36
36
|
result: dict[str, Any] = {}
|
37
37
|
|
38
38
|
title = info_tree.xpath('//h1[@class="page-title"]/text()')
|
@@ -56,9 +56,7 @@ class QianbiParser(BaseParser):
|
|
56
56
|
'//div[@class="novel-info-item novel-info-content"]/span'
|
57
57
|
)
|
58
58
|
if summary_node and summary_node[0] is not None:
|
59
|
-
result["summary"] =
|
60
|
-
summary_node[0], encoding="unicode", method="text"
|
61
|
-
).strip()
|
59
|
+
result["summary"] = summary_node[0].text_content().strip()
|
62
60
|
else:
|
63
61
|
result["summary"] = ""
|
64
62
|
|
@@ -85,6 +83,8 @@ class QianbiParser(BaseParser):
|
|
85
83
|
if a_tag:
|
86
84
|
title = a_tag[0].xpath(".//span/text()")
|
87
85
|
href = a_tag[0].attrib.get("href", "")
|
86
|
+
if href == "javascript:cid(0)":
|
87
|
+
href = ""
|
88
88
|
chapter_id = (
|
89
89
|
href.split("/")[-1].replace(".html", "") if href else ""
|
90
90
|
)
|
@@ -105,20 +105,20 @@ class QianbiParser(BaseParser):
|
|
105
105
|
|
106
106
|
def parse_chapter(
|
107
107
|
self,
|
108
|
-
|
108
|
+
html_list: list[str],
|
109
109
|
chapter_id: str,
|
110
110
|
**kwargs: Any,
|
111
111
|
) -> ChapterDict | None:
|
112
112
|
"""
|
113
113
|
Parse a single chapter page and extract clean text or simplified HTML.
|
114
114
|
|
115
|
-
:param
|
115
|
+
:param html_list: Raw HTML of the chapter page.
|
116
116
|
:param chapter_id: Identifier of the chapter being parsed.
|
117
117
|
:return: Cleaned chapter content as plain text or minimal HTML.
|
118
118
|
"""
|
119
|
-
if not
|
119
|
+
if not html_list:
|
120
120
|
return None
|
121
|
-
tree =
|
121
|
+
tree = html.fromstring(html_list[0])
|
122
122
|
|
123
123
|
paras = tree.xpath('//div[@class="article-content"]/p/text()')
|
124
124
|
content_text = "\n\n".join(p.strip() for p in paras if p.strip())
|
@@ -131,6 +131,11 @@ class QianbiParser(BaseParser):
|
|
131
131
|
volume = tree.xpath('//h3[@class="text-muted"]/text()')
|
132
132
|
volume_text = volume[0].strip() if volume else ""
|
133
133
|
|
134
|
+
next_href = tree.xpath('//div[@class="footer"]/a[@class="f-right"]/@href')
|
135
|
+
next_chapter_id = (
|
136
|
+
next_href[0].split("/")[-1].replace(".html", "") if next_href else ""
|
137
|
+
)
|
138
|
+
|
134
139
|
return {
|
135
140
|
"id": chapter_id,
|
136
141
|
"title": title_text,
|
@@ -138,5 +143,6 @@ class QianbiParser(BaseParser):
|
|
138
143
|
"extra": {
|
139
144
|
"site": "qianbi",
|
140
145
|
"volume": volume_text,
|
146
|
+
"next_chapter_id": next_chapter_id,
|
141
147
|
},
|
142
148
|
}
|
@@ -3,17 +3,8 @@
|
|
3
3
|
novel_downloader.core.parsers.qidian
|
4
4
|
------------------------------------
|
5
5
|
|
6
|
-
This package provides parsing implementations for the Qidian platform.
|
7
|
-
|
8
|
-
Modules:
|
9
|
-
- browser: Contains `QidianBrowserParser` for browser-rendered page parsing.
|
10
|
-
- session: Contains `QidianSessionParser` for session page parsing.
|
11
6
|
"""
|
12
7
|
|
13
|
-
from .
|
14
|
-
from .session import QidianSessionParser
|
8
|
+
from .main_parser import QidianParser
|
15
9
|
|
16
|
-
__all__ = [
|
17
|
-
"QidianBrowserParser",
|
18
|
-
"QidianSessionParser",
|
19
|
-
]
|
10
|
+
__all__ = ["QidianParser"]
|
@@ -0,0 +1,113 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.qidian.book_info_parser
|
4
|
+
-----------------------------------------------------
|
5
|
+
|
6
|
+
This module provides parsing of Qidian book info pages.
|
7
|
+
|
8
|
+
It extracts metadata such as title, author, cover URL, update
|
9
|
+
time, status, word count, summary, and volume-chapter structure.
|
10
|
+
"""
|
11
|
+
|
12
|
+
import logging
|
13
|
+
from typing import Any
|
14
|
+
|
15
|
+
from lxml import html
|
16
|
+
|
17
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# XPath ``string()`` expression: text of the first element whose class
# contains "writer" inside an element whose class contains "book-info".
_AUTHOR_XPATH = (
    'string(//div[contains(@class, "book-info")]//a[contains(@class, "writer")])'
)
|
22
|
+
|
23
|
+
|
24
|
+
def _chapter_url_to_id(url: str) -> str:
|
25
|
+
return url.rstrip("/").split("/")[-1]
|
26
|
+
|
27
|
+
|
28
|
+
def _get_volume_name(
    vol_elem: html.HtmlElement,
) -> str:
    """
    Extracts the volume title from a <div class="volume"> element using lxml.
    Ignores <a> tags, and extracts text from other elements.

    :param vol_elem: Element to search for ``<h3>`` descendants.
    :return: The ``<h3>`` text before the first middle dot, stripped,
             or "" when the element has no ``<h3>``.
    """
    if not vol_elem.xpath(".//h3"):
        return ""
    # All text nodes under <h3> that are not inside an <a>.
    texts = vol_elem.xpath(".//h3//text()[not(ancestor::a)]")
    full_text = "".join(texts).strip()
    # chr(183) is the middle dot; only the part before the first one is kept.
    return full_text.split(chr(183))[0].strip()
|
41
|
+
|
42
|
+
|
43
|
+
def parse_book_info(html_str: str) -> dict[str, Any]:
    """
    Extract metadata: title, author, cover_url, update_time, status,
    word_count, summary, and volumes with chapters.

    Parsing is best-effort: any exception is logged as a warning and the
    fields collected up to that point are returned.

    :param html_str: Raw HTML of the book info page.
    :return: A dict containing book metadata.
    """
    info: dict[str, Any] = {}
    try:
        doc = html.fromstring(html_str)

        info["book_name"] = doc.xpath('string(//h1/em[@id="bookName"])').strip()

        info["author"] = doc.xpath(_AUTHOR_XPATH).strip()

        info["cover_url"] = doc.xpath(
            'string(//div[@class="book-img"]//img/@src)'
        ).strip()

        info["update_time"] = (
            doc.xpath('string(//span[contains(@class, "update-time")])')
            .replace("更新时间", "")
            .strip()
        )

        info["serial_status"] = doc.xpath(
            'string(//p[@class="tag"]/span[@class="blue"][1])'
        ).strip()

        tags = doc.xpath('//p[@class="tag"]/a[@class="red"]/text()')
        info["tags"] = [t.strip() for t in tags if t.strip()]

        # Word count is rendered as a number (<em>) followed by a unit (<cite>).
        wc_number = doc.xpath("string(//p[em and cite][1]/em[1])").strip()
        wc_unit = doc.xpath("string(//p[em and cite][1]/cite[1])").strip()
        info["word_count"] = (
            (wc_number + wc_unit) if wc_number and wc_unit else "Unknown"
        )

        info["summary_brief"] = doc.xpath('string(//p[@class="intro"])').strip()

        # A page without a detailed intro must not abort parsing: indexing
        # an empty result here would raise and skip the volume list below.
        intro_nodes = doc.xpath('//div[@class="book-intro"]/p')
        info["summary"] = (
            "\n".join(intro_nodes[0].itertext()).strip() if intro_nodes else ""
        )

        volumes = []
        for vol_div in doc.xpath('//div[@class="volume-wrap"]/div[@class="volume"]'):
            volume_name = _get_volume_name(vol_div)
            chapters = []
            for li in vol_div.xpath(".//li"):
                anchors = li.xpath(".//a")
                # Skip list items without a link target.
                if not anchors or "href" not in anchors[0].attrib:
                    continue
                a = anchors[0]
                href = a.attrib["href"].strip()
                title = "".join(a.itertext()).strip()
                chapters.append(
                    {
                        "title": title,
                        "url": href,
                        "chapterId": _chapter_url_to_id(href),
                    }
                )
            volumes.append({"volume_name": volume_name, "chapters": chapters})
        info["volumes"] = volumes

    except Exception as e:
        # Deliberate best-effort boundary: report and return what was parsed.
        logger.warning("[Parser] Error parsing book info: %s", e)

    return info
|