novel-downloader 1.3.0__py3-none-any.whl → 1.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/download.py +1 -1
- novel_downloader/config/adapter.py +3 -0
- novel_downloader/config/models.py +3 -0
- novel_downloader/core/downloaders/__init__.py +23 -1
- novel_downloader/core/downloaders/biquge/__init__.py +2 -0
- novel_downloader/core/downloaders/biquge/biquge_async.py +27 -0
- novel_downloader/core/downloaders/biquge/biquge_sync.py +5 -3
- novel_downloader/core/downloaders/common/common_async.py +5 -3
- novel_downloader/core/downloaders/common/common_sync.py +18 -10
- novel_downloader/core/downloaders/esjzone/__init__.py +14 -0
- novel_downloader/core/downloaders/esjzone/esjzone_async.py +27 -0
- novel_downloader/core/downloaders/esjzone/esjzone_sync.py +27 -0
- novel_downloader/core/downloaders/qianbi/__init__.py +14 -0
- novel_downloader/core/downloaders/qianbi/qianbi_async.py +27 -0
- novel_downloader/core/downloaders/qianbi/qianbi_sync.py +27 -0
- novel_downloader/core/downloaders/qidian/qidian_sync.py +9 -6
- novel_downloader/core/downloaders/sfacg/__init__.py +14 -0
- novel_downloader/core/downloaders/sfacg/sfacg_async.py +27 -0
- novel_downloader/core/downloaders/sfacg/sfacg_sync.py +27 -0
- novel_downloader/core/downloaders/yamibo/__init__.py +14 -0
- novel_downloader/core/downloaders/yamibo/yamibo_async.py +27 -0
- novel_downloader/core/downloaders/yamibo/yamibo_sync.py +27 -0
- novel_downloader/core/factory/downloader.py +35 -7
- novel_downloader/core/factory/parser.py +23 -2
- novel_downloader/core/factory/requester.py +32 -7
- novel_downloader/core/factory/saver.py +14 -2
- novel_downloader/core/interfaces/async_requester.py +3 -3
- novel_downloader/core/interfaces/parser.py +7 -2
- novel_downloader/core/interfaces/sync_requester.py +3 -3
- novel_downloader/core/parsers/__init__.py +15 -5
- novel_downloader/core/parsers/base.py +7 -2
- novel_downloader/core/parsers/biquge/main_parser.py +13 -4
- novel_downloader/core/parsers/common/main_parser.py +13 -4
- novel_downloader/core/parsers/esjzone/__init__.py +10 -0
- novel_downloader/core/parsers/esjzone/main_parser.py +219 -0
- novel_downloader/core/parsers/qianbi/__init__.py +10 -0
- novel_downloader/core/parsers/qianbi/main_parser.py +142 -0
- novel_downloader/core/parsers/qidian/browser/main_parser.py +13 -4
- novel_downloader/core/parsers/qidian/session/main_parser.py +13 -4
- novel_downloader/core/parsers/sfacg/__init__.py +10 -0
- novel_downloader/core/parsers/sfacg/main_parser.py +166 -0
- novel_downloader/core/parsers/yamibo/__init__.py +10 -0
- novel_downloader/core/parsers/yamibo/main_parser.py +194 -0
- novel_downloader/core/requesters/__init__.py +33 -3
- novel_downloader/core/requesters/base/async_session.py +14 -10
- novel_downloader/core/requesters/base/browser.py +4 -7
- novel_downloader/core/requesters/base/session.py +25 -11
- novel_downloader/core/requesters/biquge/__init__.py +2 -0
- novel_downloader/core/requesters/biquge/async_session.py +71 -0
- novel_downloader/core/requesters/biquge/session.py +6 -6
- novel_downloader/core/requesters/common/async_session.py +4 -4
- novel_downloader/core/requesters/common/session.py +6 -6
- novel_downloader/core/requesters/esjzone/__init__.py +13 -0
- novel_downloader/core/requesters/esjzone/async_session.py +211 -0
- novel_downloader/core/requesters/esjzone/session.py +235 -0
- novel_downloader/core/requesters/qianbi/__init__.py +13 -0
- novel_downloader/core/requesters/qianbi/async_session.py +96 -0
- novel_downloader/core/requesters/qianbi/session.py +125 -0
- novel_downloader/core/requesters/qidian/broswer.py +11 -10
- novel_downloader/core/requesters/qidian/session.py +14 -11
- novel_downloader/core/requesters/sfacg/__init__.py +13 -0
- novel_downloader/core/requesters/sfacg/async_session.py +204 -0
- novel_downloader/core/requesters/sfacg/session.py +242 -0
- novel_downloader/core/requesters/yamibo/__init__.py +13 -0
- novel_downloader/core/requesters/yamibo/async_session.py +211 -0
- novel_downloader/core/requesters/yamibo/session.py +237 -0
- novel_downloader/core/savers/__init__.py +15 -3
- novel_downloader/core/savers/base.py +1 -0
- novel_downloader/core/savers/esjzone.py +25 -0
- novel_downloader/core/savers/qianbi.py +25 -0
- novel_downloader/core/savers/sfacg.py +25 -0
- novel_downloader/core/savers/yamibo.py +25 -0
- novel_downloader/locales/en.json +1 -0
- novel_downloader/locales/zh.json +1 -0
- novel_downloader/resources/config/settings.toml +40 -4
- novel_downloader/utils/time_utils/__init__.py +2 -1
- novel_downloader/utils/time_utils/datetime_utils.py +3 -1
- novel_downloader/utils/time_utils/sleep_utils.py +43 -1
- {novel_downloader-1.3.0.dist-info → novel_downloader-1.3.2.dist-info}/METADATA +25 -20
- {novel_downloader-1.3.0.dist-info → novel_downloader-1.3.2.dist-info}/RECORD +85 -47
- {novel_downloader-1.3.0.dist-info → novel_downloader-1.3.2.dist-info}/WHEEL +0 -0
- {novel_downloader-1.3.0.dist-info → novel_downloader-1.3.2.dist-info}/entry_points.txt +0 -0
- {novel_downloader-1.3.0.dist-info → novel_downloader-1.3.2.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.3.0.dist-info → novel_downloader-1.3.2.dist-info}/top_level.txt +0 -0
novel_downloader/core/factory/parser.py
@@ -14,19 +14,40 @@ from novel_downloader.core.interfaces import ParserProtocol
 from novel_downloader.core.parsers import (
     BiqugeParser,
     CommonParser,
+    EsjzoneParser,
+    QianbiParser,
     QidianBrowserParser,
     QidianSessionParser,
+    SfacgParser,
+    YamiboParser,
 )
 
 ParserBuilder = Callable[[ParserConfig], ParserProtocol]
 
 _site_map: dict[str, dict[str, ParserBuilder]] = {
+    "biquge": {
+        "session": BiqugeParser,
+        "async": BiqugeParser,
+    },
+    "esjzone": {
+        "session": EsjzoneParser,
+        "async": EsjzoneParser,
+    },
+    "qianbi": {
+        "session": QianbiParser,
+        "async": QianbiParser,
+    },
     "qidian": {
         "browser": QidianBrowserParser,
         "session": QidianSessionParser,
     },
-    "biquge": {
-        "session": BiqugeParser,
+    "sfacg": {
+        "session": SfacgParser,
+        "async": SfacgParser,
+    },
+    "yamibo": {
+        "session": YamiboParser,
+        "async": YamiboParser,
     },
 }
 
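The nested map keys each site to its per-mode parser builders ("session" and "async" share one class for the new sites, while qidian keeps separate "browser" and "session" implementations). A minimal sketch of the dispatch this enables, assuming a get_parser(site, mode, config) helper of the same shape as the other factories in this release (the real body is outside this hunk and presumably falls back to the rule-driven CommonParser via load_site_rules):

    def get_parser(site: str, mode: str, config: ParserConfig) -> ParserProtocol:
        site_key = site.lower()
        if site_key in _site_map:
            # e.g. _site_map["sfacg"]["async"] -> SfacgParser
            return _site_map[site_key][mode](config)
        raise ValueError(f"Unsupported site: {site!r}")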
novel_downloader/core/factory/requester.py
@@ -15,30 +15,55 @@ from novel_downloader.core.interfaces import (
     SyncRequesterProtocol,
 )
 from novel_downloader.core.requesters import (
+    BiqugeAsyncSession,
     BiqugeSession,
     CommonAsyncSession,
     CommonSession,
+    EsjzoneAsyncSession,
+    EsjzoneSession,
+    QianbiAsyncSession,
+    QianbiSession,
     QidianBrowser,
     QidianSession,
+    SfacgAsyncSession,
+    SfacgSession,
+    YamiboAsyncSession,
+    YamiboSession,
 )
 
 AsyncRequesterBuilder = Callable[[RequesterConfig], AsyncRequesterProtocol]
 SyncRequesterBuilder = Callable[[RequesterConfig], SyncRequesterProtocol]
 
 
-
-
-
+_async_site_map: dict[str, AsyncRequesterBuilder] = {
+    "biquge": BiqugeAsyncSession,
+    "esjzone": EsjzoneAsyncSession,
+    "qianbi": QianbiAsyncSession,
+    "sfacg": SfacgAsyncSession,
+    "yamibo": YamiboAsyncSession,
+}
 _sync_site_map: dict[
     str,
     dict[str, SyncRequesterBuilder],
 ] = {
+    "biquge": {
+        "session": BiqugeSession,
+    },
+    "esjzone": {
+        "session": EsjzoneSession,
+    },
+    "qianbi": {
+        "session": QianbiSession,
+    },
     "qidian": {
         "session": QidianSession,
         "browser": QidianBrowser,
     },
-    "biquge": {
-        "session": BiqugeSession,
+    "sfacg": {
+        "session": SfacgSession,
+    },
+    "yamibo": {
+        "session": YamiboSession,
     },
 }
 
@@ -57,8 +82,8 @@ def get_async_requester(
     site_key = site.lower()
 
     # site-specific
-
-
+    if site_key in _async_site_map:
+        return _async_site_map[site_key](config)
 
     # fallback
     site_rules = load_site_rules()
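With the flat _async_site_map, each of the five mapped sites now resolves in a single lookup before the rule-based fallback. A hedged usage example (assuming RequesterConfig is exported from novel_downloader.config like SaverConfig is, and that it default-constructs):

    from novel_downloader.config import RequesterConfig
    from novel_downloader.core.factory.requester import get_async_requester

    # "sfacg" hits _async_site_map directly instead of the load_site_rules() path
    requester = get_async_requester(site="sfacg", config=RequesterConfig())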
novel_downloader/core/factory/saver.py
@@ -7,17 +7,29 @@ This module implements a factory function for creating saver instances
 based on the site name and parser mode specified in the configuration.
 """
 
+from collections.abc import Callable
+
 from novel_downloader.config import SaverConfig, load_site_rules
 from novel_downloader.core.interfaces import SaverProtocol
 from novel_downloader.core.savers import (
     BiqugeSaver,
     CommonSaver,
+    EsjzoneSaver,
+    QianbiSaver,
     QidianSaver,
+    SfacgSaver,
+    YamiboSaver,
 )
 
-
-
+SaverBuilder = Callable[[SaverConfig], SaverProtocol]
+
+_site_map: dict[str, SaverBuilder] = {
     "biquge": BiqugeSaver,
+    "esjzone": EsjzoneSaver,
+    "qianbi": QianbiSaver,
+    "qidian": QidianSaver,
+    "sfacg": SfacgSaver,
+    "yamibo": YamiboSaver,
 }
 
 
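Saver dispatch likewise collapses to one flat lookup. A minimal sketch of a get_saver body consistent with this map (the actual function is outside the hunk; it presumably keeps a rule-driven fallback via load_site_rules like the other factories):

    def get_saver(site: str, config: SaverConfig) -> SaverProtocol:
        site_key = site.lower()
        if site_key in _site_map:
            return _site_map[site_key](config)
        raise ValueError(f"Unsupported site: {site!r}")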
novel_downloader/core/interfaces/async_requester.py
@@ -40,7 +40,7 @@ class AsyncRequesterProtocol(Protocol):
         self,
         book_id: str,
         **kwargs: Any,
-    ) -> str:
+    ) -> list[str]:
         """
         Fetch the raw HTML (or JSON) of the book info page asynchronously.
 
@@ -54,7 +54,7 @@ class AsyncRequesterProtocol(Protocol):
         book_id: str,
         chapter_id: str,
         **kwargs: Any,
-    ) -> str:
+    ) -> list[str]:
         """
         Fetch the raw HTML (or JSON) of a single chapter asynchronously.
 
@@ -68,7 +68,7 @@ class AsyncRequesterProtocol(Protocol):
         self,
         page: int = 1,
         **kwargs: Any,
-    ) -> str:
+    ) -> list[str]:
         """
         Optional: Retrieve the HTML content of the authenticated
         user's bookcase page asynchronously.
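The str -> list[str] change lets one logical fetch return several physical pages (for example, a chapter spread across pagination), and parsers then receive the whole list. A toy coroutine matching the new annotation (the function name and URL here are hypothetical; the protocol's real method names sit outside these context lines):

    import aiohttp

    async def fetch_chapter_pages(book_id: str, chapter_id: str) -> list[str]:
        # One list entry per physical page of the chapter.
        async with aiohttp.ClientSession() as session:
            async with session.get(
                f"https://example.com/book/{book_id}/{chapter_id}.html"
            ) as resp:
                return [await resp.text()]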
novel_downloader/core/interfaces/parser.py
@@ -21,7 +21,11 @@ class ParserProtocol(Protocol):
     - accept a book_id context for multi-step workflows.
     """
 
-    def parse_book_info(self, html_str: str) -> dict[str, Any]:
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
         """
         Parse and return a dictionary of book information from the raw HTML.
 
@@ -32,8 +36,9 @@ class ParserProtocol(Protocol):
 
     def parse_chapter(
         self,
-        html_str: str,
+        html_str: list[str],
         chapter_id: str,
+        **kwargs: Any,
     ) -> ChapterDict | None:
         """
         Parse and return the text content of one chapter.
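A minimal class that structurally satisfies the updated protocol, for illustration only (the return dict follows the ChapterDict shape used by the parsers later in this diff):

    from typing import Any

    from novel_downloader.utils.chapter_storage import ChapterDict

    class EchoParser:
        def parse_book_info(self, html_str: list[str], **kwargs: Any) -> dict[str, Any]:
            return {"pages": len(html_str)} if html_str else {}

        def parse_chapter(
            self,
            html_str: list[str],
            chapter_id: str,
            **kwargs: Any,
        ) -> ChapterDict | None:
            if not html_str:
                return None
            return {
                "id": chapter_id,
                "title": "",
                "content": "\n\n".join(html_str),
                "extra": {},
            }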
novel_downloader/core/interfaces/sync_requester.py
@@ -38,7 +38,7 @@ class SyncRequesterProtocol(Protocol):
         self,
         book_id: str,
         **kwargs: Any,
-    ) -> str:
+    ) -> list[str]:
         """
         Fetch the raw HTML (or JSON) of the book info page.
 
@@ -52,7 +52,7 @@ class SyncRequesterProtocol(Protocol):
         book_id: str,
         chapter_id: str,
         **kwargs: Any,
-    ) -> str:
+    ) -> list[str]:
         """
         Fetch the raw HTML (or JSON) of a single chapter.
 
@@ -66,7 +66,7 @@ class SyncRequesterProtocol(Protocol):
         self,
         page: int = 1,
         **kwargs: Any,
-    ) -> str:
+    ) -> list[str]:
         """
         Optional: Retrieve the HTML content of the authenticated user's bookcase page.
 
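The sync protocol mirrors the async change; the same contract in blocking form (again with a hypothetical function name and URL):

    import requests

    def fetch_book_info_pages(book_id: str) -> list[str]:
        resp = requests.get(f"https://example.com/book/{book_id}")
        resp.raise_for_status()
        # Single-page sites wrap the one document in a list.
        return [resp.text]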
novel_downloader/core/parsers/__init__.py
@@ -6,24 +6,34 @@ novel_downloader.core.parsers
 This package defines all site-specific parsing modules
 for the novel_downloader framework.
 
-Currently supported:
-- Qidian (起点中文网)
-
 Modules:
--
--
+- biquge (笔趣阁)
+- esjzone (ESJ Zone)
+- qianbi (铅笔小说)
+- qidian (起点中文网)
+- sfacg (SF轻小说)
+- yamibo (百合会)
+- common (通用架构)
 """
 
 from .biquge import BiqugeParser
 from .common import CommonParser
+from .esjzone import EsjzoneParser
+from .qianbi import QianbiParser
 from .qidian import (
     QidianBrowserParser,
     QidianSessionParser,
 )
+from .sfacg import SfacgParser
+from .yamibo import YamiboParser
 
 __all__ = [
     "BiqugeParser",
     "CommonParser",
+    "EsjzoneParser",
+    "QianbiParser",
     "QidianBrowserParser",
     "QidianSessionParser",
+    "SfacgParser",
+    "YamiboParser",
 ]
novel_downloader/core/parsers/base.py
@@ -49,7 +49,11 @@ class BaseParser(ParserProtocol, abc.ABC):
         self._cache_dir = self._base_cache_dir
 
     @abc.abstractmethod
-    def parse_book_info(self, html_str: str) -> dict[str, Any]:
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
         """
         Parse a book info page and extract metadata and chapter structure.
 
@@ -64,8 +68,9 @@ class BaseParser(ParserProtocol, abc.ABC):
     @abc.abstractmethod
     def parse_chapter(
         self,
-        html_str: str,
+        html_str: list[str],
         chapter_id: str,
+        **kwargs: Any,
     ) -> ChapterDict | None:
         """
         Parse a single chapter page and extract clean text or simplified HTML.
novel_downloader/core/parsers/biquge/main_parser.py
@@ -18,14 +18,20 @@ from novel_downloader.utils.chapter_storage import ChapterDict
 class BiqugeParser(BaseParser):
     """ """
 
-    def parse_book_info(self, html_str: str) -> dict[str, Any]:
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
         """
         Parse a book info page and extract metadata and chapter structure.
 
         :param html: Raw HTML of the book info page.
         :return: Parsed metadata and chapter structure as a dictionary.
         """
-        tree = etree.HTML(html_str)
+        if not html_str:
+            return {}
+        tree = etree.HTML(html_str[0])
         result: dict[str, Any] = {}
 
         def extract_text(elem: _Element | None) -> str:
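Call sites that used to pass one HTML string now pass a one-element list, and an empty fetch result returns {} instead of handing None to lxml. Sketch, given an already-constructed BiqugeParser:

    # parser: a BiqugeParser instance; pages: the list a requester returned
    info = parser.parse_book_info(pages)       # e.g. pages == [html]
    assert parser.parse_book_info([]) == {}    # new empty-input guard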
@@ -90,8 +96,9 @@ class BiqugeParser(BaseParser):
 
     def parse_chapter(
         self,
-        html_str: str,
+        html_str: list[str],
         chapter_id: str,
+        **kwargs: Any,
     ) -> ChapterDict | None:
         """
         Parse a single chapter page and extract clean text or simplified HTML.
@@ -100,7 +107,9 @@ class BiqugeParser(BaseParser):
         :param chapter_id: Identifier of the chapter being parsed.
         :return: Cleaned chapter content as plain text or minimal HTML.
         """
-        tree = etree.HTML(html_str, parser=None)
+        if not html_str:
+            return None
+        tree = etree.HTML(html_str[0], parser=None)
 
         # 提取标题
         title_elem = tree.xpath('//div[@class="bookname"]/h1')
novel_downloader/core/parsers/common/main_parser.py
@@ -35,21 +35,28 @@ class CommonParser(BaseParser):
         self._site = site
         self._site_rule = site_rule
 
-    def parse_book_info(self, html_str: str) -> dict[str, Any]:
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
         """
         Parse a book info page and extract metadata and chapter structure.
 
         :param html: Raw HTML of the book info page.
         :return: Parsed metadata and chapter structure as a dictionary.
         """
-        extractor = HTMLExtractor(html_str)
+        if not html_str:
+            return {}
+        extractor = HTMLExtractor(html_str[0])
         rules = self._site_rule["book_info"]
         return extractor.extract_book_info(rules)
 
     def parse_chapter(
         self,
-        html_str: str,
+        html_str: list[str],
         chapter_id: str,
+        **kwargs: Any,
     ) -> ChapterDict | None:
         """
         Parse a single chapter page and extract clean text or simplified HTML.
@@ -58,7 +65,9 @@ class CommonParser(BaseParser):
         :param chapter_id: Identifier of the chapter being parsed.
         :return: Cleaned chapter content as plain text or minimal HTML.
         """
-        extractor = HTMLExtractor(html_str)
+        if not html_str:
+            return None
+        extractor = HTMLExtractor(html_str[0])
         chapter_rules = self._site_rule["chapter"]
 
         # 必须有正文内容
novel_downloader/core/parsers/esjzone/main_parser.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.esjzone.main_parser
+-------------------------------------------------
+
+"""
+
+from typing import Any
+
+from lxml import etree
+from lxml.etree import _Element
+
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.utils.chapter_storage import ChapterDict
+
+
+class EsjzoneParser(BaseParser):
+    """ """
+
+    # Book info XPaths
+    _BOOK_NAME_XPATH = '//h2[contains(@class, "text-normal")]/text()'
+    _AUTHOR_XPATH = '//li[strong[text()="作者:"]]/a/text()'
+    _COVER_URL_XPATH = '//div[contains(@class,"product-gallery")]//img/@src'
+    _UPDATE_TIME_XPATH = '//li[strong[text()="更新日期:"]]/text()'
+    _WORD_COUNT_XPATH = '//span[@id="txt"]/text()'
+    _TYPE_XPATH = '//li[strong[text()="類型:"]]/text()'
+    _ALT_NAME_XPATH = '//li[strong[text()="其他書名:"]]/text()'
+    _WEB_URL_XPATH = '//li[strong[text()="Web生肉:"]]/a/@href'
+    _SUMMARY_XPATH = '//div[@class="description"]/p//text()'
+
+    # Chapter XPaths
+    _CHAPTER_TEXT_XPATH = 'string(//div[contains(@class, "forum-content")])'
+    _CHAPTER_CONTENT_NODES_XPATH = '//div[contains(@class, "forum-content")]/*'
+    _CHAPTER_TIME_XPATHS = [
+        '//i[contains(@class, "icon-clock")]/following-sibling::text()',
+        '//i[contains(@class, "icon-pen-tool")]/following-sibling::text()',
+    ]
+
+    _CHECK_FORUM_XPATH = '//div[@class="page-title"]//ul[@class="breadcrumbs"]/li[not(@class="slash")]//text()'  # noqa: E501
+
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
+        """
+        Parse a book info page and extract metadata and chapter structure.
+
+        注: 由于网站使用了多种不同的分卷格式, 已经尝试兼容常见情况,
+        但仍可能存在未覆盖的 cases
+
+        :param html: Raw HTML of the book info page.
+        :return: Parsed metadata and chapter structure as a dictionary.
+        """
+        if not html_str or self._is_forum_page(html_str):
+            return {}
+        tree = etree.HTML(html_str[0])
+        result: dict[str, Any] = {}
+
+        result["book_name"] = self._get_text(tree, self._BOOK_NAME_XPATH)
+        result["author"] = self._get_text(tree, self._AUTHOR_XPATH)
+        result["cover_url"] = self._get_text(tree, self._COVER_URL_XPATH)
+        result["update_time"] = self._get_text(tree, self._UPDATE_TIME_XPATH)
+        result["word_count"] = self._get_text(
+            tree, self._WORD_COUNT_XPATH, clean_comma=True
+        )
+        result["type"] = self._get_text(tree, self._TYPE_XPATH)
+        result["alt_name"] = self._get_text(tree, self._ALT_NAME_XPATH)
+        result["web_url"] = self._get_text(tree, self._WEB_URL_XPATH)
+        # result["summary"] = self._get_text(tree, self._SUMMARY_XPATH, join=True)
+        paras = tree.xpath('//div[@class="description"]/p')
+        texts = [p.xpath("string()").strip() for p in paras]
+        result["summary"] = "\n".join(texts).strip()
+
+        volumes: list[dict[str, Any]] = []
+        current_vol: dict[str, Any] = {}
+
+        def _start_volume(name: str) -> None:
+            nonlocal current_vol
+            name = name.strip() or "未命名卷"
+            if name == "未命名卷" and current_vol is not None:
+                return
+            current_vol = {"volume_name": name, "chapters": []}
+            volumes.append(current_vol)
+
+        _start_volume("單卷")
+
+        nodes = tree.xpath('//div[@id="chapterList"]/details') + tree.xpath(
+            '//div[@id="chapterList"]/*[not(self::details)]'
+        )
+
+        for node in nodes:
+            tag = node.tag.lower()
+
+            if tag == "details":
+                # ---- DETAILS‐based layout ----
+                summary = node.find("summary")
+                vol_name = summary.text if summary is not None else "未命名卷"
+                _start_volume(vol_name)
+
+                # all chapters inside this details
+                for a in node.findall("a"):
+                    title = "".join(a.xpath(".//p//text()")).strip()
+                    href = a.get("href", "")
+                    chap_id = href.rstrip("/").split("/")[-1].split(".", 1)[0]
+                    current_vol["chapters"].append(
+                        {"title": title, "url": href, "chapterId": chap_id}
+                    )
+
+            elif (
+                tag in ("h2",)
+                or (tag == "p" and node.get("class") == "non")
+                or tag == "summary"
+            ):
+                # Handle possible volume title markers:
+                # - <h2>: standard volume header
+                # - <p class="non">: alternative volume header style
+                # - <summary>: fallback for stray <summary> tags outside <details>
+                _start_volume(node.xpath("string()"))
+
+            elif tag == "a":
+                # ---- chapter link, attach to current volume ----
+                title = "".join(node.xpath(".//p//text()")).strip()
+                href = node.get("href", "")
+                chap_id = href.rstrip("/").split("/")[-1].split(".", 1)[0]
+                current_vol["chapters"].append(
+                    {"title": title, "url": href, "chapterId": chap_id}
+                )
+        volumes = [vol for vol in volumes if vol["chapters"]]
+        result["volumes"] = volumes
+
+        return result
+
+    def parse_chapter(
+        self,
+        html_str: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        """
+        Parse a single chapter page and extract clean text or simplified HTML.
+
+        :param html: Raw HTML of the chapter page.
+        :param chapter_id: Identifier of the chapter being parsed.
+        :return: Cleaned chapter content as plain text or minimal HTML.
+        """
+        if not html_str or self._is_forum_page(html_str):
+            return None
+        tree = etree.HTML(html_str[0], parser=None)
+
+        content_lines: list[str] = []
+        content_nodes = tree.xpath(self._CHAPTER_CONTENT_NODES_XPATH)
+        for node in content_nodes:
+            if node.tag == "p":
+                img_srcs = node.xpath(".//img/@src")
+                if img_srcs:
+                    for src in img_srcs:
+                        content_lines.append(f'<img src="{src}" />')
+                else:
+                    text = "".join(node.xpath(".//text()")).strip()
+                    if text:
+                        content_lines.append(text)
+            elif node.tag == "a":
+                img_srcs = node.xpath(".//img/@src")
+                for src in img_srcs:
+                    content_lines.append(f'<img src="{src}" />')
+
+        content = (
+            "\n\n".join(content_lines).strip()
+            if content_lines
+            else tree.xpath(self._CHAPTER_TEXT_XPATH).strip()
+        )
+        if not content:
+            return None
+
+        title_nodes = tree.xpath("//h2/text()")
+        title = title_nodes[0].strip() if title_nodes else ""
+
+        updated_at = next(
+            (
+                x.strip()
+                for xp in self._CHAPTER_TIME_XPATHS
+                for x in tree.xpath(xp)
+                if x.strip()
+            ),
+            "",
+        )
+
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {"site": "esjzone", "updated_at": updated_at},
+        }
+
+    def _is_forum_page(self, html_str: list[str]) -> bool:
+        if not html_str:
+            return False
+
+        tree = etree.HTML(html_str[0])
+        page_title = tree.xpath('string(//div[@class="page-title"]//h1)').strip()
+        if page_title != "論壇":
+            return False
+        breadcrumb: list[str] = tree.xpath(self._CHECK_FORUM_XPATH)
+        breadcrumb = [s.strip() for s in breadcrumb if s.strip()]
+        return breadcrumb == ["Home", "論壇"]
+
+    @staticmethod
+    def _get_text(
+        tree: _Element,
+        xpath: str,
+        join: bool = False,
+        clean_comma: bool = False,
+    ) -> str:
+        data = tree.xpath(xpath)
+        if not data:
+            return ""
+        text = "\n".join(data) if join else data[0].strip()
+        return text.replace(",", "") if clean_comma else text
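The chapter-ID extraction used in both branches of parse_book_info above is a plain string split; a standalone check (the URL is illustrative only):

    href = "https://example.org/forum/1567/69939.html"
    chap_id = href.rstrip("/").split("/")[-1].split(".", 1)[0]
    assert chap_id == "69939"  # path and ".html" suffix stripped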