novel-downloader 1.3.1 → 1.3.2 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/download.py +1 -1
- novel_downloader/config/adapter.py +3 -0
- novel_downloader/config/models.py +3 -0
- novel_downloader/core/downloaders/__init__.py +23 -1
- novel_downloader/core/downloaders/biquge/__init__.py +2 -0
- novel_downloader/core/downloaders/biquge/biquge_async.py +27 -0
- novel_downloader/core/downloaders/biquge/biquge_sync.py +5 -3
- novel_downloader/core/downloaders/common/common_async.py +5 -3
- novel_downloader/core/downloaders/common/common_sync.py +18 -10
- novel_downloader/core/downloaders/esjzone/__init__.py +14 -0
- novel_downloader/core/downloaders/esjzone/esjzone_async.py +27 -0
- novel_downloader/core/downloaders/esjzone/esjzone_sync.py +27 -0
- novel_downloader/core/downloaders/qianbi/__init__.py +14 -0
- novel_downloader/core/downloaders/qianbi/qianbi_async.py +27 -0
- novel_downloader/core/downloaders/qianbi/qianbi_sync.py +27 -0
- novel_downloader/core/downloaders/qidian/qidian_sync.py +9 -6
- novel_downloader/core/downloaders/sfacg/__init__.py +14 -0
- novel_downloader/core/downloaders/sfacg/sfacg_async.py +27 -0
- novel_downloader/core/downloaders/sfacg/sfacg_sync.py +27 -0
- novel_downloader/core/downloaders/yamibo/__init__.py +14 -0
- novel_downloader/core/downloaders/yamibo/yamibo_async.py +27 -0
- novel_downloader/core/downloaders/yamibo/yamibo_sync.py +27 -0
- novel_downloader/core/factory/downloader.py +35 -7
- novel_downloader/core/factory/parser.py +23 -2
- novel_downloader/core/factory/requester.py +32 -7
- novel_downloader/core/factory/saver.py +14 -2
- novel_downloader/core/interfaces/async_requester.py +3 -3
- novel_downloader/core/interfaces/parser.py +7 -2
- novel_downloader/core/interfaces/sync_requester.py +3 -3
- novel_downloader/core/parsers/__init__.py +15 -5
- novel_downloader/core/parsers/base.py +7 -2
- novel_downloader/core/parsers/biquge/main_parser.py +13 -4
- novel_downloader/core/parsers/common/main_parser.py +13 -4
- novel_downloader/core/parsers/esjzone/__init__.py +10 -0
- novel_downloader/core/parsers/esjzone/main_parser.py +219 -0
- novel_downloader/core/parsers/qianbi/__init__.py +10 -0
- novel_downloader/core/parsers/qianbi/main_parser.py +142 -0
- novel_downloader/core/parsers/qidian/browser/main_parser.py +13 -4
- novel_downloader/core/parsers/qidian/session/main_parser.py +13 -4
- novel_downloader/core/parsers/sfacg/__init__.py +10 -0
- novel_downloader/core/parsers/sfacg/main_parser.py +166 -0
- novel_downloader/core/parsers/yamibo/__init__.py +10 -0
- novel_downloader/core/parsers/yamibo/main_parser.py +194 -0
- novel_downloader/core/requesters/__init__.py +33 -3
- novel_downloader/core/requesters/base/async_session.py +14 -10
- novel_downloader/core/requesters/base/browser.py +4 -7
- novel_downloader/core/requesters/base/session.py +25 -11
- novel_downloader/core/requesters/biquge/__init__.py +2 -0
- novel_downloader/core/requesters/biquge/async_session.py +71 -0
- novel_downloader/core/requesters/biquge/session.py +6 -6
- novel_downloader/core/requesters/common/async_session.py +4 -4
- novel_downloader/core/requesters/common/session.py +6 -6
- novel_downloader/core/requesters/esjzone/__init__.py +13 -0
- novel_downloader/core/requesters/esjzone/async_session.py +211 -0
- novel_downloader/core/requesters/esjzone/session.py +235 -0
- novel_downloader/core/requesters/qianbi/__init__.py +13 -0
- novel_downloader/core/requesters/qianbi/async_session.py +96 -0
- novel_downloader/core/requesters/qianbi/session.py +125 -0
- novel_downloader/core/requesters/qidian/broswer.py +9 -9
- novel_downloader/core/requesters/qidian/session.py +14 -11
- novel_downloader/core/requesters/sfacg/__init__.py +13 -0
- novel_downloader/core/requesters/sfacg/async_session.py +204 -0
- novel_downloader/core/requesters/sfacg/session.py +242 -0
- novel_downloader/core/requesters/yamibo/__init__.py +13 -0
- novel_downloader/core/requesters/yamibo/async_session.py +211 -0
- novel_downloader/core/requesters/yamibo/session.py +237 -0
- novel_downloader/core/savers/__init__.py +15 -3
- novel_downloader/core/savers/base.py +1 -0
- novel_downloader/core/savers/esjzone.py +25 -0
- novel_downloader/core/savers/qianbi.py +25 -0
- novel_downloader/core/savers/sfacg.py +25 -0
- novel_downloader/core/savers/yamibo.py +25 -0
- novel_downloader/locales/en.json +1 -0
- novel_downloader/locales/zh.json +1 -0
- novel_downloader/resources/config/settings.toml +40 -4
- novel_downloader/utils/time_utils/__init__.py +2 -1
- novel_downloader/utils/time_utils/datetime_utils.py +3 -1
- novel_downloader/utils/time_utils/sleep_utils.py +43 -1
- {novel_downloader-1.3.1.dist-info → novel_downloader-1.3.2.dist-info}/METADATA +25 -20
- {novel_downloader-1.3.1.dist-info → novel_downloader-1.3.2.dist-info}/RECORD +85 -47
- {novel_downloader-1.3.1.dist-info → novel_downloader-1.3.2.dist-info}/WHEEL +0 -0
- {novel_downloader-1.3.1.dist-info → novel_downloader-1.3.2.dist-info}/entry_points.txt +0 -0
- {novel_downloader-1.3.1.dist-info → novel_downloader-1.3.2.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.3.1.dist-info → novel_downloader-1.3.2.dist-info}/top_level.txt +0 -0
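In summary, 1.3.2 adds four new site backends (esjzone, qianbi, sfacg, yamibo), an async session for biquge, and changes the shared parser interface: `parse_book_info` and `parse_chapter` now take a `list[str]` of raw HTML pages plus `**kwargs` instead of a single string. Below is a minimal, self-contained sketch of the new call convention; the toy class only mirrors the signatures, while the real parsers subclass `BaseParser` and take a config object whose construction is not shown in this diff:

```python
# Toy illustration of the 1.3.2 parser call convention (not the real API).
from typing import Any


class ToyParser:
    def parse_book_info(
        self, html_str: list[str], **kwargs: Any
    ) -> dict[str, Any]:
        # Multi-page sites (qianbi, sfacg) receive [info_page, catalog_page].
        if len(html_str) < 2:
            return {}
        return {"book_name": "demo", "volumes": []}

    def parse_chapter(
        self, html_str: list[str], chapter_id: str, **kwargs: Any
    ) -> dict[str, Any] | None:
        # Chapter parsing receives a one-element list: [chapter_page].
        if not html_str:
            return None
        return {"id": chapter_id, "title": "", "content": "...", "extra": {}}


parser = ToyParser()
parser.parse_book_info(["<html>info</html>", "<html>catalog</html>"])
parser.parse_chapter(["<html>chapter</html>"], chapter_id="456")
```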
novel_downloader/core/parsers/qianbi/main_parser.py (new file):

```diff
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.biquge.main_parser
+------------------------------------------------
+
+"""
+
+from datetime import datetime
+from typing import Any
+
+from lxml import etree
+
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.utils.chapter_storage import ChapterDict
+
+
+class QianbiParser(BaseParser):
+    """ """
+
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
+        """
+        Parse a book info page and extract metadata and chapter structure.
+
+        :param html: Raw HTML of the book info page.
+        :return: Parsed metadata and chapter structure as a dictionary.
+        """
+        if len(html_str) < 2:
+            return {}
+
+        info_tree = etree.HTML(html_str[0])
+        catalog_tree = etree.HTML(html_str[1])
+        result: dict[str, Any] = {}
+
+        title = info_tree.xpath('//h1[@class="page-title"]/text()')
+        result["book_name"] = title[0].strip() if title else ""
+
+        author = info_tree.xpath('//a[contains(@href,"/author/")]/@title')
+        result["author"] = author[0].strip() if author else ""
+
+        cover = info_tree.xpath('//div[@class="novel-cover"]//img/@data-src')
+        result["cover_url"] = cover[0].strip() if cover else ""
+
+        status = info_tree.xpath(
+            '//a[@class="tag-link" and (text()="完结" or text()="连载")]/text()'
+        )
+        result["serial_status"] = status[0] if status else ""
+
+        word_count_raw = info_tree.xpath('//span[contains(text(), "万字")]/text()')
+        result["word_count"] = word_count_raw[0].strip() if word_count_raw else ""
+
+        summary_node = info_tree.xpath(
+            '//div[@class="novel-info-item novel-info-content"]/span'
+        )
+        if summary_node and summary_node[0] is not None:
+            result["summary"] = etree.tostring(
+                summary_node[0], encoding="unicode", method="text"
+            ).strip()
+        else:
+            result["summary"] = ""
+
+        result["update_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+        volumes: list[dict[str, Any]] = []
+        current_volume = None
+
+        for elem in catalog_tree.xpath('//div[@class="box"]/*'):
+            class_attr = elem.get("class", "")
+            class_list = class_attr.split()
+
+            if elem.tag == "h2" and "module-title" in class_list:
+                if current_volume:
+                    volumes.append(current_volume)
+                current_volume = {
+                    "volume_name": elem.text.strip() if elem.text else "",
+                    "chapters": [],
+                }
+            elif (
+                elem.tag == "div" and "module-row-info" in class_list and current_volume
+            ):
+                a_tag = elem.xpath('.//a[@class="module-row-text"]')
+                if a_tag:
+                    title = a_tag[0].xpath(".//span/text()")
+                    href = a_tag[0].attrib.get("href", "")
+                    chapter_id = (
+                        href.split("/")[-1].replace(".html", "") if href else ""
+                    )
+                    current_volume["chapters"].append(
+                        {
+                            "title": title[0].strip() if title else "",
+                            "url": href,
+                            "chapterId": chapter_id,
+                        }
+                    )
+
+        if current_volume:
+            volumes.append(current_volume)
+
+        result["volumes"] = volumes
+
+        return result
+
+    def parse_chapter(
+        self,
+        html_str: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        """
+        Parse a single chapter page and extract clean text or simplified HTML.
+
+        :param html: Raw HTML of the chapter page.
+        :param chapter_id: Identifier of the chapter being parsed.
+        :return: Cleaned chapter content as plain text or minimal HTML.
+        """
+        if not html_str:
+            return None
+        tree = etree.HTML(html_str[0])
+
+        paras = tree.xpath('//div[@class="article-content"]/p/text()')
+        content_text = "\n\n".join(p.strip() for p in paras if p.strip())
+        if not content_text:
+            return None
+
+        title = tree.xpath('//h1[@class="article-title"]/text()')
+        title_text = title[0].strip() if title else ""
+
+        volume = tree.xpath('//h3[@class="text-muted"]/text()')
+        volume_text = volume[0].strip() if volume else ""
+
+        return {
+            "id": chapter_id,
+            "title": title_text,
+            "content": content_text,
+            "extra": {
+                "site": "qianbi",
+                "volume": volume_text,
+            },
+        }
```
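The catalog walk above iterates the direct children of `div.box`, starting a new volume at each `h2.module-title` and attaching `module-row-info` rows to the current volume. A runnable illustration of the same logic against a made-up snippet shaped like qianbi's markup (the sample HTML is an assumption):

```python
# Standalone demo of the volume/chapter catalog walk used by QianbiParser.
from lxml import etree

catalog_html = """
<div class="box">
  <h2 class="module-title">第一卷</h2>
  <div class="module-row-info">
    <a class="module-row-text" href="/book/1/101.html"><span>第1章</span></a>
  </div>
  <h2 class="module-title">第二卷</h2>
  <div class="module-row-info">
    <a class="module-row-text" href="/book/1/201.html"><span>第2章</span></a>
  </div>
</div>
"""

tree = etree.HTML(catalog_html)
volumes: list[dict] = []
current = None
for elem in tree.xpath('//div[@class="box"]/*'):
    classes = elem.get("class", "").split()
    if elem.tag == "h2" and "module-title" in classes:
        if current:
            volumes.append(current)  # close the previous volume
        current = {"volume_name": (elem.text or "").strip(), "chapters": []}
    elif elem.tag == "div" and "module-row-info" in classes and current:
        a = elem.xpath('.//a[@class="module-row-text"]')[0]
        href = a.get("href", "")
        current["chapters"].append(
            {
                "title": "".join(a.xpath(".//span/text()")).strip(),
                "url": href,
                "chapterId": href.split("/")[-1].replace(".html", ""),
            }
        )
if current:
    volumes.append(current)  # flush the last volume

print(volumes)  # two volumes with one chapter each
```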
novel_downloader/core/parsers/qidian/browser/main_parser.py:

```diff
@@ -69,26 +69,35 @@ class QidianBrowserParser(BaseParser):
         self._font_debug_dir = self._base_cache_dir / "qidian" / "font_debug"
         self._font_debug_dir.mkdir(parents=True, exist_ok=True)
 
-    def parse_book_info(self, html_str: str) -> dict[str, Any]:
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
         """
         Parse a book info page and extract metadata and chapter structure.
 
         :param html_str: Raw HTML of the book info page.
         :return: Parsed metadata and chapter structure as a dictionary.
         """
-        return parse_book_info(html_str)
+        if not html_str:
+            return {}
+        return parse_book_info(html_str[0])
 
     def parse_chapter(
         self,
-        html_str: str,
+        html_str: list[str],
         chapter_id: str,
+        **kwargs: Any,
     ) -> ChapterDict | None:
         """
         :param html: Raw HTML of the chapter page.
         :param chapter_id: Identifier of the chapter being parsed.
         :return: Cleaned chapter content as plain text.
         """
-        return parse_chapter(self, html_str, chapter_id)
+        if not html_str:
+            return None
+        return parse_chapter(self, html_str[0], chapter_id)
 
     def is_encrypted(self, html_str: str) -> bool:
         """
```
novel_downloader/core/parsers/qidian/session/main_parser.py:

```diff
@@ -72,26 +72,35 @@ class QidianSessionParser(BaseParser):
         self._font_debug_dir = self._base_cache_dir / "qidian" / "font_debug"
         self._font_debug_dir.mkdir(parents=True, exist_ok=True)
 
-    def parse_book_info(self, html_str: str) -> dict[str, Any]:
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
         """
         Parse a book info page and extract metadata and chapter structure.
 
         :param html_str: Raw HTML of the book info page.
         :return: Parsed metadata and chapter structure as a dictionary.
         """
-        return parse_book_info(html_str)
+        if not html_str:
+            return {}
+        return parse_book_info(html_str[0])
 
     def parse_chapter(
         self,
-        html_str: str,
+        html_str: list[str],
         chapter_id: str,
+        **kwargs: Any,
     ) -> ChapterDict | None:
         """
         :param html: Raw HTML of the chapter page.
         :param chapter_id: Identifier of the chapter being parsed.
         :return: Cleaned chapter content as plain text.
         """
-        return parse_chapter(self, html_str, chapter_id)
+        if not html_str:
+            return None
+        return parse_chapter(self, html_str[0], chapter_id)
 
     def is_encrypted(self, html_str: str) -> bool:
         """
```
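Both Qidian parsers (browser and session) get the same treatment: the single-page helpers stay as they were, but are now wrapped behind an empty-input guard, and callers pass a one-element list. A toy version of the guard-plus-delegate idiom; the stand-in helper below is hypothetical, while the real code delegates to module-level `parse_book_info`/`parse_chapter` functions:

```python
# Toy guard-plus-delegate wrapper; _parse_book_info_impl is a made-up
# stand-in for the module-level helper the real parsers call.
from typing import Any


def _parse_book_info_impl(html: str) -> dict[str, Any]:
    return {"book_name": "demo"}


def parse_book_info(html_str: list[str], **kwargs: Any) -> dict[str, Any]:
    if not html_str:  # empty fetch result -> empty metadata, not an exception
        return {}
    return _parse_book_info_impl(html_str[0])  # only the first page is used


print(parse_book_info([]))                    # {}
print(parse_book_info(["<html>...</html>"]))  # {'book_name': 'demo'}
```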
novel_downloader/core/parsers/sfacg/main_parser.py (new file):

```diff
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.sfacg.main_parser
+-----------------------------------------------
+
+"""
+
+from typing import Any
+
+from lxml import etree
+
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.utils.chapter_storage import ChapterDict
+
+
+class SfacgParser(BaseParser):
+    """ """
+
+    # Book info XPaths
+    _BOOK_NAME_XPATH = '//ul[@class="book_info"]//span[@class="book_newtitle"]/text()'
+    _AUTHOR_INFO_XPATH = '//ul[@class="book_info"]//span[@class="book_info3"]/text()'
+    _UPDATE_TIME_XPATH = '//ul[@class="book_info"]//span[@class="book_info3"]/br/following-sibling::text()'  # noqa: E501
+    _COVER_URL_XPATH = '//ul[@class="book_info"]//li/img/@src'
+    _STATUS_XPATH = '//ul[@class="book_info"]//div[@class="book_info2"]/span/text()'
+    _SUMMARY_XPATH = '//ul[@class="book_profile"]/li[@class="book_bk_qs1"]/text()'
+
+    # Catalog XPaths
+    _VOLUME_TITLE_XPATH = '//div[@class="mulu"]/text()'
+    _VOLUME_CONTENT_XPATH = '//div[@class="Content_Frame"]'
+    _CHAPTER_LIST_XPATH = './/ul[@class="mulu_list"]/a'
+
+    # Chapter XPaths
+    _CHAPTER_TEXT_XPATH = (
+        '//div[@class="yuedu Content_Frame"]//div[@style="text-indent: 2em;"]/text()'
+    )
+    _CHAPTER_CONTENT_NODES_XPATH = (
+        '//div[@class="yuedu Content_Frame"]//div[@style="text-indent: 2em;"]/*'
+    )
+    _CHAPTER_TITLE_XPATH = '//ul[@class="menu_top_list book_view_top"]/li[2]/text()'
+
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
+        """
+        Parse a book info page and extract metadata and chapter structure.
+
+        :param html: Raw HTML of the book info page.
+        :return: Parsed metadata and chapter structure as a dictionary.
+        """
+        if len(html_str) < 2:
+            return {}
+
+        info_tree = etree.HTML(html_str[0])
+        catalog_tree = etree.HTML(html_str[1])
+
+        result: dict[str, Any] = {}
+
+        # Book metadata
+        book_name = info_tree.xpath(self._BOOK_NAME_XPATH)
+        result["book_name"] = book_name[0].strip() if book_name else ""
+
+        book_info3 = info_tree.xpath(self._AUTHOR_INFO_XPATH)
+        result["author"] = book_info3[0].split("/")[0].strip() if book_info3 else ""
+        result["word_count"] = (
+            book_info3[0].split("/")[1].strip()
+            if book_info3 and len(book_info3[0].split("/")) > 1
+            else ""
+        )
+
+        book_info3_br = info_tree.xpath(self._UPDATE_TIME_XPATH)
+        result["update_time"] = book_info3_br[0].strip() if book_info3_br else ""
+
+        cover_url = info_tree.xpath(self._COVER_URL_XPATH)
+        result["cover_url"] = "https:" + cover_url[0] if cover_url else ""
+
+        serial_status = info_tree.xpath(self._STATUS_XPATH)
+        result["serial_status"] = next(
+            (s for s in serial_status if "完结" in s or "连载" in s), ""
+        )
+
+        summary = info_tree.xpath(self._SUMMARY_XPATH)
+        result["summary"] = "".join(summary).strip()
+
+        # Chapter structure
+        volume_titles = catalog_tree.xpath(self._VOLUME_TITLE_XPATH)
+        volume_blocks = catalog_tree.xpath(self._VOLUME_CONTENT_XPATH)
+
+        volumes = []
+        for vol_title, vol_block in zip(volume_titles, volume_blocks, strict=False):
+            chapters = []
+            for a in vol_block.xpath(self._CHAPTER_LIST_XPATH):
+                href = a.xpath("./@href")[0] if a.xpath("./@href") else ""
+                title = "".join(a.xpath(".//li//text()")).strip()
+                chapter_id = href.split("/")[-2] if href else ""
+                chapters.append(
+                    {
+                        "title": title,
+                        "url": href,
+                        "chapterId": chapter_id,
+                    }
+                )
+            volumes.append(
+                {
+                    "volume_name": vol_title.strip(),
+                    "chapters": chapters,
+                }
+            )
+        result["volumes"] = volumes
+
+        return result
+
+    def parse_chapter(
+        self,
+        html_str: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        """
+        Parse a single chapter page and extract clean text or simplified HTML.
+
+        :param html: Raw HTML of the chapter page.
+        :param chapter_id: Identifier of the chapter being parsed.
+        :return: Cleaned chapter content as plain text or minimal HTML.
+        """
+        if not html_str:
+            return None
+        keywords = [
+            "本章为VIP章节",  # 本章为VIP章节,订阅后可立即阅读
+        ]
+        if any(kw in html_str[0] for kw in keywords):
+            return None
+        tree = etree.HTML(html_str[0])
+
+        content_lines: list[str] = []
+        content_nodes = tree.xpath(self._CHAPTER_CONTENT_NODES_XPATH)
+        for node in content_nodes:
+            tag = node.tag.lower()
+            if tag == "p":
+                text = "".join(node.xpath(".//text()")).strip()
+                if text:
+                    content_lines.append(text)
+            elif tag == "img":
+                src = node.get("src", "").strip()
+                if src:
+                    # embed image as HTML tag
+                    content_lines.append(f'<img src="{src}" />')
+
+        if not content_lines:
+            raw_text_parts = tree.xpath(self._CHAPTER_TEXT_XPATH)
+            content_lines = [txt.strip() for txt in raw_text_parts if txt.strip()]
+
+        content = "\n\n".join(content_lines).strip()
+        if not content:
+            return None
+
+        title_part = tree.xpath(self._CHAPTER_TITLE_XPATH)
+        title = title_part[0].strip() if title_part else ""
+
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {"site": "sfacg"},
+        }
```
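`SfacgParser.parse_chapter` walks the element children of the chapter container so inline illustrations survive as `<img>` tags between text paragraphs, then falls back to a plain-text XPath when no element nodes match. A runnable demo of that walk on a synthetic fragment shaped like sfacg's chapter markup (the sample HTML is made up):

```python
# Standalone demo of the mixed text/illustration extraction.
from lxml import etree

chapter_html = """
<div class="yuedu Content_Frame">
  <div style="text-indent: 2em;">
    <p>first paragraph</p>
    <img src="/pic/1.jpg"/>
    <p>second paragraph</p>
  </div>
</div>
"""

tree = etree.HTML(chapter_html)
lines: list[str] = []
nodes = tree.xpath(
    '//div[@class="yuedu Content_Frame"]//div[@style="text-indent: 2em;"]/*'
)
for node in nodes:
    if node.tag.lower() == "p":
        text = "".join(node.xpath(".//text()")).strip()
        if text:
            lines.append(text)
    elif node.tag.lower() == "img":
        src = node.get("src", "").strip()
        if src:
            lines.append(f'<img src="{src}" />')  # keep illustrations inline

print("\n\n".join(lines))
```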
novel_downloader/core/parsers/yamibo/main_parser.py (new file):

```diff
@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.yamibo.main_parser
+------------------------------------------------
+
+"""
+
+from typing import Any
+
+from lxml import etree
+
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.utils.chapter_storage import ChapterDict
+
+
+class YamiboParser(BaseParser):
+    """ """
+
+    BASE_URL = "https://www.yamibo.com"
+    # Book info XPaths
+    _BOOK_NAME_XPATH = 'string(//h3[contains(@class, "col-md-12")])'
+    _AUTHOR_XPATH = 'string(//h5[contains(@class, "text-warning")])'
+    _COVER_URL_XPATH = '//img[contains(@class, "img-responsive")]/@src'
+    _UPDATE_TIME_XPATH = '//p[contains(text(), "更新时间:")]'
+    _SERIAL_STATUS_XPATH = '//p[contains(text(), "作品状态:")]'
+    _TYPE_XPATH = '//p[contains(text(), "作品分类:")]'
+    _SUMMARY_XPATH = 'string(//div[@id="w0-collapse1"]/div)'
+
+    _VOLUME_NODE_XPATH = (
+        '//div[contains(@class, "panel-info") and contains(@class, "panel-default")]'
+    )
+    _VOLUME_TITLE_XPATH = './/div[contains(@class, "panel-heading")]//a/text()'
+    _CHAPTER_NODE_XPATH = (
+        './/div[contains(@class, "panel-body")]//a[contains(@href, "view-chapter")]'
+    )
+    _CHAPTER_FLAT_XPATH = (
+        '//div[@class="panel-body"]//a[contains(@href, "view-chapter")]'
+    )
+
+    # Chapter field XPaths
+    _CHAPTER_TITLE_XPATH = "string(//section[contains(@class, 'col-md-9')]//h3)"
+    _CHAPTER_TIME_XPATH = (
+        "//div[contains(@class, 'row')]//div[contains(text(), '更新时间')]"
+    )
+    _CHAPTER_WORD_COUNT_XPATH = (
+        "//div[contains(@class, 'row')]//div[contains(text(), '章节字数')]"
+    )
+    _CHAPTER_CONTENT_XPATH = "//div[@id='w0-collapse1']//p//text()"
+
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
+        """
+        Parse a book info page and extract metadata and chapter structure.
+
+        :param html: Raw HTML of the book info page.
+        :return: Parsed metadata and chapter structure as a dictionary.
+        """
+        if not html_str:
+            return {}
+
+        tree = etree.HTML(html_str[0])
+        result: dict[str, Any] = {}
+
+        result["book_name"] = tree.xpath(self._BOOK_NAME_XPATH).strip()
+        result["author"] = tree.xpath(self._AUTHOR_XPATH).strip()
+
+        cover = tree.xpath(self._COVER_URL_XPATH)
+        result["cover_url"] = f"{self.BASE_URL}{cover[0]}" if cover else ""
+
+        update_node = tree.xpath(self._UPDATE_TIME_XPATH)
+        result["update_time"] = (
+            update_node[0].xpath("string()").replace("更新时间:", "").strip()
+            if update_node
+            else ""
+        )
+
+        serial_node = tree.xpath(self._SERIAL_STATUS_XPATH)
+        result["serial_status"] = (
+            serial_node[0].xpath("string()").replace("作品状态:", "").strip()
+            if serial_node
+            else ""
+        )
+
+        type_node = tree.xpath(self._TYPE_XPATH)
+        result["type"] = (
+            type_node[0].xpath("string()").replace("作品分类:", "").strip()
+            if type_node
+            else ""
+        )
+
+        result["summary"] = tree.xpath(self._SUMMARY_XPATH).strip()
+
+        volumes = []
+        volume_nodes = tree.xpath(self._VOLUME_NODE_XPATH)
+
+        if volume_nodes:
+            for volume_node in volume_nodes:
+                title_node = volume_node.xpath(self._VOLUME_TITLE_XPATH)
+                volume_name = title_node[0].strip() if title_node else "未命名卷"
+
+                chapter_nodes = volume_node.xpath(self._CHAPTER_NODE_XPATH)
+                chapters = []
+                for chap in chapter_nodes:
+                    title = chap.xpath("string()").strip()
+                    url = chap.get("href", "")
+                    chapter_id = url.split("id=")[-1] if "id=" in url else ""
+                    chapters.append(
+                        {
+                            "title": title,
+                            "url": url,
+                            "chapterId": chapter_id,
+                        }
+                    )
+
+                volumes.append(
+                    {
+                        "volume_name": volume_name,
+                        "chapters": chapters,
+                    }
+                )
+
+        else:
+            # fallback: flat list
+            chapter_nodes = tree.xpath(self._CHAPTER_FLAT_XPATH)
+            chapters = []
+            for chap in chapter_nodes:
+                title = chap.xpath("string()").strip()
+                url = chap.get("href", "")
+                chapter_id = url.split("id=")[-1] if "id=" in url else ""
+                chapters.append(
+                    {
+                        "title": title,
+                        "url": url,
+                        "chapterId": chapter_id,
+                    }
+                )
+
+            volumes = [
+                {
+                    "volume_name": "单卷",
+                    "chapters": chapters,
+                }
+            ]
+
+        result["volumes"] = volumes
+
+        return result
+
+    def parse_chapter(
+        self,
+        html_str: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        """
+        Parse a single chapter page and extract clean text or simplified HTML.
+
+        :param html: Raw HTML of the chapter page.
+        :param chapter_id: Identifier of the chapter being parsed.
+        :return: Cleaned chapter content as plain text or minimal HTML.
+        """
+        if not html_str:
+            return None
+        tree = etree.HTML(html_str[0])
+
+        content_lines = tree.xpath(self._CHAPTER_CONTENT_XPATH)
+        content = "\n\n".join(line.strip() for line in content_lines if line.strip())
+        if not content:
+            return None
+
+        title = tree.xpath(self._CHAPTER_TITLE_XPATH).strip()
+
+        update_node = tree.xpath(self._CHAPTER_TIME_XPATH)
+        updated_at = (
+            update_node[0].text.strip().replace("更新时间:", "") if update_node else ""
+        )
+
+        word_node = tree.xpath(self._CHAPTER_WORD_COUNT_XPATH)
+        word = word_node[0].text.strip().replace("章节字数:", "") if word_node else ""
+        word_count = int(word) if word.isdigit() else 0
+
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {
+                "site": "yamibo",
+                "word_count": word_count,
+                "updated_at": updated_at,
+            },
+        }
```
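Unlike the other new parsers, `YamiboParser` leans on `string(...)` XPaths, which evaluate to a plain Python string rather than a node list, and strips the `更新时间:`-style labels afterwards. A quick demo against a made-up fragment:

```python
# Demo of the string(...) XPath pattern plus label stripping.
from lxml import etree

html = """
<div>
  <h3 class="col-md-12">Demo Book</h3>
  <p>更新时间:2024-01-01</p>
</div>
"""

tree = etree.HTML(html)
# string(...) XPaths return a str directly, so no [0] indexing is needed:
book_name = tree.xpath('string(//h3[contains(@class, "col-md-12")])').strip()

update_node = tree.xpath('//p[contains(text(), "更新时间:")]')
update_time = (
    update_node[0].xpath("string()").replace("更新时间:", "").strip()
    if update_node
    else ""
)
print(book_name, update_time)  # Demo Book 2024-01-01
```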
novel_downloader/core/requesters/__init__.py:

```diff
@@ -9,27 +9,57 @@ to perform network interactions, such as logging in, sending requests,
 or interacting with browser/session-based sources.
 
 Subpackages:
--
--
--
+- biquge (笔趣阁)
+- esjzone (ESJ Zone)
+- qianbi (铅笔小说)
+- qidian (起点中文网)
+- sfacg (SF轻小说)
+- yamibo (百合会)
+- common (通用架构)
 """
 
 from .biquge import (
+    BiqugeAsyncSession,
     BiqugeSession,
 )
 from .common import (
     CommonAsyncSession,
     CommonSession,
 )
+from .esjzone import (
+    EsjzoneAsyncSession,
+    EsjzoneSession,
+)
+from .qianbi import (
+    QianbiAsyncSession,
+    QianbiSession,
+)
 from .qidian import (
     QidianBrowser,
     QidianSession,
 )
+from .sfacg import (
+    SfacgAsyncSession,
+    SfacgSession,
+)
+from .yamibo import (
+    YamiboAsyncSession,
+    YamiboSession,
+)
 
 __all__ = [
+    "BiqugeAsyncSession",
     "BiqugeSession",
     "CommonAsyncSession",
     "CommonSession",
+    "EsjzoneAsyncSession",
+    "EsjzoneSession",
+    "QianbiAsyncSession",
+    "QianbiSession",
     "QidianBrowser",
     "QidianSession",
+    "SfacgAsyncSession",
+    "SfacgSession",
+    "YamiboAsyncSession",
+    "YamiboSession",
 ]
```
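With these exports in place, each supported site offers a paired sync/async requester at the package root. Names are taken from the `__all__` list above; this assumes novel-downloader 1.3.2 is installed:

```python
# Importing the new per-site requesters added in 1.3.2.
from novel_downloader.core.requesters import (
    QianbiAsyncSession,
    QianbiSession,
    SfacgSession,
    YamiboAsyncSession,
)
```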