novel-downloader 1.4.5__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/__init__.py +2 -2
- novel_downloader/cli/config.py +1 -83
- novel_downloader/cli/download.py +4 -5
- novel_downloader/cli/export.py +4 -1
- novel_downloader/cli/main.py +2 -0
- novel_downloader/cli/search.py +123 -0
- novel_downloader/config/__init__.py +3 -10
- novel_downloader/config/adapter.py +190 -54
- novel_downloader/config/loader.py +2 -3
- novel_downloader/core/__init__.py +13 -13
- novel_downloader/core/downloaders/__init__.py +10 -11
- novel_downloader/core/downloaders/base.py +152 -26
- novel_downloader/core/downloaders/biquge.py +5 -1
- novel_downloader/core/downloaders/common.py +157 -378
- novel_downloader/core/downloaders/esjzone.py +5 -1
- novel_downloader/core/downloaders/linovelib.py +5 -1
- novel_downloader/core/downloaders/qianbi.py +291 -4
- novel_downloader/core/downloaders/qidian.py +199 -285
- novel_downloader/core/downloaders/registry.py +67 -0
- novel_downloader/core/downloaders/sfacg.py +5 -1
- novel_downloader/core/downloaders/yamibo.py +5 -1
- novel_downloader/core/exporters/__init__.py +10 -11
- novel_downloader/core/exporters/base.py +87 -7
- novel_downloader/core/exporters/biquge.py +5 -8
- novel_downloader/core/exporters/common/__init__.py +2 -2
- novel_downloader/core/exporters/common/epub.py +82 -166
- novel_downloader/core/exporters/common/main_exporter.py +0 -60
- novel_downloader/core/exporters/common/txt.py +82 -83
- novel_downloader/core/exporters/epub_util.py +157 -1330
- novel_downloader/core/exporters/esjzone.py +5 -8
- novel_downloader/core/exporters/linovelib/__init__.py +2 -2
- novel_downloader/core/exporters/linovelib/epub.py +157 -212
- novel_downloader/core/exporters/linovelib/main_exporter.py +2 -59
- novel_downloader/core/exporters/linovelib/txt.py +67 -63
- novel_downloader/core/exporters/qianbi.py +5 -8
- novel_downloader/core/exporters/qidian.py +14 -4
- novel_downloader/core/exporters/registry.py +53 -0
- novel_downloader/core/exporters/sfacg.py +5 -8
- novel_downloader/core/exporters/txt_util.py +67 -0
- novel_downloader/core/exporters/yamibo.py +5 -8
- novel_downloader/core/fetchers/__init__.py +19 -24
- novel_downloader/core/fetchers/base/__init__.py +3 -3
- novel_downloader/core/fetchers/base/browser.py +23 -4
- novel_downloader/core/fetchers/base/session.py +30 -5
- novel_downloader/core/fetchers/biquge/__init__.py +3 -3
- novel_downloader/core/fetchers/biquge/browser.py +5 -0
- novel_downloader/core/fetchers/biquge/session.py +6 -1
- novel_downloader/core/fetchers/esjzone/__init__.py +3 -3
- novel_downloader/core/fetchers/esjzone/browser.py +5 -0
- novel_downloader/core/fetchers/esjzone/session.py +6 -1
- novel_downloader/core/fetchers/linovelib/__init__.py +3 -3
- novel_downloader/core/fetchers/linovelib/browser.py +6 -1
- novel_downloader/core/fetchers/linovelib/session.py +6 -1
- novel_downloader/core/fetchers/qianbi/__init__.py +3 -3
- novel_downloader/core/fetchers/qianbi/browser.py +5 -0
- novel_downloader/core/fetchers/qianbi/session.py +5 -0
- novel_downloader/core/fetchers/qidian/__init__.py +3 -3
- novel_downloader/core/fetchers/qidian/browser.py +12 -4
- novel_downloader/core/fetchers/qidian/session.py +11 -3
- novel_downloader/core/fetchers/registry.py +71 -0
- novel_downloader/core/fetchers/sfacg/__init__.py +3 -3
- novel_downloader/core/fetchers/sfacg/browser.py +5 -0
- novel_downloader/core/fetchers/sfacg/session.py +5 -0
- novel_downloader/core/fetchers/yamibo/__init__.py +3 -3
- novel_downloader/core/fetchers/yamibo/browser.py +5 -0
- novel_downloader/core/fetchers/yamibo/session.py +6 -1
- novel_downloader/core/interfaces/__init__.py +7 -5
- novel_downloader/core/interfaces/searcher.py +18 -0
- novel_downloader/core/parsers/__init__.py +10 -11
- novel_downloader/core/parsers/{biquge/main_parser.py → biquge.py} +7 -2
- novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +7 -2
- novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +7 -2
- novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +7 -2
- novel_downloader/core/parsers/qidian/__init__.py +2 -2
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +23 -21
- novel_downloader/core/parsers/qidian/chapter_normal.py +1 -1
- novel_downloader/core/parsers/qidian/main_parser.py +10 -21
- novel_downloader/core/parsers/qidian/utils/__init__.py +11 -11
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +5 -6
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
- novel_downloader/core/parsers/registry.py +68 -0
- novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +7 -2
- novel_downloader/core/parsers/{yamibo/main_parser.py → yamibo.py} +7 -2
- novel_downloader/core/searchers/__init__.py +20 -0
- novel_downloader/core/searchers/base.py +92 -0
- novel_downloader/core/searchers/biquge.py +83 -0
- novel_downloader/core/searchers/esjzone.py +84 -0
- novel_downloader/core/searchers/qianbi.py +131 -0
- novel_downloader/core/searchers/qidian.py +87 -0
- novel_downloader/core/searchers/registry.py +63 -0
- novel_downloader/locales/en.json +12 -4
- novel_downloader/locales/zh.json +12 -4
- novel_downloader/models/__init__.py +4 -30
- novel_downloader/models/config.py +12 -6
- novel_downloader/models/search.py +16 -0
- novel_downloader/models/types.py +0 -2
- novel_downloader/resources/config/settings.toml +31 -4
- novel_downloader/resources/css_styles/intro.css +83 -0
- novel_downloader/resources/css_styles/main.css +30 -89
- novel_downloader/utils/__init__.py +52 -0
- novel_downloader/utils/chapter_storage.py +244 -224
- novel_downloader/utils/constants.py +1 -21
- novel_downloader/utils/epub/__init__.py +34 -0
- novel_downloader/utils/epub/builder.py +377 -0
- novel_downloader/utils/epub/constants.py +77 -0
- novel_downloader/utils/epub/documents.py +403 -0
- novel_downloader/utils/epub/models.py +134 -0
- novel_downloader/utils/epub/utils.py +212 -0
- novel_downloader/utils/file_utils/__init__.py +10 -14
- novel_downloader/utils/file_utils/io.py +20 -51
- novel_downloader/utils/file_utils/normalize.py +2 -2
- novel_downloader/utils/file_utils/sanitize.py +2 -3
- novel_downloader/utils/fontocr/__init__.py +5 -5
- novel_downloader/utils/{hash_store.py → fontocr/hash_store.py} +4 -3
- novel_downloader/utils/{hash_utils.py → fontocr/hash_utils.py} +2 -2
- novel_downloader/utils/fontocr/ocr_v1.py +13 -1
- novel_downloader/utils/fontocr/ocr_v2.py +13 -1
- novel_downloader/utils/fontocr/ocr_v3.py +744 -0
- novel_downloader/utils/i18n.py +2 -0
- novel_downloader/utils/logger.py +2 -0
- novel_downloader/utils/network.py +110 -251
- novel_downloader/utils/state.py +1 -0
- novel_downloader/utils/text_utils/__init__.py +18 -17
- novel_downloader/utils/text_utils/diff_display.py +4 -5
- novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
- novel_downloader/utils/text_utils/text_cleaner.py +179 -0
- novel_downloader/utils/text_utils/truncate_utils.py +62 -0
- novel_downloader/utils/time_utils/__init__.py +3 -3
- novel_downloader/utils/time_utils/datetime_utils.py +4 -5
- novel_downloader/utils/time_utils/sleep_utils.py +2 -3
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/METADATA +2 -2
- novel_downloader-1.5.0.dist-info/RECORD +164 -0
- novel_downloader/config/site_rules.py +0 -94
- novel_downloader/core/factory/__init__.py +0 -20
- novel_downloader/core/factory/downloader.py +0 -73
- novel_downloader/core/factory/exporter.py +0 -58
- novel_downloader/core/factory/fetcher.py +0 -96
- novel_downloader/core/factory/parser.py +0 -86
- novel_downloader/core/fetchers/common/__init__.py +0 -14
- novel_downloader/core/fetchers/common/browser.py +0 -79
- novel_downloader/core/fetchers/common/session.py +0 -79
- novel_downloader/core/parsers/biquge/__init__.py +0 -10
- novel_downloader/core/parsers/common/__init__.py +0 -13
- novel_downloader/core/parsers/common/helper.py +0 -323
- novel_downloader/core/parsers/common/main_parser.py +0 -106
- novel_downloader/core/parsers/esjzone/__init__.py +0 -10
- novel_downloader/core/parsers/linovelib/__init__.py +0 -10
- novel_downloader/core/parsers/qianbi/__init__.py +0 -10
- novel_downloader/core/parsers/sfacg/__init__.py +0 -10
- novel_downloader/core/parsers/yamibo/__init__.py +0 -10
- novel_downloader/models/browser.py +0 -21
- novel_downloader/models/site_rules.py +0 -99
- novel_downloader/models/tasks.py +0 -33
- novel_downloader/resources/css_styles/volume-intro.css +0 -56
- novel_downloader/resources/json/replace_word_map.json +0 -4
- novel_downloader/resources/text/blacklist.txt +0 -22
- novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
- novel_downloader/utils/text_utils/font_mapping.py +0 -28
- novel_downloader/utils/text_utils/text_cleaning.py +0 -107
- novel_downloader-1.4.5.dist-info/RECORD +0 -165
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/WHEEL +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/entry_points.txt +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/top_level.txt +0 -0
@@ -11,16 +11,14 @@ from __future__ import annotations
|
|
11
11
|
|
12
12
|
import json
|
13
13
|
import logging
|
14
|
-
from pathlib import Path
|
15
14
|
from typing import TYPE_CHECKING, Any
|
16
15
|
|
17
16
|
import tinycss2
|
18
17
|
from lxml import html
|
19
18
|
|
20
19
|
from novel_downloader.models import ChapterDict
|
21
|
-
from novel_downloader.utils
|
22
|
-
|
23
|
-
apply_font_mapping,
|
20
|
+
from novel_downloader.utils import (
|
21
|
+
download,
|
24
22
|
truncate_half_lines,
|
25
23
|
)
|
26
24
|
|
@@ -70,10 +68,9 @@ def parse_encrypted_chapter(
|
|
70
68
|
)
|
71
69
|
return None
|
72
70
|
|
73
|
-
|
74
|
-
if parser.
|
75
|
-
|
76
|
-
debug_base_dir.mkdir(parents=True, exist_ok=True)
|
71
|
+
debug_dir = parser._debug_dir / "font_debug" / "qidian" / chapter_id
|
72
|
+
if parser.save_font_debug:
|
73
|
+
debug_dir.mkdir(parents=True, exist_ok=True)
|
77
74
|
|
78
75
|
css_str = chapter_info["css"]
|
79
76
|
randomFont_str = chapter_info["randomFont"]
|
@@ -98,8 +95,10 @@ def parse_encrypted_chapter(
|
|
98
95
|
rand_path.parent.mkdir(parents=True, exist_ok=True)
|
99
96
|
rand_path.write_bytes(bytes(rf["data"]))
|
100
97
|
|
101
|
-
fixed_path =
|
102
|
-
url=fixedFontWoff2_url,
|
98
|
+
fixed_path = download(
|
99
|
+
url=fixedFontWoff2_url,
|
100
|
+
target_dir=parser._fixed_font_dir,
|
101
|
+
stream=True,
|
103
102
|
)
|
104
103
|
if fixed_path is None:
|
105
104
|
raise ValueError("fixed_path is None: failed to download font")
|
@@ -125,16 +124,16 @@ def parse_encrypted_chapter(
|
|
125
124
|
return None
|
126
125
|
main_paragraphs = extract_paragraphs_recursively(raw_html, chapter_id)
|
127
126
|
|
128
|
-
if
|
129
|
-
main_paragraphs_path =
|
127
|
+
if parser.save_font_debug:
|
128
|
+
main_paragraphs_path = debug_dir / "main_paragraphs_debug.json"
|
130
129
|
main_paragraphs_path.write_text(
|
131
130
|
json.dumps(main_paragraphs, ensure_ascii=False, indent=2),
|
132
131
|
encoding="utf-8",
|
133
132
|
)
|
134
133
|
|
135
134
|
paragraphs_rules = parse_rule(css_str)
|
136
|
-
if
|
137
|
-
paragraphs_rules_path =
|
135
|
+
if parser.save_font_debug:
|
136
|
+
paragraphs_rules_path = debug_dir / "paragraphs_rules_debug.json"
|
138
137
|
paragraphs_rules_path.write_text(
|
139
138
|
json.dumps(paragraphs_rules, ensure_ascii=False, indent=2),
|
140
139
|
encoding="utf-8",
|
@@ -146,16 +145,16 @@ def parse_encrypted_chapter(
|
|
146
145
|
paragraphs_rules,
|
147
146
|
end_number,
|
148
147
|
)
|
149
|
-
if
|
150
|
-
paragraphs_str_path =
|
148
|
+
if parser.save_font_debug:
|
149
|
+
paragraphs_str_path = debug_dir / f"{chapter_id}_debug.txt"
|
151
150
|
paragraphs_str_path.write_text(paragraphs_str, encoding="utf-8")
|
152
151
|
|
153
152
|
# Run OCR + fallback mapping
|
154
153
|
char_set = {c for c in paragraphs_str if c not in {" ", "\n", "\u3000"}}
|
155
154
|
refl_set = set(refl_list)
|
156
155
|
char_set = char_set - refl_set
|
157
|
-
if
|
158
|
-
char_sets_path =
|
156
|
+
if parser.save_font_debug:
|
157
|
+
char_sets_path = debug_dir / "char_set_debug.txt"
|
159
158
|
temp = f"char_set:\n{char_set}\n\nrefl_set:\n{refl_set}"
|
160
159
|
char_sets_path.write_text(
|
161
160
|
temp,
|
@@ -169,15 +168,18 @@ def parse_encrypted_chapter(
|
|
169
168
|
refl_set=refl_set,
|
170
169
|
chapter_id=chapter_id,
|
171
170
|
)
|
172
|
-
if
|
173
|
-
mapping_json_path =
|
171
|
+
if parser.save_font_debug:
|
172
|
+
mapping_json_path = debug_dir / "font_mapping.json"
|
174
173
|
mapping_json_path.write_text(
|
175
174
|
json.dumps(mapping_result, ensure_ascii=False, indent=2),
|
176
175
|
encoding="utf-8",
|
177
176
|
)
|
178
177
|
|
179
178
|
# Reconstruct final readable text
|
180
|
-
original_text = apply_font_mapping(
|
179
|
+
original_text = parser._font_ocr.apply_font_mapping(
|
180
|
+
text=paragraphs_str,
|
181
|
+
font_map=mapping_result,
|
182
|
+
)
|
181
183
|
|
182
184
|
final_paragraphs_str = "\n\n".join(
|
183
185
|
line.strip() for line in original_text.splitlines() if line.strip()
|
@@ -15,7 +15,7 @@ from typing import TYPE_CHECKING
|
|
15
15
|
from lxml import html
|
16
16
|
|
17
17
|
from novel_downloader.models import ChapterDict
|
18
|
-
from novel_downloader.utils
|
18
|
+
from novel_downloader.utils import truncate_half_lines
|
19
19
|
|
20
20
|
from .utils import (
|
21
21
|
extract_chapter_info,
|
@@ -13,9 +13,10 @@ from pathlib import Path
|
|
13
13
|
from typing import TYPE_CHECKING, Any
|
14
14
|
|
15
15
|
from novel_downloader.core.parsers.base import BaseParser
|
16
|
+
from novel_downloader.core.parsers.registry import register_parser
|
16
17
|
from novel_downloader.models import ChapterDict, ParserConfig
|
18
|
+
from novel_downloader.utils import find_cookie_value
|
17
19
|
from novel_downloader.utils.constants import DATA_DIR
|
18
|
-
from novel_downloader.utils.cookies import find_cookie_value
|
19
20
|
|
20
21
|
from .book_info_parser import parse_book_info
|
21
22
|
from .chapter_router import parse_chapter
|
@@ -27,6 +28,10 @@ if TYPE_CHECKING:
|
|
27
28
|
from novel_downloader.utils.fontocr import FontOCR
|
28
29
|
|
29
30
|
|
31
|
+
@register_parser(
|
32
|
+
site_keys=["qidian", "qd"],
|
33
|
+
backends=["session", "browser"],
|
34
|
+
)
|
30
35
|
class QidianParser(BaseParser):
|
31
36
|
"""
|
32
37
|
Parser for Qidian site.
|
@@ -47,11 +52,10 @@ class QidianParser(BaseParser):
|
|
47
52
|
# Extract and store parser flags from config
|
48
53
|
self._use_truncation = config.use_truncation
|
49
54
|
self._decode_font: bool = config.decode_font
|
50
|
-
self._save_font_debug: bool = config.save_font_debug
|
51
55
|
|
52
56
|
self._fixed_font_dir: Path = self._base_cache_dir / "fixed_fonts"
|
53
57
|
self._fixed_font_dir.mkdir(parents=True, exist_ok=True)
|
54
|
-
self.
|
58
|
+
self._debug_dir: Path = Path.cwd() / "debug"
|
55
59
|
|
56
60
|
state_files = [
|
57
61
|
DATA_DIR / "qidian" / "browser_state.cookies",
|
@@ -80,8 +84,6 @@ class QidianParser(BaseParser):
|
|
80
84
|
vec_weight=config.vec_weight,
|
81
85
|
font_debug=config.save_font_debug,
|
82
86
|
)
|
83
|
-
self._font_debug_dir = self._base_cache_dir / "qidian" / "font_debug"
|
84
|
-
self._font_debug_dir.mkdir(parents=True, exist_ok=True)
|
85
87
|
|
86
88
|
def parse_book_info(
|
87
89
|
self,
|
@@ -121,19 +123,6 @@ class QidianParser(BaseParser):
|
|
121
123
|
"""
|
122
124
|
return is_encrypted(html_str)
|
123
125
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
Folders are only created if corresponding debug/save flags are enabled.
|
128
|
-
"""
|
129
|
-
base = self._base_cache_dir
|
130
|
-
|
131
|
-
# Font debug folder
|
132
|
-
if self._save_font_debug and self.book_id:
|
133
|
-
self._font_debug_dir = base / self.book_id / "font_debug"
|
134
|
-
self._font_debug_dir.mkdir(parents=True, exist_ok=True)
|
135
|
-
else:
|
136
|
-
self._font_debug_dir = None
|
137
|
-
|
138
|
-
def _on_book_id_set(self) -> None:
|
139
|
-
self._init_cache_folders()
|
126
|
+
@property
|
127
|
+
def save_font_debug(self) -> bool:
|
128
|
+
return self._config.save_font_debug
|
@@ -5,17 +5,6 @@ novel_downloader.core.parsers.qidian.utils
|
|
5
5
|
|
6
6
|
"""
|
7
7
|
|
8
|
-
from .helpers import (
|
9
|
-
can_view_chapter,
|
10
|
-
extract_chapter_info,
|
11
|
-
find_ssr_page_context,
|
12
|
-
is_duplicated,
|
13
|
-
is_encrypted,
|
14
|
-
is_restricted_page,
|
15
|
-
vip_status,
|
16
|
-
)
|
17
|
-
from .node_decryptor import QidianNodeDecryptor, get_decryptor
|
18
|
-
|
19
8
|
__all__ = [
|
20
9
|
"find_ssr_page_context",
|
21
10
|
"extract_chapter_info",
|
@@ -27,3 +16,14 @@ __all__ = [
|
|
27
16
|
"QidianNodeDecryptor",
|
28
17
|
"get_decryptor",
|
29
18
|
]
|
19
|
+
|
20
|
+
from .helpers import (
|
21
|
+
can_view_chapter,
|
22
|
+
extract_chapter_info,
|
23
|
+
find_ssr_page_context,
|
24
|
+
is_duplicated,
|
25
|
+
is_encrypted,
|
26
|
+
is_restricted_page,
|
27
|
+
vip_status,
|
28
|
+
)
|
29
|
+
from .node_decryptor import QidianNodeDecryptor, get_decryptor
|
@@ -9,6 +9,11 @@ GitHub releases.
|
|
9
9
|
|
10
10
|
from __future__ import annotations
|
11
11
|
|
12
|
+
__all__ = [
|
13
|
+
"ensure_decryptor",
|
14
|
+
"RELEASE_VERSION",
|
15
|
+
]
|
16
|
+
|
12
17
|
import hashlib
|
13
18
|
import platform
|
14
19
|
import stat
|
@@ -137,9 +142,3 @@ def _make_executable(p: Path) -> None:
|
|
137
142
|
p.chmod(mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
|
138
143
|
except PermissionError:
|
139
144
|
pass
|
140
|
-
|
141
|
-
|
142
|
-
__all__ = [
|
143
|
-
"ensure_decryptor",
|
144
|
-
"RELEASE_VERSION",
|
145
|
-
]
|
@@ -80,10 +80,10 @@ class QidianNodeDecryptor:
|
|
80
80
|
|
81
81
|
# 3) Download the Fock JS module from Qidian CDN if missing
|
82
82
|
if not self.QIDIAN_FOCK_JS_PATH.exists():
|
83
|
-
from novel_downloader.utils.network import
|
83
|
+
from novel_downloader.utils.network import download
|
84
84
|
|
85
85
|
try:
|
86
|
-
|
86
|
+
download(
|
87
87
|
self.QIDIAN_FOCK_JS_URL,
|
88
88
|
self.script_dir,
|
89
89
|
on_exist="overwrite",
|
@@ -0,0 +1,68 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.registry
|
4
|
+
--------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
__all__ = ["register_parser", "get_parser"]
|
9
|
+
|
10
|
+
from collections.abc import Callable, Sequence
|
11
|
+
from typing import TypeVar
|
12
|
+
|
13
|
+
from novel_downloader.core.interfaces import ParserProtocol
|
14
|
+
from novel_downloader.models import ParserConfig
|
15
|
+
|
16
|
+
ParserBuilder = Callable[[ParserConfig], ParserProtocol]
|
17
|
+
|
18
|
+
P = TypeVar("P", bound=ParserProtocol)
|
19
|
+
_PARSER_MAP: dict[str, dict[str, ParserBuilder]] = {}
|
20
|
+
|
21
|
+
|
22
|
+
def register_parser(
|
23
|
+
site_keys: Sequence[str],
|
24
|
+
backends: Sequence[str],
|
25
|
+
) -> Callable[[type[P]], type[P]]:
|
26
|
+
"""
|
27
|
+
Decorator to register a parser class under given keys.
|
28
|
+
|
29
|
+
:param site_keys: Sequence of site identifiers
|
30
|
+
:param backends: Sequence of backend types
|
31
|
+
:return: A class decorator that populates _PARSER_MAP.
|
32
|
+
"""
|
33
|
+
|
34
|
+
def decorator(cls: type[P]) -> type[P]:
|
35
|
+
for site in site_keys:
|
36
|
+
site_lower = site.lower()
|
37
|
+
bucket = _PARSER_MAP.setdefault(site_lower, {})
|
38
|
+
for backend in backends:
|
39
|
+
bucket[backend] = cls
|
40
|
+
return cls
|
41
|
+
|
42
|
+
return decorator
|
43
|
+
|
44
|
+
|
45
|
+
def get_parser(site: str, config: ParserConfig) -> ParserProtocol:
|
46
|
+
"""
|
47
|
+
Returns a site-specific parser instance.
|
48
|
+
|
49
|
+
:param site: Site name (e.g., 'qidian')
|
50
|
+
:param config: Configuration for the parser
|
51
|
+
:return: An instance of a parser class
|
52
|
+
"""
|
53
|
+
site_key = site.lower()
|
54
|
+
try:
|
55
|
+
backend_map = _PARSER_MAP[site_key]
|
56
|
+
except KeyError as err:
|
57
|
+
raise ValueError(f"Unsupported site: {site!r}") from err
|
58
|
+
|
59
|
+
mode = config.mode
|
60
|
+
try:
|
61
|
+
parser_cls = backend_map[mode]
|
62
|
+
except KeyError as err:
|
63
|
+
raise ValueError(
|
64
|
+
f"Unsupported parser mode {mode!r} for site {site!r}. "
|
65
|
+
f"Available modes: {list(backend_map)}"
|
66
|
+
) from err
|
67
|
+
|
68
|
+
return parser_cls(config)
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
2
|
"""
|
3
|
-
novel_downloader.core.parsers.sfacg
|
4
|
-
|
3
|
+
novel_downloader.core.parsers.sfacg
|
4
|
+
-----------------------------------
|
5
5
|
|
6
6
|
"""
|
7
7
|
|
@@ -10,9 +10,14 @@ from typing import Any
|
|
10
10
|
from lxml import html
|
11
11
|
|
12
12
|
from novel_downloader.core.parsers.base import BaseParser
|
13
|
+
from novel_downloader.core.parsers.registry import register_parser
|
13
14
|
from novel_downloader.models import ChapterDict
|
14
15
|
|
15
16
|
|
17
|
+
@register_parser(
|
18
|
+
site_keys=["sfacg"],
|
19
|
+
backends=["session", "browser"],
|
20
|
+
)
|
16
21
|
class SfacgParser(BaseParser):
|
17
22
|
""" """
|
18
23
|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
2
|
"""
|
3
|
-
novel_downloader.core.parsers.yamibo
|
4
|
-
|
3
|
+
novel_downloader.core.parsers.yamibo
|
4
|
+
------------------------------------
|
5
5
|
|
6
6
|
"""
|
7
7
|
|
@@ -10,9 +10,14 @@ from typing import Any
|
|
10
10
|
from lxml import html
|
11
11
|
|
12
12
|
from novel_downloader.core.parsers.base import BaseParser
|
13
|
+
from novel_downloader.core.parsers.registry import register_parser
|
13
14
|
from novel_downloader.models import ChapterDict
|
14
15
|
|
15
16
|
|
17
|
+
@register_parser(
|
18
|
+
site_keys=["yamibo"],
|
19
|
+
backends=["session", "browser"],
|
20
|
+
)
|
16
21
|
class YamiboParser(BaseParser):
|
17
22
|
""" """
|
18
23
|
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.searchers
|
4
|
+
-------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
__all__ = [
|
9
|
+
"search",
|
10
|
+
"BiqugeSearcher",
|
11
|
+
"EsjzoneSearcher",
|
12
|
+
"QianbiSearcher",
|
13
|
+
"QidianSearcher",
|
14
|
+
]
|
15
|
+
|
16
|
+
from .biquge import BiqugeSearcher
|
17
|
+
from .esjzone import EsjzoneSearcher
|
18
|
+
from .qianbi import QianbiSearcher
|
19
|
+
from .qidian import QidianSearcher
|
20
|
+
from .registry import search
|
@@ -0,0 +1,92 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.searchers.base
|
4
|
+
------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
import abc
|
9
|
+
from typing import Any
|
10
|
+
from urllib.parse import quote_plus
|
11
|
+
|
12
|
+
import requests
|
13
|
+
|
14
|
+
from novel_downloader.core.interfaces import SearcherProtocol
|
15
|
+
from novel_downloader.models import SearchResult
|
16
|
+
from novel_downloader.utils.constants import DEFAULT_USER_HEADERS
|
17
|
+
|
18
|
+
|
19
|
+
class BaseSearcher(abc.ABC, SearcherProtocol):
|
20
|
+
site_name: str
|
21
|
+
_session = requests.Session()
|
22
|
+
_DEFAULT_TIMEOUT: tuple[int, int] = (5, 10)
|
23
|
+
|
24
|
+
@classmethod
|
25
|
+
def search(cls, keyword: str, limit: int | None = None) -> list[SearchResult]:
|
26
|
+
html = cls._fetch_html(keyword)
|
27
|
+
return cls._parse_html(html, limit)
|
28
|
+
|
29
|
+
@classmethod
|
30
|
+
@abc.abstractmethod
|
31
|
+
def _fetch_html(cls, keyword: str) -> str:
|
32
|
+
"""Get raw HTML from search API or page"""
|
33
|
+
pass
|
34
|
+
|
35
|
+
@classmethod
|
36
|
+
@abc.abstractmethod
|
37
|
+
def _parse_html(cls, html_str: str, limit: int | None = None) -> list[SearchResult]:
|
38
|
+
"""Parse HTML into standard search result list"""
|
39
|
+
pass
|
40
|
+
|
41
|
+
@classmethod
|
42
|
+
def _http_get(
|
43
|
+
cls,
|
44
|
+
url: str,
|
45
|
+
*,
|
46
|
+
params: dict[str, str] | None = None,
|
47
|
+
headers: dict[str, str] | None = None,
|
48
|
+
timeout: tuple[int, int] | None = None,
|
49
|
+
**kwargs: Any,
|
50
|
+
) -> requests.Response:
|
51
|
+
"""
|
52
|
+
Helper for GET requests with default headers, timeout, and error-raising.
|
53
|
+
"""
|
54
|
+
hdrs = {**DEFAULT_USER_HEADERS, **(headers or {})}
|
55
|
+
resp = cls._session.get(
|
56
|
+
url,
|
57
|
+
params=params,
|
58
|
+
headers=hdrs,
|
59
|
+
timeout=timeout or cls._DEFAULT_TIMEOUT,
|
60
|
+
**kwargs,
|
61
|
+
)
|
62
|
+
resp.raise_for_status()
|
63
|
+
return resp
|
64
|
+
|
65
|
+
@classmethod
|
66
|
+
def _http_post(
|
67
|
+
cls,
|
68
|
+
url: str,
|
69
|
+
*,
|
70
|
+
data: dict[str, str] | str | None = None,
|
71
|
+
headers: dict[str, str] | None = None,
|
72
|
+
timeout: tuple[int, int] | None = None,
|
73
|
+
**kwargs: Any,
|
74
|
+
) -> requests.Response:
|
75
|
+
"""
|
76
|
+
Helper for POST requests with default headers, timeout, and error-raising.
|
77
|
+
"""
|
78
|
+
hdrs = {**DEFAULT_USER_HEADERS, **(headers or {})}
|
79
|
+
resp = cls._session.post(
|
80
|
+
url,
|
81
|
+
data=data,
|
82
|
+
headers=hdrs,
|
83
|
+
timeout=timeout or cls._DEFAULT_TIMEOUT,
|
84
|
+
**kwargs,
|
85
|
+
)
|
86
|
+
resp.raise_for_status()
|
87
|
+
return resp
|
88
|
+
|
89
|
+
@staticmethod
|
90
|
+
def _quote(q: str) -> str:
|
91
|
+
"""URL-encode a query string safely."""
|
92
|
+
return quote_plus(q)
|
@@ -0,0 +1,83 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.searchers.biquge
|
4
|
+
--------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
import logging
|
9
|
+
|
10
|
+
from lxml import html
|
11
|
+
|
12
|
+
from novel_downloader.core.searchers.base import BaseSearcher
|
13
|
+
from novel_downloader.core.searchers.registry import register_searcher
|
14
|
+
from novel_downloader.models import SearchResult
|
15
|
+
|
16
|
+
logger = logging.getLogger(__name__)
|
17
|
+
|
18
|
+
|
19
|
+
@register_searcher(
|
20
|
+
site_keys=["biquge", "bqg"],
|
21
|
+
)
|
22
|
+
class BiqugeSearcher(BaseSearcher):
|
23
|
+
site_name = "biquge"
|
24
|
+
priority = 5
|
25
|
+
SEARCH_URL = "http://www.b520.cc/modules/article/search.php"
|
26
|
+
|
27
|
+
@classmethod
|
28
|
+
def _fetch_html(cls, keyword: str) -> str:
|
29
|
+
"""
|
30
|
+
Fetch raw HTML from Biquge's search page.
|
31
|
+
|
32
|
+
:param keyword: The search term to query on Biquge.
|
33
|
+
:return: HTML text of the search results page, or an empty string on fail.
|
34
|
+
"""
|
35
|
+
params = {"searchkey": keyword}
|
36
|
+
try:
|
37
|
+
response = cls._http_get(cls.SEARCH_URL, params=params)
|
38
|
+
return response.text
|
39
|
+
except Exception:
|
40
|
+
logger.error(
|
41
|
+
"Failed to fetch HTML for keyword '%s' from '%s'",
|
42
|
+
keyword,
|
43
|
+
cls.SEARCH_URL,
|
44
|
+
exc_info=True,
|
45
|
+
)
|
46
|
+
return ""
|
47
|
+
|
48
|
+
@classmethod
|
49
|
+
def _parse_html(cls, html_str: str, limit: int | None = None) -> list[SearchResult]:
|
50
|
+
"""
|
51
|
+
Parse raw HTML from Biquge search results into list of SearchResult.
|
52
|
+
|
53
|
+
:param html_str: Raw HTML string from Biquge search results page.
|
54
|
+
:param limit: Maximum number of results to return, or None for all.
|
55
|
+
:return: List of SearchResult dicts.
|
56
|
+
"""
|
57
|
+
doc = html.fromstring(html_str)
|
58
|
+
rows = doc.xpath('//table[@class="grid"]//tr[position()>1]')
|
59
|
+
results: list[SearchResult] = []
|
60
|
+
|
61
|
+
for idx, row in enumerate(rows):
|
62
|
+
if limit is not None and idx >= limit:
|
63
|
+
break
|
64
|
+
# Title and book_id
|
65
|
+
title_elem = row.xpath(".//td[1]/a")[0]
|
66
|
+
title = title_elem.text_content().strip()
|
67
|
+
href = title_elem.get("href", "").strip("/")
|
68
|
+
book_id = href.split("/")[0] if href else ""
|
69
|
+
# Author
|
70
|
+
author = row.xpath(".//td[3]")[0].text_content().strip()
|
71
|
+
# Compute priority
|
72
|
+
prio = cls.priority + idx
|
73
|
+
|
74
|
+
results.append(
|
75
|
+
SearchResult(
|
76
|
+
site=cls.site_name,
|
77
|
+
book_id=book_id,
|
78
|
+
title=title,
|
79
|
+
author=author,
|
80
|
+
priority=prio,
|
81
|
+
)
|
82
|
+
)
|
83
|
+
return results
|
@@ -0,0 +1,84 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.searchers.esjzone
|
4
|
+
---------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
import logging
|
9
|
+
|
10
|
+
from lxml import html
|
11
|
+
|
12
|
+
from novel_downloader.core.searchers.base import BaseSearcher
|
13
|
+
from novel_downloader.core.searchers.registry import register_searcher
|
14
|
+
from novel_downloader.models import SearchResult
|
15
|
+
|
16
|
+
logger = logging.getLogger(__name__)
|
17
|
+
|
18
|
+
|
19
|
+
@register_searcher(
|
20
|
+
site_keys=["esjzone"],
|
21
|
+
)
|
22
|
+
class EsjzoneSearcher(BaseSearcher):
|
23
|
+
site_name = "esjzone"
|
24
|
+
priority = 3
|
25
|
+
SEARCH_URL = "https://www.esjzone.cc/tags/{query}/"
|
26
|
+
|
27
|
+
@classmethod
|
28
|
+
def _fetch_html(cls, keyword: str) -> str:
|
29
|
+
"""
|
30
|
+
Fetch raw HTML from Esjzone's search page.
|
31
|
+
|
32
|
+
:param keyword: The search term to query on Esjzone.
|
33
|
+
:return: HTML text of the search results page, or an empty string on fail.
|
34
|
+
"""
|
35
|
+
url = cls.SEARCH_URL.format(query=cls._quote(keyword))
|
36
|
+
try:
|
37
|
+
response = cls._http_get(url)
|
38
|
+
return response.text
|
39
|
+
except Exception:
|
40
|
+
logger.error(
|
41
|
+
"Failed to fetch HTML for keyword '%s' from '%s'",
|
42
|
+
keyword,
|
43
|
+
url,
|
44
|
+
exc_info=True,
|
45
|
+
)
|
46
|
+
return ""
|
47
|
+
|
48
|
+
@classmethod
|
49
|
+
def _parse_html(cls, html_str: str, limit: int | None = None) -> list[SearchResult]:
|
50
|
+
"""
|
51
|
+
Parse raw HTML from Esjzone search results into list of SearchResult.
|
52
|
+
|
53
|
+
:param html_str: Raw HTML string from Esjzone search results page.
|
54
|
+
:param limit: Maximum number of results to return, or None for all.
|
55
|
+
:return: List of SearchResult dicts.
|
56
|
+
"""
|
57
|
+
doc = html.fromstring(html_str)
|
58
|
+
cards = doc.xpath('//div[contains(@class,"card-body")]')
|
59
|
+
results: list[SearchResult] = []
|
60
|
+
|
61
|
+
for idx, card in enumerate(cards):
|
62
|
+
if limit is not None and idx >= limit:
|
63
|
+
break
|
64
|
+
# Title and book_id
|
65
|
+
link = card.xpath('.//h5[@class="card-title"]/a')[0]
|
66
|
+
title = link.text_content().strip()
|
67
|
+
href = link.get("href", "")
|
68
|
+
# href format: /detail/<book_id>.html
|
69
|
+
book_id = href.strip("/").replace("detail/", "").replace(".html", "")
|
70
|
+
# Author
|
71
|
+
author_link = card.xpath('.//div[@class="card-author"]/a')[0]
|
72
|
+
author = author_link.text_content().strip()
|
73
|
+
# Compute priority incrementally
|
74
|
+
prio = cls.priority + idx
|
75
|
+
results.append(
|
76
|
+
SearchResult(
|
77
|
+
site=cls.site_name,
|
78
|
+
book_id=book_id,
|
79
|
+
title=title,
|
80
|
+
author=author,
|
81
|
+
priority=prio,
|
82
|
+
)
|
83
|
+
)
|
84
|
+
return results
|