novel-downloader 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. novel_downloader/__init__.py +14 -0
  2. novel_downloader/cli/__init__.py +14 -0
  3. novel_downloader/cli/clean.py +134 -0
  4. novel_downloader/cli/download.py +98 -0
  5. novel_downloader/cli/interactive.py +67 -0
  6. novel_downloader/cli/main.py +45 -0
  7. novel_downloader/cli/settings.py +177 -0
  8. novel_downloader/config/__init__.py +52 -0
  9. novel_downloader/config/adapter.py +150 -0
  10. novel_downloader/config/loader.py +177 -0
  11. novel_downloader/config/models.py +170 -0
  12. novel_downloader/config/site_rules.py +97 -0
  13. novel_downloader/core/__init__.py +25 -0
  14. novel_downloader/core/downloaders/__init__.py +20 -0
  15. novel_downloader/core/downloaders/base_downloader.py +187 -0
  16. novel_downloader/core/downloaders/common_downloader.py +192 -0
  17. novel_downloader/core/downloaders/qidian_downloader.py +208 -0
  18. novel_downloader/core/factory/__init__.py +21 -0
  19. novel_downloader/core/factory/downloader_factory.py +62 -0
  20. novel_downloader/core/factory/parser_factory.py +62 -0
  21. novel_downloader/core/factory/requester_factory.py +62 -0
  22. novel_downloader/core/factory/saver_factory.py +49 -0
  23. novel_downloader/core/interfaces/__init__.py +28 -0
  24. novel_downloader/core/interfaces/downloader_protocol.py +37 -0
  25. novel_downloader/core/interfaces/parser_protocol.py +40 -0
  26. novel_downloader/core/interfaces/requester_protocol.py +65 -0
  27. novel_downloader/core/interfaces/saver_protocol.py +61 -0
  28. novel_downloader/core/parsers/__init__.py +28 -0
  29. novel_downloader/core/parsers/base_parser.py +96 -0
  30. novel_downloader/core/parsers/common_parser/__init__.py +14 -0
  31. novel_downloader/core/parsers/common_parser/helper.py +321 -0
  32. novel_downloader/core/parsers/common_parser/main_parser.py +86 -0
  33. novel_downloader/core/parsers/qidian_parser/__init__.py +20 -0
  34. novel_downloader/core/parsers/qidian_parser/browser/__init__.py +13 -0
  35. novel_downloader/core/parsers/qidian_parser/browser/chapter_encrypted.py +498 -0
  36. novel_downloader/core/parsers/qidian_parser/browser/chapter_normal.py +97 -0
  37. novel_downloader/core/parsers/qidian_parser/browser/chapter_router.py +70 -0
  38. novel_downloader/core/parsers/qidian_parser/browser/main_parser.py +110 -0
  39. novel_downloader/core/parsers/qidian_parser/session/__init__.py +13 -0
  40. novel_downloader/core/parsers/qidian_parser/session/chapter_encrypted.py +451 -0
  41. novel_downloader/core/parsers/qidian_parser/session/chapter_normal.py +119 -0
  42. novel_downloader/core/parsers/qidian_parser/session/chapter_router.py +67 -0
  43. novel_downloader/core/parsers/qidian_parser/session/main_parser.py +113 -0
  44. novel_downloader/core/parsers/qidian_parser/session/node_decryptor.py +164 -0
  45. novel_downloader/core/parsers/qidian_parser/shared/__init__.py +38 -0
  46. novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +95 -0
  47. novel_downloader/core/parsers/qidian_parser/shared/helpers.py +133 -0
  48. novel_downloader/core/requesters/__init__.py +27 -0
  49. novel_downloader/core/requesters/base_browser.py +210 -0
  50. novel_downloader/core/requesters/base_session.py +243 -0
  51. novel_downloader/core/requesters/common_requester/__init__.py +14 -0
  52. novel_downloader/core/requesters/common_requester/common_session.py +126 -0
  53. novel_downloader/core/requesters/qidian_requester/__init__.py +22 -0
  54. novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +377 -0
  55. novel_downloader/core/requesters/qidian_requester/qidian_session.py +202 -0
  56. novel_downloader/core/savers/__init__.py +20 -0
  57. novel_downloader/core/savers/base_saver.py +169 -0
  58. novel_downloader/core/savers/common_saver/__init__.py +13 -0
  59. novel_downloader/core/savers/common_saver/common_epub.py +232 -0
  60. novel_downloader/core/savers/common_saver/common_txt.py +176 -0
  61. novel_downloader/core/savers/common_saver/main_saver.py +86 -0
  62. novel_downloader/core/savers/epub_utils/__init__.py +27 -0
  63. novel_downloader/core/savers/epub_utils/css_builder.py +68 -0
  64. novel_downloader/core/savers/epub_utils/initializer.py +98 -0
  65. novel_downloader/core/savers/epub_utils/text_to_html.py +132 -0
  66. novel_downloader/core/savers/epub_utils/volume_intro.py +61 -0
  67. novel_downloader/core/savers/qidian_saver.py +22 -0
  68. novel_downloader/locales/en.json +91 -0
  69. novel_downloader/locales/zh.json +91 -0
  70. novel_downloader/resources/config/rules.toml +196 -0
  71. novel_downloader/resources/config/settings.yaml +70 -0
  72. novel_downloader/resources/css_styles/main.css +104 -0
  73. novel_downloader/resources/css_styles/volume-intro.css +56 -0
  74. novel_downloader/resources/images/volume_border.png +0 -0
  75. novel_downloader/resources/js_scripts/qidian_decrypt_node.js +82 -0
  76. novel_downloader/resources/json/replace_word_map.json +4 -0
  77. novel_downloader/resources/text/blacklist.txt +22 -0
  78. novel_downloader/utils/__init__.py +0 -0
  79. novel_downloader/utils/cache.py +24 -0
  80. novel_downloader/utils/constants.py +158 -0
  81. novel_downloader/utils/crypto_utils.py +144 -0
  82. novel_downloader/utils/file_utils/__init__.py +43 -0
  83. novel_downloader/utils/file_utils/io.py +252 -0
  84. novel_downloader/utils/file_utils/normalize.py +68 -0
  85. novel_downloader/utils/file_utils/sanitize.py +77 -0
  86. novel_downloader/utils/fontocr/__init__.py +23 -0
  87. novel_downloader/utils/fontocr/ocr_v1.py +304 -0
  88. novel_downloader/utils/fontocr/ocr_v2.py +658 -0
  89. novel_downloader/utils/hash_store.py +288 -0
  90. novel_downloader/utils/hash_utils.py +103 -0
  91. novel_downloader/utils/i18n.py +41 -0
  92. novel_downloader/utils/logger.py +104 -0
  93. novel_downloader/utils/model_loader.py +72 -0
  94. novel_downloader/utils/network.py +287 -0
  95. novel_downloader/utils/state.py +156 -0
  96. novel_downloader/utils/text_utils/__init__.py +27 -0
  97. novel_downloader/utils/text_utils/chapter_formatting.py +46 -0
  98. novel_downloader/utils/text_utils/diff_display.py +75 -0
  99. novel_downloader/utils/text_utils/font_mapping.py +31 -0
  100. novel_downloader/utils/text_utils/text_cleaning.py +57 -0
  101. novel_downloader/utils/time_utils/__init__.py +22 -0
  102. novel_downloader/utils/time_utils/datetime_utils.py +146 -0
  103. novel_downloader/utils/time_utils/sleep_utils.py +49 -0
  104. novel_downloader-1.1.1.dist-info/METADATA +137 -0
  105. novel_downloader-1.1.1.dist-info/RECORD +109 -0
  106. novel_downloader-1.1.1.dist-info/WHEEL +5 -0
  107. novel_downloader-1.1.1.dist-info/entry_points.txt +2 -0
  108. novel_downloader-1.1.1.dist-info/licenses/LICENSE +21 -0
  109. novel_downloader-1.1.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,321 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.core.parsers.common_parser.helpers
5
+ ---------------------------------------------------
6
+
7
+ Shared utility functions for parsing Common pages.
8
+ """
9
+
10
+ import logging
11
+ import re
12
+ from typing import Any, Dict, Iterable, Iterator, List, Optional, cast
13
+
14
+ from bs4 import BeautifulSoup, Tag
15
+
16
+ from novel_downloader.config import (
17
+ BookInfoRules,
18
+ FieldRules,
19
+ RuleStep,
20
+ VolumesRules,
21
+ )
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
def html_to_soup(html_str: str) -> BeautifulSoup:
    """
    Parse raw HTML into a BeautifulSoup tree.

    Attempts the fast ``lxml`` backend first; on any failure it logs a
    warning and retries with the stdlib ``html.parser`` backend, so the
    caller always receives a usable soup.

    :param html_str: Raw HTML string.
    :return: Parsed BeautifulSoup object.
    """
    try:
        soup = BeautifulSoup(html_str, "lxml")
    except Exception as exc:
        logger.warning("[Parser] lxml parse failed, falling back: %s", exc)
        soup = BeautifulSoup(html_str, "html.parser")
    return soup
38
+
39
+
40
class HTMLExtractor:
    """
    HTML extraction engine that applies a sequence of RuleSteps to
    pull data out of a page.

    Public methods interpret declarative rule structures
    (``BookInfoRules`` / ``VolumesRules``) against the parsed page.
    """

    def __init__(self, html: str):
        """
        :param html: Raw HTML string to parse and query.
        """
        self._html = html
        self._soup = html_to_soup(html)

    def extract_book_info(self, rules: BookInfoRules) -> Dict[str, Any]:
        """
        Extract structured book information from HTML according to the given rules.

        Only non-empty fields in the rules are processed.  The special
        ``volumes`` field is routed to :meth:`extract_volumes_structure`;
        every other field runs through :meth:`extract_field`.

        :param rules: Extraction configuration specifying how to extract.
        :return: A dictionary containing extracted book information.
        """
        book_info: Dict[str, Any] = {}

        for field_name, field_rules in rules.items():
            if field_rules is None:
                continue

            if field_name == "volumes":
                book_info[field_name] = self.extract_volumes_structure(
                    cast(VolumesRules, field_rules)
                )
            else:
                steps = cast(FieldRules, field_rules)["steps"]
                book_info[field_name] = self.extract_field(steps)

        return book_info

    def extract_field(self, steps: List[RuleStep]) -> str:
        """
        Execute a list of extraction steps on the given HTML.

        - If any step yields None, stops processing further steps.
        - At the end, always returns a str:
          * If current is a list, converts items to text and joins with '\n'.
          * If current is a Tag, extracts its .get_text().
          * Else, uses str().

        :param steps: Ordered list of RuleStep dicts to apply.
        :return: Extracted text (possibly empty), never None.
        :raises ValueError: If a step has an unsupported ``type``.
        """

        def flatten_list(items: Iterable[Any]) -> Iterator[Any]:
            # Recursively flatten nested lists produced by chained list steps.
            for item in items:
                if isinstance(item, list):
                    yield from flatten_list(item)
                else:
                    yield item

        def to_text(item: Any) -> str:
            if isinstance(item, Tag):
                return str(item.get_text().strip())
            return str(item).strip()

        def pick(seq: List[Any], idx: Optional[int]) -> Any:
            # Safe indexing: returns the whole list when idx is absent or
            # out of range.  The lower-bound check fixes an IndexError the
            # original raised for negative out-of-range indices.
            if idx is not None and -len(seq) <= idx < len(seq):
                return seq[idx]
            return seq

        current: Any = self._soup

        for step in steps:
            t = step.get("type")
            if t == "select_one":
                sel = step.get("selector")
                current = current.select_one(sel) if sel else None

            elif t == "select":
                sel = step.get("selector")
                found = current.select(sel) if sel else []
                current = pick(found, step.get("index"))

            elif t == "exclude":
                sel = step.get("selector")
                # Remove matching elements in place before later steps run.
                for elem in current.select(sel or ""):
                    elem.decompose()

            elif t == "find":
                current = current.find(
                    step.get("name"), attrs=step.get("attrs") or {}
                )

            elif t == "find_all":
                found = current.find_all(
                    step.get("name"),
                    attrs=step.get("attrs") or {},
                    limit=step.get("limit"),
                )
                current = pick(found, step.get("index"))

            elif t == "text":
                if isinstance(current, list):
                    current = [elem.get_text() for elem in current]
                elif isinstance(current, Tag):
                    current = current.get_text()

            elif t == "strip":
                chars = step.get("chars")
                if isinstance(current, list):
                    current = [c.strip(chars) for c in current]
                elif isinstance(current, str):
                    current = current.strip(chars)

            elif t == "regex":
                txt = str(current or "")
                pattern = step.get("pattern") or ""
                flags = step.get("flags")
                flags = 0 if flags is None else flags
                match = re.compile(pattern, flags).search(txt)
                if match:
                    template = step.get("template")
                    if template:
                        # Substitute $1..$n with the captured groups.
                        result = template
                        for i in range(1, len(match.groups()) + 1):
                            result = result.replace(f"${i}", match.group(i) or "")
                        current = result
                    else:
                        grp = step.get("group")
                        current = match.group(0 if grp is None else grp)
                else:
                    current = ""

            elif t == "replace":
                old = step.get("old")
                old = "" if old is None else old
                new = step.get("new")
                new = "" if new is None else new
                # Explicit None check so count=0 keeps its meaning.
                cnt = step.get("count")
                cnt = -1 if cnt is None else cnt

                if isinstance(current, list):
                    current = [c.replace(old, new, cnt) for c in current]
                elif isinstance(current, str):
                    current = current.replace(old, new, cnt)

            elif t == "split":
                # BUGFIX: the original defaulted ``sep`` to "" and
                # str.split("") raises ValueError("empty separator");
                # a missing/empty sep now means whitespace splitting,
                # matching str.split() semantics.
                sep = step.get("sep") or None
                idx = step.get("index")
                idx = 0 if idx is None else idx
                parts = (current or "").split(sep)
                current = parts[idx] if -len(parts) <= idx < len(parts) else ""

            elif t == "join":
                sep = step.get("sep")
                sep = "" if sep is None else sep
                if isinstance(current, list):
                    current = sep.join(current)

            elif t == "attr":
                name = step.get("attr")
                if isinstance(current, list):
                    current = [elem.get(name, "") for elem in current]
                elif isinstance(current, Tag):
                    current = current.get(name, "")

            else:
                raise ValueError(f"Unsupported step type: {t}")

            if current is None:
                # A step found nothing -- later steps would only fail.
                break

        # Final normalization: always hand back a plain string.
        if isinstance(current, list):
            flat = list(flatten_list(current))
            texts = [to_text(x) for x in flat if x is not None]
            return "\n".join(texts)
        if isinstance(current, Tag):
            return str(current.get_text().strip())
        return str(current or "").strip()

    def extract_mixed_volumes(self, volume_rule: VolumesRules) -> List[Dict[str, Any]]:
        """
        Special mode: mixed <volume> and <chapter> tags under the same parent
        (e.g. the dt/dd pattern used by BiQuGe-style sites).

        :param volume_rule: VolumesRules with list/volume/chapter selectors.
        :return: List of volume dicts, each with "volume_name" and "chapters".
        :raises ValueError: If a required selector/steps entry is missing,
            or the list container cannot be found.
        """
        list_selector = volume_rule.get("list_selector")
        volume_selector = volume_rule.get("volume_selector")
        chapter_selector = volume_rule.get("chapter_selector")
        volume_name_steps = volume_rule.get("volume_name_steps")
        chapter_steps_list = volume_rule.get("chapter_steps") or []

        if not (
            list_selector and volume_selector and chapter_selector and volume_name_steps
        ):
            raise ValueError(
                "volume_mode='mixed' 时, 必须提供 list_selector, volume_selector, "
                "chapter_selector 和 volume_name_steps"
            )

        volumes: List[Dict[str, Any]] = []
        current_volume: Optional[Dict[str, Any]] = None
        chapter_info_steps = {item["key"]: item["steps"] for item in chapter_steps_list}

        list_area = self._soup.select_one(list_selector)
        if not list_area:
            raise ValueError(f"找不到 list_selector: {list_selector}")

        # volume_selector / chapter_selector act as tag names here, so both
        # kinds of rows come back interleaved in document order.
        for elem in list_area.find_all(
            [volume_selector, chapter_selector], recursive=True
        ):
            if elem.name == volume_selector:
                # A volume heading starts a new bucket for chapters.
                volume_name = HTMLExtractor(str(elem)).extract_field(volume_name_steps)
                current_volume = {"volume_name": volume_name, "chapters": []}
                volumes.append(current_volume)

            elif elem.name == chapter_selector and current_volume is not None:
                chap_extractor = HTMLExtractor(str(elem))
                chapter_data = {
                    field: chap_extractor.extract_field(steps)
                    for field, steps in chapter_info_steps.items()
                }
                current_volume["chapters"].append(chapter_data)

        return volumes

    def extract_volume_blocks(self, volume_rule: VolumesRules) -> List[Dict[str, Any]]:
        """
        Normal mode: each volume is a self-contained block that holds
        its own chapter nodes.

        :param volume_rule: VolumesRules with volume/chapter selectors.
        :return: List of volume dicts, each with "volume_name" and "chapters".
        :raises ValueError: If volume_selector or volume_name_steps is missing.
        """
        volume_selector = volume_rule["volume_selector"]
        chapter_selector = volume_rule["chapter_selector"]
        volume_name_steps = volume_rule["volume_name_steps"]
        chapter_steps_list = volume_rule["chapter_steps"]
        if not (volume_selector and volume_name_steps):
            raise ValueError(
                "has_volume=True 时, 必须提供 volume_selector 和 volume_name_steps"
            )
        volumes = []
        chapter_info_steps = {item["key"]: item["steps"] for item in chapter_steps_list}
        for vol in self._soup.select(volume_selector):
            extractor = HTMLExtractor(str(vol))
            volume_name = extractor.extract_field(volume_name_steps)

            chapters = []
            for chap in vol.select(chapter_selector):
                chap_extractor = HTMLExtractor(str(chap))
                chapter_data = {
                    field: chap_extractor.extract_field(steps)
                    for field, steps in chapter_info_steps.items()
                }
                chapters.append(chapter_data)

            volumes.append({"volume_name": volume_name, "chapters": chapters})

        return volumes

    def extract_flat_chapters(self, volume_rule: VolumesRules) -> List[Dict[str, Any]]:
        """
        Flat mode: the page has no real volume structure; every chapter is
        collected into a single synthetic volume named "未分卷".

        :param volume_rule: VolumesRules; volume_selector optionally narrows
            the search areas.
        :return: A one-element list holding the synthetic volume.
        """
        chapter_selector = volume_rule["chapter_selector"]
        chapter_steps_list = volume_rule["chapter_steps"]
        volume_selector = volume_rule.get("volume_selector")
        volumes = []
        chapter_info_steps = {item["key"]: item["steps"] for item in chapter_steps_list}

        # Optional volume_selector limits where chapters are searched;
        # otherwise scan the whole document.
        if volume_selector:
            candidates = self._soup.select(volume_selector)
        else:
            candidates = [self._soup]

        all_chapters = []
        for area in candidates:
            for chap in area.select(chapter_selector):
                chap_extractor = HTMLExtractor(str(chap))
                chapter_data = {
                    field: chap_extractor.extract_field(steps)
                    for field, steps in chapter_info_steps.items()
                }
                all_chapters.append(chapter_data)

        volumes.append({"volume_name": "未分卷", "chapters": all_chapters})

        return volumes

    def extract_volumes_structure(
        self, volume_rule: VolumesRules
    ) -> List[Dict[str, Any]]:
        """
        Dispatch to the right volume-extraction strategy based on the rule's
        ``volume_mode`` ("mixed" or "normal") and ``has_volume`` flag.

        :param volume_rule: VolumesRules describing the page layout.
        :return: List of volume dicts.
        """
        volume_mode = volume_rule.get("volume_mode", "normal")
        if volume_mode == "mixed":
            return self.extract_mixed_volumes(volume_rule)

        if volume_rule.get("has_volume", True):
            return self.extract_volume_blocks(volume_rule)
        else:
            return self.extract_flat_chapters(volume_rule)
@@ -0,0 +1,86 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.core.parsers.common_parser.main_parser
5
+ -------------------------------------------------------
6
+
7
+ This package provides parsing components for handling
8
+ Common pages.
9
+ """
10
+
11
+ from typing import Any, Dict
12
+
13
+ from novel_downloader.config import ParserConfig, SiteRules
14
+
15
+ from ..base_parser import BaseParser
16
+ from .helper import HTMLExtractor
17
+
18
+
19
class CommonParser(BaseParser):
    """
    Rule-driven parser for generic novel sites.

    Extends :class:`BaseParser` with a site name and a ``SiteRules``
    bundle supplied at construction time; all extraction work is
    delegated to :class:`HTMLExtractor` using those rules.
    """

    def __init__(self, config: ParserConfig, site: str, site_rule: SiteRules):
        """
        Initialize the parser with configuration, site name, and site-specific rules.

        :param config: ParserConfig object controlling parsing behavior.
        :param site: Name of the site this parser is targeting.
        :param site_rule: SiteRules object containing parsing rules for the site.
        """
        super().__init__(config)
        self._site = site
        self._site_rule = site_rule

    def parse_book_info(self, html_str: str) -> Dict[str, Any]:
        """
        Parse a book info page and extract metadata and chapter structure.

        :param html_str: Raw HTML of the book info page.
        :return: Parsed metadata and chapter structure as a dictionary.
        """
        return HTMLExtractor(html_str).extract_book_info(
            self._site_rule["book_info"]
        )

    def parse_chapter(self, html_str: str, chapter_id: str) -> Dict[str, Any]:
        """
        Parse a single chapter page and extract clean text or simplified HTML.

        :param html_str: Raw HTML of the chapter page.
        :param chapter_id: Identifier of the chapter being parsed.
        :return: Dict with id/title/content/site, or {} when no content found.
        :raises ValueError: If the site rules define no content steps.
        """
        extractor = HTMLExtractor(html_str)
        chapter_rules = self._site_rule["chapter"]

        # Body content is mandatory; refuse to run without content steps.
        content_steps = chapter_rules.get("content")
        if not content_steps:
            raise ValueError(f"No chapter content steps defined for site: {self._site}")

        title_rule = chapter_rules.get("title")
        title = extractor.extract_field(title_rule["steps"]) if title_rule else ""

        content = extractor.extract_field(content_steps["steps"])
        if not content:
            return {}

        return {
            "id": chapter_id,
            "title": title or "Untitled",
            "content": content,
            "site": self._site,
        }

    @property
    def site(self) -> str:
        """Name of the target site."""
        return self._site

    @property
    def site_rule(self) -> SiteRules:
        """Rule set driving this parser."""
        return self._site_rule
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.core.parsers.qidian_parser
5
+ -------------------------------------------
6
+
7
+ This package provides parsing implementations for the Qidian platform.
8
+
9
+ Modules:
10
+ - browser: Contains `QidianBrowserParser` for browser-rendered page parsing.
11
+ - session: Contains `QidianSessionParser` for session page parsing.
12
+ """
13
+
14
+ from .browser import QidianBrowserParser
15
+ from .session import QidianSessionParser
16
+
17
+ __all__ = [
18
+ "QidianBrowserParser",
19
+ "QidianSessionParser",
20
+ ]
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.core.parsers.qidian_parser.browser
5
+ ---------------------------------------------------
6
+
7
+ This package provides parsing components for handling Qidian
8
+ pages that have been rendered by a browser engine.
9
+ """
10
+
11
+ from .main_parser import QidianBrowserParser
12
+
13
+ __all__ = ["QidianBrowserParser"]