novel-downloader 1.4.5__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +2 -2
  3. novel_downloader/cli/config.py +1 -83
  4. novel_downloader/cli/download.py +4 -5
  5. novel_downloader/cli/export.py +4 -1
  6. novel_downloader/cli/main.py +2 -0
  7. novel_downloader/cli/search.py +123 -0
  8. novel_downloader/config/__init__.py +3 -10
  9. novel_downloader/config/adapter.py +190 -54
  10. novel_downloader/config/loader.py +2 -3
  11. novel_downloader/core/__init__.py +13 -13
  12. novel_downloader/core/downloaders/__init__.py +10 -11
  13. novel_downloader/core/downloaders/base.py +152 -26
  14. novel_downloader/core/downloaders/biquge.py +5 -1
  15. novel_downloader/core/downloaders/common.py +157 -378
  16. novel_downloader/core/downloaders/esjzone.py +5 -1
  17. novel_downloader/core/downloaders/linovelib.py +5 -1
  18. novel_downloader/core/downloaders/qianbi.py +291 -4
  19. novel_downloader/core/downloaders/qidian.py +199 -285
  20. novel_downloader/core/downloaders/registry.py +67 -0
  21. novel_downloader/core/downloaders/sfacg.py +5 -1
  22. novel_downloader/core/downloaders/yamibo.py +5 -1
  23. novel_downloader/core/exporters/__init__.py +10 -11
  24. novel_downloader/core/exporters/base.py +87 -7
  25. novel_downloader/core/exporters/biquge.py +5 -8
  26. novel_downloader/core/exporters/common/__init__.py +2 -2
  27. novel_downloader/core/exporters/common/epub.py +82 -166
  28. novel_downloader/core/exporters/common/main_exporter.py +0 -60
  29. novel_downloader/core/exporters/common/txt.py +82 -83
  30. novel_downloader/core/exporters/epub_util.py +157 -1330
  31. novel_downloader/core/exporters/esjzone.py +5 -8
  32. novel_downloader/core/exporters/linovelib/__init__.py +2 -2
  33. novel_downloader/core/exporters/linovelib/epub.py +157 -212
  34. novel_downloader/core/exporters/linovelib/main_exporter.py +2 -59
  35. novel_downloader/core/exporters/linovelib/txt.py +67 -63
  36. novel_downloader/core/exporters/qianbi.py +5 -8
  37. novel_downloader/core/exporters/qidian.py +14 -4
  38. novel_downloader/core/exporters/registry.py +53 -0
  39. novel_downloader/core/exporters/sfacg.py +5 -8
  40. novel_downloader/core/exporters/txt_util.py +67 -0
  41. novel_downloader/core/exporters/yamibo.py +5 -8
  42. novel_downloader/core/fetchers/__init__.py +19 -24
  43. novel_downloader/core/fetchers/base/__init__.py +3 -3
  44. novel_downloader/core/fetchers/base/browser.py +23 -4
  45. novel_downloader/core/fetchers/base/session.py +30 -5
  46. novel_downloader/core/fetchers/biquge/__init__.py +3 -3
  47. novel_downloader/core/fetchers/biquge/browser.py +5 -0
  48. novel_downloader/core/fetchers/biquge/session.py +6 -1
  49. novel_downloader/core/fetchers/esjzone/__init__.py +3 -3
  50. novel_downloader/core/fetchers/esjzone/browser.py +5 -0
  51. novel_downloader/core/fetchers/esjzone/session.py +6 -1
  52. novel_downloader/core/fetchers/linovelib/__init__.py +3 -3
  53. novel_downloader/core/fetchers/linovelib/browser.py +6 -1
  54. novel_downloader/core/fetchers/linovelib/session.py +6 -1
  55. novel_downloader/core/fetchers/qianbi/__init__.py +3 -3
  56. novel_downloader/core/fetchers/qianbi/browser.py +5 -0
  57. novel_downloader/core/fetchers/qianbi/session.py +5 -0
  58. novel_downloader/core/fetchers/qidian/__init__.py +3 -3
  59. novel_downloader/core/fetchers/qidian/browser.py +12 -4
  60. novel_downloader/core/fetchers/qidian/session.py +11 -3
  61. novel_downloader/core/fetchers/registry.py +71 -0
  62. novel_downloader/core/fetchers/sfacg/__init__.py +3 -3
  63. novel_downloader/core/fetchers/sfacg/browser.py +5 -0
  64. novel_downloader/core/fetchers/sfacg/session.py +5 -0
  65. novel_downloader/core/fetchers/yamibo/__init__.py +3 -3
  66. novel_downloader/core/fetchers/yamibo/browser.py +5 -0
  67. novel_downloader/core/fetchers/yamibo/session.py +6 -1
  68. novel_downloader/core/interfaces/__init__.py +7 -5
  69. novel_downloader/core/interfaces/searcher.py +18 -0
  70. novel_downloader/core/parsers/__init__.py +10 -11
  71. novel_downloader/core/parsers/{biquge/main_parser.py → biquge.py} +7 -2
  72. novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +7 -2
  73. novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +7 -2
  74. novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +7 -2
  75. novel_downloader/core/parsers/qidian/__init__.py +2 -2
  76. novel_downloader/core/parsers/qidian/chapter_encrypted.py +23 -21
  77. novel_downloader/core/parsers/qidian/chapter_normal.py +1 -1
  78. novel_downloader/core/parsers/qidian/main_parser.py +10 -21
  79. novel_downloader/core/parsers/qidian/utils/__init__.py +11 -11
  80. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +5 -6
  81. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
  82. novel_downloader/core/parsers/registry.py +68 -0
  83. novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +7 -2
  84. novel_downloader/core/parsers/{yamibo/main_parser.py → yamibo.py} +7 -2
  85. novel_downloader/core/searchers/__init__.py +20 -0
  86. novel_downloader/core/searchers/base.py +92 -0
  87. novel_downloader/core/searchers/biquge.py +83 -0
  88. novel_downloader/core/searchers/esjzone.py +84 -0
  89. novel_downloader/core/searchers/qianbi.py +131 -0
  90. novel_downloader/core/searchers/qidian.py +87 -0
  91. novel_downloader/core/searchers/registry.py +63 -0
  92. novel_downloader/locales/en.json +12 -4
  93. novel_downloader/locales/zh.json +12 -4
  94. novel_downloader/models/__init__.py +4 -30
  95. novel_downloader/models/config.py +12 -6
  96. novel_downloader/models/search.py +16 -0
  97. novel_downloader/models/types.py +0 -2
  98. novel_downloader/resources/config/settings.toml +31 -4
  99. novel_downloader/resources/css_styles/intro.css +83 -0
  100. novel_downloader/resources/css_styles/main.css +30 -89
  101. novel_downloader/utils/__init__.py +52 -0
  102. novel_downloader/utils/chapter_storage.py +244 -224
  103. novel_downloader/utils/constants.py +1 -21
  104. novel_downloader/utils/epub/__init__.py +34 -0
  105. novel_downloader/utils/epub/builder.py +377 -0
  106. novel_downloader/utils/epub/constants.py +77 -0
  107. novel_downloader/utils/epub/documents.py +403 -0
  108. novel_downloader/utils/epub/models.py +134 -0
  109. novel_downloader/utils/epub/utils.py +212 -0
  110. novel_downloader/utils/file_utils/__init__.py +10 -14
  111. novel_downloader/utils/file_utils/io.py +20 -51
  112. novel_downloader/utils/file_utils/normalize.py +2 -2
  113. novel_downloader/utils/file_utils/sanitize.py +2 -3
  114. novel_downloader/utils/fontocr/__init__.py +5 -5
  115. novel_downloader/utils/{hash_store.py → fontocr/hash_store.py} +4 -3
  116. novel_downloader/utils/{hash_utils.py → fontocr/hash_utils.py} +2 -2
  117. novel_downloader/utils/fontocr/ocr_v1.py +13 -1
  118. novel_downloader/utils/fontocr/ocr_v2.py +13 -1
  119. novel_downloader/utils/fontocr/ocr_v3.py +744 -0
  120. novel_downloader/utils/i18n.py +2 -0
  121. novel_downloader/utils/logger.py +2 -0
  122. novel_downloader/utils/network.py +110 -251
  123. novel_downloader/utils/state.py +1 -0
  124. novel_downloader/utils/text_utils/__init__.py +18 -17
  125. novel_downloader/utils/text_utils/diff_display.py +4 -5
  126. novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
  127. novel_downloader/utils/text_utils/text_cleaner.py +179 -0
  128. novel_downloader/utils/text_utils/truncate_utils.py +62 -0
  129. novel_downloader/utils/time_utils/__init__.py +3 -3
  130. novel_downloader/utils/time_utils/datetime_utils.py +4 -5
  131. novel_downloader/utils/time_utils/sleep_utils.py +2 -3
  132. {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/METADATA +2 -2
  133. novel_downloader-1.5.0.dist-info/RECORD +164 -0
  134. novel_downloader/config/site_rules.py +0 -94
  135. novel_downloader/core/factory/__init__.py +0 -20
  136. novel_downloader/core/factory/downloader.py +0 -73
  137. novel_downloader/core/factory/exporter.py +0 -58
  138. novel_downloader/core/factory/fetcher.py +0 -96
  139. novel_downloader/core/factory/parser.py +0 -86
  140. novel_downloader/core/fetchers/common/__init__.py +0 -14
  141. novel_downloader/core/fetchers/common/browser.py +0 -79
  142. novel_downloader/core/fetchers/common/session.py +0 -79
  143. novel_downloader/core/parsers/biquge/__init__.py +0 -10
  144. novel_downloader/core/parsers/common/__init__.py +0 -13
  145. novel_downloader/core/parsers/common/helper.py +0 -323
  146. novel_downloader/core/parsers/common/main_parser.py +0 -106
  147. novel_downloader/core/parsers/esjzone/__init__.py +0 -10
  148. novel_downloader/core/parsers/linovelib/__init__.py +0 -10
  149. novel_downloader/core/parsers/qianbi/__init__.py +0 -10
  150. novel_downloader/core/parsers/sfacg/__init__.py +0 -10
  151. novel_downloader/core/parsers/yamibo/__init__.py +0 -10
  152. novel_downloader/models/browser.py +0 -21
  153. novel_downloader/models/site_rules.py +0 -99
  154. novel_downloader/models/tasks.py +0 -33
  155. novel_downloader/resources/css_styles/volume-intro.css +0 -56
  156. novel_downloader/resources/json/replace_word_map.json +0 -4
  157. novel_downloader/resources/text/blacklist.txt +0 -22
  158. novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
  159. novel_downloader/utils/text_utils/font_mapping.py +0 -28
  160. novel_downloader/utils/text_utils/text_cleaning.py +0 -107
  161. novel_downloader-1.4.5.dist-info/RECORD +0 -165
  162. {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/WHEEL +0 -0
  163. {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/entry_points.txt +0 -0
  164. {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/licenses/LICENSE +0 -0
  165. {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/top_level.txt +0 -0
@@ -5,16 +5,39 @@ novel_downloader.core.downloaders.qianbi
5
5
 
6
6
  """
7
7
 
8
- from novel_downloader.core.downloaders.common import CommonDownloader
8
+ import asyncio
9
+ from collections.abc import AsyncIterator, Awaitable, Callable
10
+ from contextlib import asynccontextmanager
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ from novel_downloader.core.downloaders.base import BaseDownloader
15
+ from novel_downloader.core.downloaders.registry import register_downloader
9
16
  from novel_downloader.core.interfaces import (
10
17
  FetcherProtocol,
11
18
  ParserProtocol,
12
19
  )
13
- from novel_downloader.models import DownloaderConfig
20
+ from novel_downloader.models import (
21
+ BookConfig,
22
+ ChapterDict,
23
+ DownloaderConfig,
24
+ )
25
+ from novel_downloader.utils import (
26
+ ChapterStorage,
27
+ async_sleep_with_random_delay,
28
+ )
29
+
30
+
31
+ @register_downloader(site_keys=["qianbi"])
32
+ class QianbiDownloader(BaseDownloader):
33
+ """
34
+ Downloader for Qianbi (铅笔) novels.
14
35
 
36
+ Repairs missing chapter IDs by following 'next' links, then downloads
37
+ each chapter as a unit (fetch -> parse -> enqueue storage).
38
+ """
15
39
 
16
- class QianbiDownloader(CommonDownloader):
17
- """"""
40
+ DEFAULT_SOURCE_ID = 0
18
41
 
19
42
  def __init__(
20
43
  self,
@@ -23,3 +46,267 @@ class QianbiDownloader(CommonDownloader):
23
46
  config: DownloaderConfig,
24
47
  ):
25
48
  super().__init__(fetcher, parser, config, "qianbi")
49
+
50
+ async def _download_one(
51
+ self,
52
+ book: BookConfig,
53
+ *,
54
+ progress_hook: Callable[[int, int], Awaitable[None]] | None = None,
55
+ **kwargs: Any,
56
+ ) -> None:
57
+ """
58
+ The full download logic for a single book.
59
+
60
+ :param book: BookConfig with at least 'book_id'.
61
+ """
62
+ TAG = "[Downloader]"
63
+ book_id = book["book_id"]
64
+ start_id = book.get("start_id")
65
+ end_id = book.get("end_id")
66
+ ignore_set = set(book.get("ignore_ids", []))
67
+
68
+ # prepare storage & dirs
69
+ raw_base = self._raw_data_dir / book_id
70
+ raw_base.mkdir(parents=True, exist_ok=True)
71
+ html_dir = self._debug_dir / book_id / "html"
72
+ chapter_storage = ChapterStorage(
73
+ raw_base=raw_base,
74
+ priorities=self._priorities,
75
+ )
76
+ chapter_storage.connect()
77
+
78
+ # load or fetch metadata
79
+ book_info = await self.load_book_info(book_id=book_id, html_dir=html_dir)
80
+ book_info = await self._repair_chapter_ids(
81
+ book_id,
82
+ book_info,
83
+ chapter_storage,
84
+ html_dir,
85
+ )
86
+
87
+ vols = book_info.get("volumes", [])
88
+ total_chapters = sum(len(v.get("chapters", [])) for v in vols)
89
+ if total_chapters == 0:
90
+ self.logger.warning("%s 书籍没有章节可下载: %s", TAG, book_id)
91
+ return
92
+
93
+ # concurrency primitives
94
+ sem = asyncio.Semaphore(self.workers)
95
+ cid_q: asyncio.Queue[str | None] = asyncio.Queue()
96
+ save_q: asyncio.Queue[ChapterDict | None] = asyncio.Queue()
97
+ batch: list[ChapterDict] = []
98
+ completed = 0
99
+
100
+ async def _flush_batch() -> None:
101
+ nonlocal batch, completed
102
+ if not batch:
103
+ return
104
+
105
+ try:
106
+ chapter_storage.upsert_chapters(batch, self.DEFAULT_SOURCE_ID)
107
+ except Exception as e:
108
+ self.logger.error(
109
+ "[Storage] batch upsert failed (size=%d): %s",
110
+ len(batch),
111
+ e,
112
+ exc_info=True,
113
+ )
114
+ else:
115
+ completed += len(batch)
116
+ if progress_hook:
117
+ await progress_hook(completed, total_chapters)
118
+ finally:
119
+ batch.clear()
120
+
121
+ async def storage_worker(q: asyncio.Queue[ChapterDict | None]) -> None:
122
+ while True:
123
+ item = await q.get()
124
+ q.task_done()
125
+ if item is None:
126
+ # final flush before exit
127
+ if batch:
128
+ await _flush_batch()
129
+ break
130
+ batch.append(item)
131
+ if len(batch) >= self.storage_batch_size:
132
+ await _flush_batch()
133
+
134
+ async def producer() -> None:
135
+ nonlocal completed
136
+ async for cid in self._chapter_ids(vols, start_id, end_id):
137
+ if self.skip_existing and chapter_storage.exists(cid):
138
+ completed += 1
139
+ if progress_hook:
140
+ await progress_hook(completed, total_chapters)
141
+ else:
142
+ await cid_q.put(cid)
143
+
144
+ @asynccontextmanager
145
+ async def task_group_ctx() -> AsyncIterator[asyncio.TaskGroup]:
146
+ async with asyncio.TaskGroup() as tg:
147
+ # start chapter workers
148
+ for _ in range(self.workers):
149
+ tg.create_task(
150
+ self._chapter_worker(
151
+ book_id,
152
+ ignore_set,
153
+ cid_q,
154
+ save_q,
155
+ sem,
156
+ )
157
+ )
158
+ # start storage worker
159
+ tg.create_task(storage_worker(save_q))
160
+ yield tg
161
+
162
+ # run producer + workers
163
+ async with task_group_ctx():
164
+ # produce all CidTask
165
+ await producer()
166
+
167
+ # signal chapter workers to exit
168
+ for _ in range(self.workers):
169
+ await cid_q.put(None)
170
+ await cid_q.join()
171
+
172
+ # signal storage worker to exit
173
+ await save_q.put(None)
174
+ await save_q.join()
175
+
176
+ # final flush to catch any remaining items
177
+ await _flush_batch()
178
+
179
+ chapter_storage.close()
180
+ self.logger.info(
181
+ "%s Novel '%s' download completed.",
182
+ TAG,
183
+ book_info.get("book_name", "unknown"),
184
+ )
185
+
186
+ async def _repair_chapter_ids(
187
+ self,
188
+ book_id: str,
189
+ book_info: dict[str, Any],
190
+ storage: ChapterStorage,
191
+ html_dir: Path,
192
+ ) -> dict[str, Any]:
193
+ """
194
+ Fill in missing chapterId fields by retrieving the previous chapter
195
+ and following its 'next_chapter_id'. Uses storage to avoid refetching.
196
+ """
197
+ prev_cid: str = ""
198
+ for vol in book_info.get("volumes", []):
199
+ for chap in vol.get("chapters", []):
200
+ cid = chap.get("chapterId")
201
+ if cid:
202
+ prev_cid = cid
203
+ continue
204
+
205
+ # no valid previous to follow
206
+ if not prev_cid:
207
+ continue
208
+
209
+ # missing id: try storage
210
+ data = storage.get_best_chapter(prev_cid)
211
+ if not data:
212
+ # fetch+parse previous to discover next
213
+ data = await self._process_chapter(book_id, prev_cid, html_dir)
214
+ if not data:
215
+ self.logger.warning(
216
+ "failed to fetch chapter %s, skipping repair",
217
+ prev_cid,
218
+ )
219
+ continue
220
+ storage.upsert_chapter(data, self.DEFAULT_SOURCE_ID)
221
+ await async_sleep_with_random_delay(
222
+ self.request_interval,
223
+ mul_spread=1.1,
224
+ max_sleep=self.request_interval + 2,
225
+ )
226
+
227
+ next_cid = data.get("extra", {}).get("next_chapter_id")
228
+ if not next_cid:
229
+ self.logger.warning(
230
+ "No next_chapter_id in data for %s",
231
+ prev_cid,
232
+ )
233
+ continue
234
+
235
+ self.logger.info(
236
+ "repaired chapterId: set to %s (from prev %s)",
237
+ next_cid,
238
+ prev_cid,
239
+ )
240
+ chap["chapterId"] = next_cid
241
+ prev_cid = next_cid
242
+
243
+ self._save_book_info(book_id, book_info)
244
+ return book_info
245
+
246
+ async def _chapter_worker(
247
+ self,
248
+ book_id: str,
249
+ ignore_set: set[str],
250
+ cid_q: asyncio.Queue[str | None],
251
+ save_q: asyncio.Queue[ChapterDict | None],
252
+ sem: asyncio.Semaphore,
253
+ ) -> None:
254
+ """
255
+ Worker that processes one chapter at a time:
256
+ fetch + parse with retry, then enqueue to save_q.
257
+ """
258
+ html_dir = self._debug_dir / book_id / "html"
259
+ while True:
260
+ cid = await cid_q.get()
261
+ if cid is None:
262
+ cid_q.task_done()
263
+ break
264
+ if not cid or cid in ignore_set:
265
+ cid_q.task_done()
266
+ continue
267
+
268
+ async with sem:
269
+ chap = await self._process_chapter(book_id, cid, html_dir)
270
+
271
+ if chap:
272
+ await save_q.put(chap)
273
+
274
+ cid_q.task_done()
275
+ await async_sleep_with_random_delay(
276
+ self.request_interval,
277
+ mul_spread=1.1,
278
+ max_sleep=self.request_interval + 2,
279
+ )
280
+
281
+ async def _process_chapter(
282
+ self,
283
+ book_id: str,
284
+ cid: str,
285
+ html_dir: Path,
286
+ ) -> ChapterDict | None:
287
+ """
288
+ Fetches, saves raw HTML, parses a single chapter,
289
+ retrying up to self.retry_times.
290
+
291
+ :return: ChapterDict on success, or None on failure.
292
+ """
293
+ for attempt in range(self.retry_times + 1):
294
+ try:
295
+ html_list = await self.fetcher.get_book_chapter(book_id, cid)
296
+ self._save_html_pages(html_dir, cid, html_list)
297
+ chap = await asyncio.to_thread(
298
+ self.parser.parse_chapter, html_list, cid
299
+ )
300
+ if not chap:
301
+ raise ValueError("Empty parse result")
302
+ return chap
303
+ except Exception as e:
304
+ if attempt < self.retry_times:
305
+ self.logger.info(f"[ChapterWorker] Retry {cid} ({attempt+1}): {e}")
306
+ backoff = self.backoff_factor * (2**attempt)
307
+ await async_sleep_with_random_delay(
308
+ base=backoff, mul_spread=1.2, max_sleep=backoff + 3
309
+ )
310
+ else:
311
+ self.logger.warning(f"[ChapterWorker] Failed {cid}: {e}")
312
+ return None