novel-downloader 1.4.1__py3-none-any.whl → 1.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51):
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/download.py +70 -11
  3. novel_downloader/config/adapter.py +43 -9
  4. novel_downloader/core/__init__.py +19 -1
  5. novel_downloader/core/downloaders/base.py +26 -29
  6. novel_downloader/core/downloaders/biquge.py +1 -3
  7. novel_downloader/core/downloaders/common.py +41 -7
  8. novel_downloader/core/downloaders/esjzone.py +1 -3
  9. novel_downloader/core/downloaders/linovelib.py +1 -3
  10. novel_downloader/core/downloaders/qianbi.py +1 -3
  11. novel_downloader/core/downloaders/qidian.py +61 -37
  12. novel_downloader/core/downloaders/sfacg.py +1 -3
  13. novel_downloader/core/downloaders/yamibo.py +1 -3
  14. novel_downloader/core/exporters/common/epub.py +153 -68
  15. novel_downloader/core/exporters/epub_util.py +1358 -0
  16. novel_downloader/core/exporters/linovelib/epub.py +147 -190
  17. novel_downloader/core/factory/downloader.py +3 -6
  18. novel_downloader/core/fetchers/base/browser.py +32 -12
  19. novel_downloader/core/fetchers/esjzone/browser.py +8 -6
  20. novel_downloader/core/fetchers/qidian/browser.py +62 -10
  21. novel_downloader/core/fetchers/yamibo/browser.py +3 -3
  22. novel_downloader/core/interfaces/downloader.py +13 -12
  23. novel_downloader/core/parsers/qidian/chapter_encrypted.py +11 -2
  24. novel_downloader/core/parsers/qidian/chapter_normal.py +8 -1
  25. novel_downloader/core/parsers/qidian/main_parser.py +7 -2
  26. novel_downloader/core/parsers/qidian/utils/__init__.py +2 -0
  27. novel_downloader/core/parsers/qidian/utils/helpers.py +9 -0
  28. novel_downloader/locales/en.json +2 -0
  29. novel_downloader/locales/zh.json +2 -0
  30. novel_downloader/models/__init__.py +2 -0
  31. novel_downloader/models/config.py +9 -0
  32. novel_downloader/resources/config/settings.toml +1 -0
  33. novel_downloader/tui/screens/home.py +13 -6
  34. novel_downloader/utils/constants.py +0 -29
  35. novel_downloader/utils/{model_loader.py → fontocr/model_loader.py} +2 -2
  36. novel_downloader/utils/fontocr/ocr_v1.py +2 -1
  37. novel_downloader/utils/fontocr/ocr_v2.py +2 -1
  38. novel_downloader/utils/text_utils/__init__.py +8 -1
  39. novel_downloader/utils/text_utils/text_cleaning.py +51 -0
  40. {novel_downloader-1.4.1.dist-info → novel_downloader-1.4.3.dist-info}/METADATA +5 -2
  41. {novel_downloader-1.4.1.dist-info → novel_downloader-1.4.3.dist-info}/RECORD +45 -50
  42. novel_downloader/core/exporters/epub_utils/__init__.py +0 -40
  43. novel_downloader/core/exporters/epub_utils/css_builder.py +0 -75
  44. novel_downloader/core/exporters/epub_utils/image_loader.py +0 -131
  45. novel_downloader/core/exporters/epub_utils/initializer.py +0 -100
  46. novel_downloader/core/exporters/epub_utils/text_to_html.py +0 -178
  47. novel_downloader/core/exporters/epub_utils/volume_intro.py +0 -60
  48. {novel_downloader-1.4.1.dist-info → novel_downloader-1.4.3.dist-info}/WHEEL +0 -0
  49. {novel_downloader-1.4.1.dist-info → novel_downloader-1.4.3.dist-info}/entry_points.txt +0 -0
  50. {novel_downloader-1.4.1.dist-info → novel_downloader-1.4.3.dist-info}/licenses/LICENSE +0 -0
  51. {novel_downloader-1.4.1.dist-info → novel_downloader-1.4.3.dist-info}/top_level.txt +0 -0
@@ -13,11 +13,11 @@ from typing import Any, cast
13
13
 
14
14
  from novel_downloader.core.downloaders.base import BaseDownloader
15
15
  from novel_downloader.core.interfaces import (
16
- ExporterProtocol,
17
16
  FetcherProtocol,
18
17
  ParserProtocol,
19
18
  )
20
19
  from novel_downloader.models import (
20
+ BookConfig,
21
21
  ChapterDict,
22
22
  CidTask,
23
23
  DownloaderConfig,
@@ -40,15 +40,14 @@ class QidianDownloader(BaseDownloader):
40
40
  self,
41
41
  fetcher: FetcherProtocol,
42
42
  parser: ParserProtocol,
43
- exporter: ExporterProtocol,
44
43
  config: DownloaderConfig,
45
44
  ):
46
45
  config.request_interval = max(1.0, config.request_interval)
47
- super().__init__(fetcher, parser, exporter, config, "qidian")
46
+ super().__init__(fetcher, parser, config, "qidian")
48
47
 
49
48
  async def _download_one(
50
49
  self,
51
- book_id: str,
50
+ book: BookConfig,
52
51
  *,
53
52
  progress_hook: Callable[[int, int], Awaitable[None]] | None = None,
54
53
  **kwargs: Any,
@@ -56,9 +55,13 @@ class QidianDownloader(BaseDownloader):
56
55
  """
57
56
  The full download logic for a single book.
58
57
 
59
- :param book_id: The identifier of the book to download.
58
+ :param book: BookConfig with at least 'book_id'.
60
59
  """
61
60
  TAG = "[Downloader]"
61
+ book_id = book["book_id"]
62
+ start_id = book.get("start_id")
63
+ end_id = book.get("end_id")
64
+ ignore_set = set(book.get("ignore_ids", []))
62
65
 
63
66
  raw_base = self.raw_data_dir / book_id
64
67
  cache_base = self.cache_dir / book_id
@@ -140,6 +143,10 @@ class QidianDownloader(BaseDownloader):
140
143
  cid_queue.task_done()
141
144
  continue
142
145
 
146
+ if cid in ignore_set:
147
+ cid_queue.task_done()
148
+ continue
149
+
143
150
  try:
144
151
  html_list = await self.fetcher.get_book_chapter(book_id, cid)
145
152
  await html_queue.put(
@@ -194,40 +201,39 @@ class QidianDownloader(BaseDownloader):
194
201
  skip_retry = False
195
202
  try:
196
203
  chap_json: ChapterDict | None = None
197
- if self.is_restricted_page(task.html_list):
204
+ if self.check_restricted(task.html_list):
198
205
  self.logger.info(
199
206
  "[Parser] Skipped restricted page for cid %s", task.cid
200
207
  )
201
208
  skip_retry = True
202
- else:
203
- chap_json = await asyncio.to_thread(
204
- self.parser.parse_chapter,
205
- task.html_list,
209
+ raise ValueError("Restricted content detected")
210
+
211
+ is_encrypted = self.check_encrypted(task.html_list)
212
+ chap_json = await asyncio.to_thread(
213
+ self.parser.parse_chapter,
214
+ task.html_list,
215
+ task.cid,
216
+ )
217
+ if is_encrypted:
218
+ skip_retry = True
219
+ if self.save_html:
220
+ folder = chapters_html_dir / (
221
+ "html_encrypted" if is_encrypted else "html_plain"
222
+ )
223
+ html_path = folder / f"{task.cid}.html"
224
+ save_as_txt(task.html_list[0], html_path, on_exist="skip")
225
+ self.logger.debug(
226
+ "%s Saved raw HTML for chapter %s to %s",
227
+ TAG,
206
228
  task.cid,
229
+ html_path,
207
230
  )
208
- if self.check_encrypted(task.html_list):
209
- skip_retry = True
210
231
  if chap_json:
211
232
  await save_queue.put(chap_json)
212
233
  self.logger.info(
213
234
  "[Parser] saved chapter %s",
214
235
  task.cid,
215
236
  )
216
- if self.save_html:
217
- is_encrypted = chap_json.get("extra", {}).get(
218
- "encrypted", False
219
- )
220
- folder = chapters_html_dir / (
221
- "html_encrypted" if is_encrypted else "html_plain"
222
- )
223
- html_path = folder / f"{task.cid}.html"
224
- save_as_txt(task.html_list[0], html_path, on_exist="skip")
225
- self.logger.debug(
226
- "%s Saved raw HTML for chapter %s to %s",
227
- TAG,
228
- task.cid,
229
- html_path,
230
- )
231
237
  else:
232
238
  raise ValueError("Empty parse result")
233
239
  except Exception as e:
@@ -296,20 +302,40 @@ class QidianDownloader(BaseDownloader):
296
302
  )
297
303
  )
298
304
 
299
- last_cid: str | None = None
305
+ found_start = start_id is None
306
+ stop_early = False
307
+
300
308
  for vol in book_info.get("volumes", []):
301
309
  chapters = vol.get("chapters", [])
302
310
  for chap in chapters:
311
+ if stop_early:
312
+ break
313
+
303
314
  cid = chap.get("chapterId")
304
- if cid and normal_cs.exists(cid) and self.skip_existing:
315
+ if not cid:
316
+ continue
317
+
318
+ if not found_start:
319
+ if cid == start_id:
320
+ found_start = True
321
+ else:
322
+ completed_count += 1
323
+ continue
324
+
325
+ if end_id is not None and cid == end_id:
326
+ stop_early = True
327
+
328
+ if cid in ignore_set:
329
+ continue
330
+
331
+ if normal_cs.exists(cid) and self.skip_existing:
305
332
  completed_count += 1
306
- if progress_hook:
307
- await progress_hook(completed_count, total_chapters)
308
- last_cid = cid
309
333
  continue
310
334
 
311
- await cid_queue.put(CidTask(cid=cid, prev_cid=last_cid))
312
- last_cid = cid
335
+ await cid_queue.put(CidTask(cid=cid, prev_cid=None))
336
+
337
+ if stop_early:
338
+ break
313
339
 
314
340
  await cid_queue.join()
315
341
  await html_queue.join()
@@ -323,8 +349,6 @@ class QidianDownloader(BaseDownloader):
323
349
  normal_cs.close()
324
350
  encrypted_cs.close()
325
351
 
326
- await asyncio.to_thread(self.exporter.export, book_id)
327
-
328
352
  self.logger.info(
329
353
  "%s Novel '%s' download completed.",
330
354
  TAG,
@@ -333,7 +357,7 @@ class QidianDownloader(BaseDownloader):
333
357
  return
334
358
 
335
359
  @staticmethod
336
- def is_restricted_page(html_list: list[str]) -> bool:
360
+ def check_restricted(html_list: list[str]) -> bool:
337
361
  """
338
362
  Return True if page content indicates access restriction
339
363
  (e.g. not subscribed/purchased).
@@ -7,7 +7,6 @@ novel_downloader.core.downloaders.sfacg
7
7
 
8
8
  from novel_downloader.core.downloaders.common import CommonDownloader
9
9
  from novel_downloader.core.interfaces import (
10
- ExporterProtocol,
11
10
  FetcherProtocol,
12
11
  ParserProtocol,
13
12
  )
@@ -21,7 +20,6 @@ class SfacgDownloader(CommonDownloader):
21
20
  self,
22
21
  fetcher: FetcherProtocol,
23
22
  parser: ParserProtocol,
24
- exporter: ExporterProtocol,
25
23
  config: DownloaderConfig,
26
24
  ):
27
- super().__init__(fetcher, parser, exporter, config, "sfacg")
25
+ super().__init__(fetcher, parser, config, "sfacg")
@@ -7,7 +7,6 @@ novel_downloader.core.downloaders.yamibo
7
7
 
8
8
  from novel_downloader.core.downloaders.common import CommonDownloader
9
9
  from novel_downloader.core.interfaces import (
10
- ExporterProtocol,
11
10
  FetcherProtocol,
12
11
  ParserProtocol,
13
12
  )
@@ -21,7 +20,6 @@ class YamiboDownloader(CommonDownloader):
21
20
  self,
22
21
  fetcher: FetcherProtocol,
23
22
  parser: ParserProtocol,
24
- exporter: ExporterProtocol,
25
23
  config: DownloaderConfig,
26
24
  ):
27
- super().__init__(fetcher, parser, exporter, config, "yamibo")
25
+ super().__init__(fetcher, parser, config, "yamibo")
@@ -8,25 +8,19 @@ Contains the logic for exporting novel content as a single `.epub` file.
8
8
 
9
9
  from __future__ import annotations
10
10
 
11
+ import html
11
12
  import json
13
+ import re
12
14
  from pathlib import Path
13
15
  from typing import TYPE_CHECKING
14
16
 
15
- from ebooklib import epub
16
-
17
- from novel_downloader.core.exporters.epub_utils import (
18
- add_images_from_dir,
19
- chapter_txt_to_html,
20
- create_css_items,
21
- create_volume_intro,
22
- generate_book_intro_html,
23
- init_epub,
24
- inline_remote_images,
25
- )
26
- from novel_downloader.utils.constants import (
27
- EPUB_OPTIONS,
28
- EPUB_TEXT_FOLDER,
17
+ from novel_downloader.core.exporters.epub_util import (
18
+ Book,
19
+ Chapter,
20
+ StyleSheet,
21
+ Volume,
29
22
  )
23
+ from novel_downloader.utils.constants import CSS_MAIN_PATH
30
24
  from novel_downloader.utils.file_utils import sanitize_filename
31
25
  from novel_downloader.utils.network import download_image
32
26
  from novel_downloader.utils.text_utils import clean_chapter_title
@@ -34,6 +28,16 @@ from novel_downloader.utils.text_utils import clean_chapter_title
34
28
  if TYPE_CHECKING:
35
29
  from .main_exporter import CommonExporter
36
30
 
31
+ _IMAGE_WRAPPER = (
32
+ '<div class="duokan-image-single illus"><img src="../Images/{filename}" /></div>'
33
+ )
34
+ _IMG_TAG_PATTERN = re.compile(
35
+ r'<img\s+[^>]*src=[\'"]([^\'"]+)[\'"][^>]*>', re.IGNORECASE
36
+ )
37
+ _RAW_HTML_RE = re.compile(
38
+ r'^(<img\b[^>]*?\/>|<div class="duokan-image-single illus">.*?<\/div>)$', re.DOTALL
39
+ )
40
+
37
41
 
38
42
  def common_export_as_epub(
39
43
  exporter: CommonExporter,
@@ -71,12 +75,12 @@ def common_export_as_epub(
71
75
  return
72
76
 
73
77
  book_name = book_info.get("book_name", book_id)
78
+ book_author = book_info.get("author", "")
74
79
  exporter.logger.info(
75
80
  "%s Starting EPUB generation: %s (ID: %s)", TAG, book_name, book_id
76
81
  )
77
82
 
78
83
  # --- Generate intro + cover ---
79
- intro_html = generate_book_intro_html(book_info)
80
84
  cover_path: Path | None = None
81
85
  cover_url = book_info.get("cover_url", "")
82
86
  if config.include_cover and cover_url:
@@ -90,49 +94,56 @@ def common_export_as_epub(
90
94
  exporter.logger.warning("Failed to download cover from %s", cover_url)
91
95
 
92
96
  # --- Initialize EPUB ---
93
- book, spine, toc_list = init_epub(
94
- book_info=book_info,
95
- book_id=book_id,
96
- intro_html=intro_html,
97
- book_cover_path=cover_path,
98
- include_toc=config.include_toc,
97
+ book = Book(
98
+ title=book_name,
99
+ author=book_author,
100
+ description=book_info.get("summary", ""),
101
+ cover_path=cover_path,
102
+ subject=book_info.get("subject", []),
103
+ serial_status=book_info.get("serial_status", ""),
104
+ word_count=book_info.get("word_count", ""),
105
+ uid=f"{exporter.site}_{book_id}",
106
+ )
107
+ main_css = StyleSheet(
108
+ id="main_style",
109
+ content=CSS_MAIN_PATH.read_text(encoding="utf-8"),
110
+ filename="main.css",
99
111
  )
100
- for css in create_css_items(
101
- include_main=True,
102
- include_volume=True,
103
- ):
104
- book.add_item(css)
112
+ book.add_stylesheet(main_css)
105
113
 
106
114
  # --- Compile chapters ---
107
115
  volumes = book_info.get("volumes", [])
108
116
  for vol_index, vol in enumerate(volumes, start=1):
109
- raw_vol_name = vol.get("volume_name", "").strip()
110
- vol_name = clean_chapter_title(raw_vol_name) or f"Unknown Volume {vol_index}"
117
+ raw_vol_name = vol.get("volume_name", "")
118
+ raw_vol_name = raw_vol_name.replace(book_name, "").strip()
119
+ vol_name = raw_vol_name or f"Volume {vol_index}"
111
120
  exporter.logger.info("Processing volume %d: %s", vol_index, vol_name)
112
121
 
113
- # Volume intro
114
- vol_intro = epub.EpubHtml(
122
+ vol_cover_path: Path | None = None
123
+ vol_cover_url = vol.get("volume_cover", "")
124
+ if vol_cover_url:
125
+ vol_cover_path = download_image(
126
+ vol_cover_url,
127
+ img_dir,
128
+ on_exist="skip",
129
+ )
130
+
131
+ curr_vol = Volume(
132
+ id=f"vol_{vol_index}",
115
133
  title=vol_name,
116
- file_name=f"{EPUB_TEXT_FOLDER}/volume_intro_{vol_index}.xhtml",
117
- lang="zh",
134
+ intro=vol.get("volume_intro", ""),
135
+ cover=vol_cover_path,
118
136
  )
119
- vol_intro.content = create_volume_intro(vol_name, vol.get("volume_intro", ""))
120
- vol_intro.add_link(
121
- href="../Styles/volume-intro.css",
122
- rel="stylesheet",
123
- type="text/css",
124
- )
125
- book.add_item(vol_intro)
126
- spine.append(vol_intro)
127
-
128
- section = epub.Section(vol_name, vol_intro.file_name)
129
- chapter_items: list[epub.EpubHtml] = []
130
137
 
131
138
  for chap in vol.get("chapters", []):
132
139
  chap_id = chap.get("chapterId")
133
140
  chap_title = chap.get("title", "")
134
141
  if not chap_id:
135
- exporter.logger.warning("%s Missing chapterId, skipping: %s", TAG, chap)
142
+ exporter.logger.warning(
143
+ "%s Missing chapterId, skipping: %s",
144
+ TAG,
145
+ chap,
146
+ )
136
147
  continue
137
148
 
138
149
  chapter_data = exporter._get_chapter(book_id, chap_id)
@@ -147,36 +158,28 @@ def common_export_as_epub(
147
158
 
148
159
  title = clean_chapter_title(chapter_data.get("title", "")) or chap_id
149
160
  content: str = chapter_data.get("content", "")
150
- content = inline_remote_images(content, img_dir)
151
- chap_html = chapter_txt_to_html(
161
+ content, img_paths = _inline_remote_images(content, img_dir)
162
+ chap_html = _txt_to_html(
152
163
  chapter_title=title,
153
164
  chapter_text=content,
154
- author_say=chapter_data.get("author_say", ""),
165
+ extras={
166
+ "作者说": chapter_data.get("author_say", ""),
167
+ },
155
168
  )
156
-
157
- chap_path = f"{EPUB_TEXT_FOLDER}/{chap_id}.xhtml"
158
- item = epub.EpubHtml(title=chap_title, file_name=chap_path, lang="zh")
159
- item.content = chap_html
160
- item.add_link(
161
- href="../Styles/main.css",
162
- rel="stylesheet",
163
- type="text/css",
169
+ curr_vol.add_chapter(
170
+ Chapter(
171
+ id=f"c_{chap_id}",
172
+ title=title,
173
+ content=chap_html,
174
+ css=[main_css],
175
+ )
164
176
  )
165
- book.add_item(item)
166
- spine.append(item)
167
- chapter_items.append(item)
168
-
169
- toc_list.append((section, chapter_items))
177
+ for img_path in img_paths:
178
+ book.add_image(img_path)
170
179
 
171
- book = add_images_from_dir(book, img_dir)
180
+ book.add_volume(curr_vol)
172
181
 
173
182
  # --- 5. Finalize EPUB ---
174
- exporter.logger.info("%s Building TOC and spine...", TAG)
175
- book.toc = toc_list
176
- book.spine = spine
177
- book.add_item(epub.EpubNcx())
178
- book.add_item(epub.EpubNav())
179
-
180
183
  out_name = exporter.get_filename(
181
184
  title=book_name,
182
185
  author=book_info.get("author"),
@@ -185,8 +188,90 @@ def common_export_as_epub(
185
188
  out_path = out_dir / sanitize_filename(out_name)
186
189
 
187
190
  try:
188
- epub.write_epub(out_path, book, EPUB_OPTIONS)
191
+ book.export(out_path)
189
192
  exporter.logger.info("%s EPUB successfully written to %s", TAG, out_path)
190
193
  except Exception as e:
191
194
  exporter.logger.error("%s Failed to write EPUB to %s: %s", TAG, out_path, e)
192
195
  return
196
+
197
+
198
+ def _inline_remote_images(
199
+ content: str,
200
+ image_dir: str | Path,
201
+ ) -> tuple[str, list[Path]]:
202
+ """
203
+ Download every remote `<img src="...">` in `content` into `image_dir`,
204
+ and replace the original tag with _IMAGE_WRAPPER
205
+ pointing to the local filename.
206
+
207
+ :param content: HTML/text of the chapter containing <img> tags.
208
+ :param image_dir: Directory to save downloaded images into.
209
+ :return: A tuple (modified_content, list_of_downloaded_image_paths).
210
+ """
211
+ downloaded_images: list[Path] = []
212
+
213
+ def _replace(match: re.Match[str]) -> str:
214
+ url = match.group(1)
215
+ try:
216
+ # download_image returns a Path or None
217
+ local_path = download_image(
218
+ url,
219
+ image_dir,
220
+ target_name=None,
221
+ on_exist="skip",
222
+ )
223
+ if not local_path:
224
+ return match.group(0)
225
+
226
+ downloaded_images.append(local_path)
227
+ return _IMAGE_WRAPPER.format(filename=local_path.name)
228
+ except Exception:
229
+ return match.group(0)
230
+
231
+ modified_content = _IMG_TAG_PATTERN.sub(_replace, content)
232
+ return modified_content, downloaded_images
233
+
234
+
235
+ def _txt_to_html(
236
+ chapter_title: str,
237
+ chapter_text: str,
238
+ extras: dict[str, str] | None = None,
239
+ ) -> str:
240
+ """
241
+ Convert chapter text and author note to styled HTML.
242
+
243
+ :param chapter_title: Title of the chapter.
244
+ :param chapter_text: Main content of the chapter.
245
+ :param extras: Optional dict of titles and content, e.g. {"作者说": "text"}.
246
+ :return: Rendered HTML as a string.
247
+ """
248
+
249
+ def _render_block(text: str) -> str:
250
+ lines = (line.strip() for line in text.splitlines() if line.strip())
251
+ out = []
252
+ for line in lines:
253
+ # preserve raw HTML, otherwise wrap in <p>
254
+ if _RAW_HTML_RE.match(line):
255
+ out.append(line)
256
+ else:
257
+ out.append(f"<p>{html.escape(line)}</p>")
258
+ return "\n".join(out)
259
+
260
+ parts = []
261
+ parts.append(f"<h2>{html.escape(chapter_title)}</h2>")
262
+ parts.append(_render_block(chapter_text))
263
+
264
+ if extras:
265
+ for title, note in extras.items():
266
+ note = note.strip()
267
+ if not note:
268
+ continue
269
+ parts.extend(
270
+ [
271
+ "<hr />",
272
+ f"<p>{html.escape(title)}</p>",
273
+ _render_block(note),
274
+ ]
275
+ )
276
+
277
+ return "\n".join(parts)