novel-downloader 1.4.1__py3-none-any.whl → 1.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/download.py +70 -11
  3. novel_downloader/config/adapter.py +43 -9
  4. novel_downloader/core/__init__.py +19 -1
  5. novel_downloader/core/downloaders/base.py +26 -29
  6. novel_downloader/core/downloaders/biquge.py +1 -3
  7. novel_downloader/core/downloaders/common.py +41 -7
  8. novel_downloader/core/downloaders/esjzone.py +1 -3
  9. novel_downloader/core/downloaders/linovelib.py +1 -3
  10. novel_downloader/core/downloaders/qianbi.py +1 -3
  11. novel_downloader/core/downloaders/qidian.py +61 -37
  12. novel_downloader/core/downloaders/sfacg.py +1 -3
  13. novel_downloader/core/downloaders/yamibo.py +1 -3
  14. novel_downloader/core/exporters/common/epub.py +153 -68
  15. novel_downloader/core/exporters/epub_util.py +1358 -0
  16. novel_downloader/core/exporters/linovelib/epub.py +147 -190
  17. novel_downloader/core/factory/downloader.py +3 -6
  18. novel_downloader/core/fetchers/base/browser.py +32 -12
  19. novel_downloader/core/fetchers/esjzone/browser.py +8 -6
  20. novel_downloader/core/fetchers/qidian/browser.py +62 -10
  21. novel_downloader/core/fetchers/yamibo/browser.py +3 -3
  22. novel_downloader/core/interfaces/downloader.py +13 -12
  23. novel_downloader/core/parsers/qidian/chapter_encrypted.py +11 -2
  24. novel_downloader/core/parsers/qidian/chapter_normal.py +8 -1
  25. novel_downloader/core/parsers/qidian/main_parser.py +7 -2
  26. novel_downloader/core/parsers/qidian/utils/__init__.py +2 -0
  27. novel_downloader/core/parsers/qidian/utils/helpers.py +9 -0
  28. novel_downloader/locales/en.json +2 -0
  29. novel_downloader/locales/zh.json +2 -0
  30. novel_downloader/models/__init__.py +2 -0
  31. novel_downloader/models/config.py +9 -0
  32. novel_downloader/resources/config/settings.toml +1 -0
  33. novel_downloader/tui/screens/home.py +13 -6
  34. novel_downloader/utils/constants.py +0 -29
  35. novel_downloader/utils/{model_loader.py → fontocr/model_loader.py} +2 -2
  36. novel_downloader/utils/fontocr/ocr_v1.py +2 -1
  37. novel_downloader/utils/fontocr/ocr_v2.py +2 -1
  38. novel_downloader/utils/text_utils/__init__.py +8 -1
  39. novel_downloader/utils/text_utils/text_cleaning.py +51 -0
  40. {novel_downloader-1.4.1.dist-info → novel_downloader-1.4.3.dist-info}/METADATA +5 -2
  41. {novel_downloader-1.4.1.dist-info → novel_downloader-1.4.3.dist-info}/RECORD +45 -50
  42. novel_downloader/core/exporters/epub_utils/__init__.py +0 -40
  43. novel_downloader/core/exporters/epub_utils/css_builder.py +0 -75
  44. novel_downloader/core/exporters/epub_utils/image_loader.py +0 -131
  45. novel_downloader/core/exporters/epub_utils/initializer.py +0 -100
  46. novel_downloader/core/exporters/epub_utils/text_to_html.py +0 -178
  47. novel_downloader/core/exporters/epub_utils/volume_intro.py +0 -60
  48. {novel_downloader-1.4.1.dist-info → novel_downloader-1.4.3.dist-info}/WHEEL +0 -0
  49. {novel_downloader-1.4.1.dist-info → novel_downloader-1.4.3.dist-info}/entry_points.txt +0 -0
  50. {novel_downloader-1.4.1.dist-info → novel_downloader-1.4.3.dist-info}/licenses/LICENSE +0 -0
  51. {novel_downloader-1.4.1.dist-info → novel_downloader-1.4.3.dist-info}/top_level.txt +0 -0
@@ -5,6 +5,7 @@ novel_downloader.core.fetchers.qidian.browser
5
5
 
6
6
  """
7
7
 
8
+ import asyncio
8
9
  from typing import Any
9
10
 
10
11
  from playwright.async_api import Page
@@ -189,18 +190,35 @@ class QidianBrowser(BaseBrowser):
189
190
  """
190
191
  try:
191
192
  page = await self.context.new_page()
192
- await page.goto(self.HOMEPAGE_URL, wait_until="networkidle")
193
193
  await self._login_auto(page)
194
194
  await self._dismiss_overlay(page)
195
- sign_in_elem = await page.query_selector(".sign-in")
196
- if sign_in_elem and await sign_in_elem.is_visible():
197
- self.logger.debug("[auth] Sign-in element visible.")
198
- await page.close()
199
- return False
200
- else:
201
- self.logger.debug("[auth] Sign-in element not found.")
202
- await page.close()
195
+ await page.goto(self.HOMEPAGE_URL, wait_until="networkidle")
196
+ sign_in_elem = await page.query_selector("#login-box .sign-in")
197
+ sign_out_elem = await page.query_selector("#login-box .sign-out")
198
+
199
+ sign_in_class = (
200
+ (await sign_in_elem.get_attribute("class") or "")
201
+ if sign_in_elem
202
+ else ""
203
+ )
204
+ sign_out_class = (
205
+ (await sign_out_elem.get_attribute("class") or "")
206
+ if sign_out_elem
207
+ else ""
208
+ )
209
+
210
+ sign_in_hidden = "hidden" in sign_in_class
211
+ sign_out_hidden = "hidden" in sign_out_class
212
+
213
+ await page.close()
214
+
215
+ # if sign_in_visible and not sign_out_visible:
216
+ if not sign_in_hidden and sign_out_hidden:
217
+ self.logger.debug("[auth] Detected as logged in.")
203
218
  return True
219
+ else:
220
+ self.logger.debug("[auth] Detected as not logged in.")
221
+ return False
204
222
  except Exception as e:
205
223
  self.logger.warning("[auth] Error while checking login status: %s", e)
206
224
  return False
@@ -220,7 +238,10 @@ class QidianBrowser(BaseBrowser):
220
238
 
221
239
  self.logger.debug("[auth] Overlay mask detected; attempting to close.")
222
240
 
223
- iframe_element = await page.query_selector('iframe[name="loginIfr"]')
241
+ iframe_element = await page.wait_for_selector(
242
+ "#loginIfr",
243
+ timeout=timeout * 1000,
244
+ )
224
245
  if iframe_element is None:
225
246
  self.logger.debug("[auth] Login iframe not found.")
226
247
  return
@@ -261,6 +282,37 @@ class QidianBrowser(BaseBrowser):
261
282
  btn = await page.query_selector("#login-btn")
262
283
  if btn and await btn.is_visible():
263
284
  await btn.click()
285
+ tasks = [
286
+ asyncio.create_task(
287
+ page.wait_for_selector(
288
+ "div.mask",
289
+ timeout=timeout * 1000,
290
+ )
291
+ ),
292
+ asyncio.create_task(
293
+ page.wait_for_selector(
294
+ "div.qdlogin-wrap",
295
+ timeout=timeout * 1000,
296
+ )
297
+ ),
298
+ asyncio.create_task(
299
+ page.wait_for_url(
300
+ lambda url: "login" not in url,
301
+ timeout=timeout * 1000,
302
+ )
303
+ ),
304
+ ]
305
+ done, pending = await asyncio.wait(
306
+ tasks,
307
+ timeout=timeout + 1,
308
+ return_when=asyncio.FIRST_COMPLETED,
309
+ )
310
+ for task in pending:
311
+ task.cancel()
312
+ if done:
313
+ self.logger.debug("[auth] Login flow proceeded after button click.")
314
+ else:
315
+ self.logger.warning("[auth] Timeout waiting for login to proceed.")
264
316
  except Exception as e:
265
317
  self.logger.debug("[auth] Failed to click login button: %s", e)
266
318
  return
@@ -48,8 +48,8 @@ class YamiboBrowser(BaseBrowser):
48
48
  return False
49
49
 
50
50
  for i in range(1, attempt + 1):
51
+ login_page = await self.context.new_page()
51
52
  try:
52
- login_page = await self.context.new_page()
53
53
  await login_page.goto(self.LOGIN_URL, wait_until="networkidle")
54
54
 
55
55
  await login_page.fill("#loginform-username", username)
@@ -68,8 +68,6 @@ class YamiboBrowser(BaseBrowser):
68
68
  f"[auth] No URL change after login attempt {i}: {e}"
69
69
  )
70
70
 
71
- await login_page.close()
72
-
73
71
  self._is_logged_in = await self._check_login_status()
74
72
  if self._is_logged_in:
75
73
  self.logger.info(f"[auth] Login successful on attempt {i}.")
@@ -83,6 +81,8 @@ class YamiboBrowser(BaseBrowser):
83
81
  self.logger.error(
84
82
  f"[auth] Unexpected error during login attempt {i}: {e}"
85
83
  )
84
+ finally:
85
+ await login_page.close()
86
86
 
87
87
  self.logger.error(f"[auth] Login failed after {attempt} attempt(s).")
88
88
  return False
@@ -10,45 +10,46 @@ that outlines the expected behavior of any downloader class.
10
10
  from collections.abc import Awaitable, Callable
11
11
  from typing import Any, Protocol, runtime_checkable
12
12
 
13
+ from novel_downloader.models import BookConfig
14
+
13
15
 
14
16
  @runtime_checkable
15
17
  class DownloaderProtocol(Protocol):
16
18
  """
17
- Protocol for fully-asynchronous downloader classes.
19
+ Protocol for async downloader implementations.
18
20
 
19
- Defines the expected interface for any downloader implementation,
20
- including both batch and single book downloads,
21
- as well as optional pre-download hooks.
21
+ Uses BookConfig (with book_id, optional start_id/end_id/ignore_ids)
22
+ for both single and batch downloads.
22
23
  """
23
24
 
24
25
  async def download(
25
26
  self,
26
- book_id: str,
27
+ book: BookConfig,
27
28
  *,
28
29
  progress_hook: Callable[[int, int], Awaitable[None]] | None = None,
29
30
  **kwargs: Any,
30
31
  ) -> None:
31
32
  """
32
- Download logic for a single book.
33
+ Download a single book.
33
34
 
34
- :param book_id: The identifier of the book.
35
- :param progress_hook: (optional) Called after each chapter;
35
+ :param book: BookConfig with at least 'book_id'.
36
+ :param progress_hook: Optional async callback after each chapter.
36
37
  args: completed_count, total_count.
37
38
  """
38
39
  ...
39
40
 
40
41
  async def download_many(
41
42
  self,
42
- book_ids: list[str],
43
+ books: list[BookConfig],
43
44
  *,
44
45
  progress_hook: Callable[[int, int], Awaitable[None]] | None = None,
45
46
  **kwargs: Any,
46
47
  ) -> None:
47
48
  """
48
- Batch download entry point.
49
+ Download multiple books.
49
50
 
50
- :param book_ids: List of book IDs to download.
51
- :param progress_hook: (optional) Called after each chapter;
51
+ :param books: List of BookConfig entries.
52
+ :param progress_hook: Optional async callback after each chapter.
52
53
  args: completed_count, total_count.
53
54
  """
54
55
  ...
@@ -19,12 +19,16 @@ from lxml import html
19
19
 
20
20
  from novel_downloader.models import ChapterDict
21
21
  from novel_downloader.utils.network import download_font_file
22
- from novel_downloader.utils.text_utils import apply_font_mapping
22
+ from novel_downloader.utils.text_utils import (
23
+ apply_font_mapping,
24
+ truncate_half_lines,
25
+ )
23
26
 
24
27
  from .utils import (
25
28
  extract_chapter_info,
26
29
  find_ssr_page_context,
27
30
  get_decryptor,
31
+ is_duplicated,
28
32
  vip_status,
29
33
  )
30
34
 
@@ -76,6 +80,7 @@ def parse_encrypted_chapter(
76
80
  fixedFontWoff2_url = chapter_info["fixedFontWoff2"]
77
81
 
78
82
  title = chapter_info.get("chapterName", "Untitled")
83
+ duplicated = is_duplicated(ssr_data)
79
84
  raw_html = chapter_info.get("content", "")
80
85
  chapter_id = chapter_info.get("chapterId", chapter_id)
81
86
  fkp = chapter_info.get("fkp", "")
@@ -83,7 +88,7 @@ def parse_encrypted_chapter(
83
88
  update_time = chapter_info.get("updateTime", "")
84
89
  update_timestamp = chapter_info.get("updateTimestamp", 0)
85
90
  modify_time = chapter_info.get("modifyTime", 0)
86
- word_count = chapter_info.get("wordsCount", 0)
91
+ word_count = chapter_info.get("actualWords", 0)
87
92
  seq = chapter_info.get("seq", None)
88
93
  volume = chapter_info.get("extra", {}).get("volumeName", "")
89
94
 
@@ -177,6 +182,9 @@ def parse_encrypted_chapter(
177
182
  final_paragraphs_str = "\n\n".join(
178
183
  line.strip() for line in original_text.splitlines() if line.strip()
179
184
  )
185
+ if parser._use_truncation and duplicated:
186
+ final_paragraphs_str = truncate_half_lines(final_paragraphs_str)
187
+
180
188
  return {
181
189
  "id": str(chapter_id),
182
190
  "title": str(title),
@@ -187,6 +195,7 @@ def parse_encrypted_chapter(
187
195
  "update_timestamp": update_timestamp,
188
196
  "modify_time": modify_time,
189
197
  "word_count": word_count,
198
+ "duplicated": duplicated,
190
199
  "seq": seq,
191
200
  "volume": volume,
192
201
  "encrypted": True,
@@ -15,11 +15,13 @@ from typing import TYPE_CHECKING
15
15
  from lxml import html
16
16
 
17
17
  from novel_downloader.models import ChapterDict
18
+ from novel_downloader.utils.text_utils import truncate_half_lines
18
19
 
19
20
  from .utils import (
20
21
  extract_chapter_info,
21
22
  find_ssr_page_context,
22
23
  get_decryptor,
24
+ is_duplicated,
23
25
  vip_status,
24
26
  )
25
27
 
@@ -51,6 +53,7 @@ def parse_normal_chapter(
51
53
  return None
52
54
 
53
55
  title = chapter_info.get("chapterName", "Untitled")
56
+ duplicated = is_duplicated(ssr_data)
54
57
  raw_html = chapter_info.get("content", "")
55
58
  chapter_id = chapter_info.get("chapterId", chapter_id)
56
59
  fkp = chapter_info.get("fkp", "")
@@ -58,7 +61,7 @@ def parse_normal_chapter(
58
61
  update_time = chapter_info.get("updateTime", "")
59
62
  update_timestamp = chapter_info.get("updateTimestamp", 0)
60
63
  modify_time = chapter_info.get("modifyTime", 0)
61
- word_count = chapter_info.get("wordsCount", 0)
64
+ word_count = chapter_info.get("actualWords", 0)
62
65
  seq = chapter_info.get("seq", None)
63
66
  volume = chapter_info.get("extra", {}).get("volumeName", "")
64
67
 
@@ -74,6 +77,9 @@ def parse_normal_chapter(
74
77
  if not chapter_text:
75
78
  return None
76
79
 
80
+ if parser._use_truncation and duplicated:
81
+ chapter_text = truncate_half_lines(chapter_text)
82
+
77
83
  return {
78
84
  "id": str(chapter_id),
79
85
  "title": title,
@@ -84,6 +90,7 @@ def parse_normal_chapter(
84
90
  "update_timestamp": update_timestamp,
85
91
  "modify_time": modify_time,
86
92
  "word_count": word_count,
93
+ "duplicated": duplicated,
87
94
  "seq": seq,
88
95
  "volume": volume,
89
96
  "encrypted": False,
@@ -32,7 +32,11 @@ class QidianParser(BaseParser):
32
32
  Parser for Qidian site.
33
33
  """
34
34
 
35
- def __init__(self, config: ParserConfig):
35
+ def __init__(
36
+ self,
37
+ config: ParserConfig,
38
+ fuid: str = "",
39
+ ):
36
40
  """
37
41
  Initialize the QidianParser with the given configuration.
38
42
 
@@ -41,6 +45,7 @@ class QidianParser(BaseParser):
41
45
  super().__init__(config)
42
46
 
43
47
  # Extract and store parser flags from config
48
+ self._use_truncation = config.use_truncation
44
49
  self._decode_font: bool = config.decode_font
45
50
  self._save_font_debug: bool = config.save_font_debug
46
51
 
@@ -52,7 +57,7 @@ class QidianParser(BaseParser):
52
57
  DATA_DIR / "qidian" / "browser_state.cookies",
53
58
  DATA_DIR / "qidian" / "session_state.cookies",
54
59
  ]
55
- self._fuid: str = find_cookie_value(state_files, "ywguid")
60
+ self._fuid: str = fuid or find_cookie_value(state_files, "ywguid")
56
61
 
57
62
  self._font_ocr: FontOCR | None = None
58
63
  if self._decode_font:
@@ -9,6 +9,7 @@ from .helpers import (
9
9
  can_view_chapter,
10
10
  extract_chapter_info,
11
11
  find_ssr_page_context,
12
+ is_duplicated,
12
13
  is_encrypted,
13
14
  is_restricted_page,
14
15
  vip_status,
@@ -22,6 +23,7 @@ __all__ = [
22
23
  "vip_status",
23
24
  "can_view_chapter",
24
25
  "is_encrypted",
26
+ "is_duplicated",
25
27
  "QidianNodeDecryptor",
26
28
  "get_decryptor",
27
29
  ]
@@ -89,6 +89,15 @@ def can_view_chapter(ssr_data: dict[str, Any]) -> bool:
89
89
  return not (vip_status == 1 and is_buy == 0)
90
90
 
91
91
 
92
def is_duplicated(ssr_data: dict[str, Any]) -> bool:
    """
    Report whether the chapter is flagged as duplicated.

    The flag is read from the ``eFW`` field of the chapter info extracted
    from the SSR page context; a value of ``1`` means duplicated, and a
    missing field is treated as ``0`` (not duplicated).

    :param ssr_data: SSR page context passed to ``extract_chapter_info``.
    :return: True if ``eFW`` equals 1, otherwise False.
    """
    return bool(extract_chapter_info(ssr_data).get("eFW", 0) == 1)
99
+
100
+
92
101
  def is_encrypted(content: str | dict[str, Any]) -> bool:
93
102
  """
94
103
  Return True if content is encrypted.
@@ -66,6 +66,8 @@
66
66
  "download_downloading": "Downloading book {book_id} from {site}...",
67
67
  "download_prompt_parse": "Parse...",
68
68
  "download_book_ids": "One or more book IDs to process",
69
+ "download_option_start": "Start chapter ID (applies to the first book ID only)",
70
+ "download_option_end": "End chapter ID (applies to the first book ID only)",
69
71
  "login_description": "Description",
70
72
  "login_hint": "Hint",
71
73
  "login_manual_prompt": ">> Please complete login in your browser and press Enter to continue...",
@@ -66,6 +66,8 @@
66
66
  "download_downloading": "正在从 {site} 下载书籍 {book_id}...",
67
67
  "download_prompt_parse": "结束...",
68
68
  "download_book_ids": "要处理的一个或多个小说 ID",
69
+ "download_option_start": "起始章节 ID (仅用于第一个书籍 ID)",
70
+ "download_option_end": "结束章节 ID (仅用于第一个书籍 ID)",
69
71
  "login_description": "说明",
70
72
  "login_hint": "提示",
71
73
  "login_manual_prompt": ">> 请在浏览器中完成登录后按回车继续...",
@@ -8,6 +8,7 @@ novel_downloader.models
8
8
  from .browser import NewContextOptions
9
9
  from .chapter import ChapterDict
10
10
  from .config import (
11
+ BookConfig,
11
12
  DownloaderConfig,
12
13
  ExporterConfig,
13
14
  FetcherConfig,
@@ -39,6 +40,7 @@ from .types import (
39
40
 
40
41
  __all__ = [
41
42
  "NewContextOptions",
43
+ "BookConfig",
42
44
  "DownloaderConfig",
43
45
  "ParserConfig",
44
46
  "FetcherConfig",
@@ -17,6 +17,7 @@ strongly typed Python objects for safer and cleaner access.
17
17
  """
18
18
 
19
19
  from dataclasses import dataclass
20
+ from typing import NotRequired, TypedDict
20
21
 
21
22
  from .types import (
22
23
  BrowserType,
@@ -67,6 +68,7 @@ class DownloaderConfig:
67
68
  @dataclass
68
69
  class ParserConfig:
69
70
  cache_dir: str = "./novel_cache"
71
+ use_truncation: bool = True
70
72
  decode_font: bool = False
71
73
  use_freq: bool = False
72
74
  use_ocr: bool = True
@@ -98,3 +100,10 @@ class ExporterConfig:
98
100
  include_toc: bool = False
99
101
  include_picture: bool = False
100
102
  split_mode: SplitMode = "book"
103
+
104
+
105
+ class BookConfig(TypedDict):
106
+ book_id: str
107
+ start_id: NotRequired[str]
108
+ end_id: NotRequired[str]
109
+ ignore_ids: NotRequired[list[str]]
@@ -52,6 +52,7 @@ book_ids = [
52
52
  ]
53
53
  mode = "session" # browser / session
54
54
  login_required = true # 是否需要登录才能访问
55
+ use_truncation = true # 是否基于章节长度截断以避免重复内容
55
56
 
56
57
  [sites.biquge] # 笔趣阁
57
58
  book_ids = [
@@ -65,7 +65,13 @@ class HomeScreen(Screen): # type: ignore[misc]
65
65
  return
66
66
  id_list = {x.strip() for x in ids.split(",") if x.strip()}
67
67
  adapter = ConfigAdapter(config=self.app.config, site=str(site))
68
- asyncio.create_task(self._download(adapter, str(site), id_list))
68
+ # asyncio.create_task(self._download(adapter, str(site), id_list))
69
+ self.run_worker(
70
+ self._download(adapter, str(site), id_list),
71
+ name="download",
72
+ group="downloads",
73
+ description="正在下载书籍...",
74
+ )
69
75
 
70
76
  def _make_title_bar(self) -> Horizontal:
71
77
  return Horizontal(
@@ -106,12 +112,12 @@ class HomeScreen(Screen): # type: ignore[misc]
106
112
  self,
107
113
  adapter: ConfigAdapter,
108
114
  site: str,
109
- valid_book_ids: set[str],
115
+ book_ids: set[str],
110
116
  ) -> None:
111
117
  btn = self.query_one("#download", Button)
112
118
  btn.disabled = True
113
119
  try:
114
- logging.info(f"下载请求: {site} | {valid_book_ids}")
120
+ logging.info(f"下载请求: {site} | {book_ids}")
115
121
  downloader_cfg = adapter.get_downloader_config()
116
122
  fetcher_cfg = adapter.get_fetcher_config()
117
123
  parser_cfg = adapter.get_parser_config()
@@ -134,16 +140,17 @@ class HomeScreen(Screen): # type: ignore[misc]
134
140
  downloader = get_downloader(
135
141
  fetcher=fetcher,
136
142
  parser=parser,
137
- exporter=exporter,
138
143
  site=site,
139
144
  config=downloader_cfg,
140
145
  )
141
146
 
142
- for book_id in valid_book_ids:
147
+ for book_id in book_ids:
143
148
  logging.info(t("download_downloading", book_id=book_id, site=site))
144
149
  await downloader.download(
145
- book_id, progress_hook=self._update_progress
150
+ {"book_id": book_id},
151
+ progress_hook=self._update_progress,
146
152
  )
153
+ await asyncio.to_thread(exporter.export, book_id)
147
154
 
148
155
  if downloader_cfg.login_required and fetcher.is_logged_in:
149
156
  await fetcher.save_state()
@@ -116,35 +116,6 @@ QD_DECRYPT_SCRIPT_PATH = files("novel_downloader.resources.js_scripts").joinpath
116
116
  # Text Files
117
117
  BLACKLIST_PATH = files("novel_downloader.resources.text").joinpath("blacklist.txt")
118
118
 
119
- # -----------------------------------------------------------------------------
120
- # EPUB defaults
121
- # -----------------------------------------------------------------------------
122
- EPUB_IMAGE_FOLDER = "Images"
123
- EPUB_TEXT_FOLDER = "Text"
124
-
125
- EPUB_IMAGE_WRAPPER = (
126
- '<div class="duokan-image-single illus"><img src="../Images/{filename}" /></div>'
127
- )
128
-
129
- EPUB_OPTIONS = {
130
- # guide 是 EPUB 2 的一个部分, 包含封面, 目录, 索引等重要导航信息
131
- "epub2_guide": True,
132
- # landmark 是 EPUB 3 用来标识重要页面 (如目录, 封面, 起始页) 的 <nav> 结构
133
- "epub3_landmark": True,
134
- # EPUB 3 允许提供一个 page list, 让电子书在不同设备上仍然保持相对一致的分页结构
135
- "epub3_pages": True,
136
- # 这个名字会出现在 EPUB 阅读器的导航栏
137
- "landmark_title": "Guide",
138
- # 这个名字会显示在 EPUB 阅读器的分页导航栏
139
- "pages_title": "Pages",
140
- # 是否根据 book.spine 的排列顺序自动设置 EPUB 阅读器的 page-progression-direction
141
- "spine_direction": True,
142
- # 控制 EPUB 阅读器的默认翻页方向 (LTR 或 RTL)
143
- "package_direction": False,
144
- # 是否为 EPUB 书籍中的章节 添加播放顺序
145
- "play_order": {"enabled": True, "start_from": 1},
146
- }
147
-
148
119
  # ---------------------------------------------------------------------
149
120
  # Pretrained model registry (e.g. used in font recovery or OCR)
150
121
  # ---------------------------------------------------------------------
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python3
2
2
  """
3
- novel_downloader.utils.model_loader
4
- -----------------------------------
3
+ novel_downloader.utils.fontocr.model_loader
4
+ -------------------------------------------
5
5
 
6
6
  Utility functions for managing pre-trained model downloads.
7
7
 
@@ -25,7 +25,8 @@ from novel_downloader.utils.constants import (
25
25
  REC_IMAGE_SHAPE_MAP,
26
26
  )
27
27
  from novel_downloader.utils.hash_store import img_hash_store
28
- from novel_downloader.utils.model_loader import get_rec_chinese_char_model_dir
28
+
29
+ from .model_loader import get_rec_chinese_char_model_dir
29
30
 
30
31
  logger = logging.getLogger(__name__)
31
32
 
@@ -36,7 +36,8 @@ from novel_downloader.utils.constants import (
36
36
  REC_IMAGE_SHAPE_MAP,
37
37
  )
38
38
  from novel_downloader.utils.hash_store import img_hash_store
39
- from novel_downloader.utils.model_loader import (
39
+
40
+ from .model_loader import (
40
41
  get_rec_char_vector_dir,
41
42
  get_rec_chinese_char_model_dir,
42
43
  )
@@ -15,12 +15,19 @@ Submodules:
15
15
  from .chapter_formatting import format_chapter
16
16
  from .diff_display import diff_inline_display
17
17
  from .font_mapping import apply_font_mapping
18
- from .text_cleaning import clean_chapter_title, is_promotional_line
18
+ from .text_cleaning import (
19
+ clean_chapter_title,
20
+ content_prefix,
21
+ is_promotional_line,
22
+ truncate_half_lines,
23
+ )
19
24
 
20
25
  __all__ = [
21
26
  "apply_font_mapping",
22
27
  "format_chapter",
23
28
  "clean_chapter_title",
24
29
  "is_promotional_line",
30
+ "content_prefix",
31
+ "truncate_half_lines",
25
32
  "diff_inline_display",
26
33
  ]
@@ -6,6 +6,7 @@ novel_downloader.utils.text_utils.text_cleaning
6
6
  Tools for detecting and removing promotional or ad-like content from text.
7
7
  """
8
8
 
9
+ import math
9
10
  import re
10
11
 
11
12
  from novel_downloader.utils.file_utils.io import load_blacklisted_words
@@ -50,7 +51,57 @@ def is_promotional_line(line: str) -> bool:
50
51
  return False
51
52
 
52
53
 
54
+ def content_prefix(
55
+ text: str,
56
+ n: int,
57
+ ignore_chars: set[str] | None = None,
58
+ ) -> str:
59
+ """
60
+ Return the prefix of `text` containing the first `n` non-ignored characters.
61
+
62
+ :param text: The full input string.
63
+ :param n: Number of content characters to include.
64
+ :param ignore_chars: Characters to ignore when counting content.
65
+ :return: Truncated string preserving original whitespace and line breaks.
66
+ """
67
+ ignore = ignore_chars or set()
68
+ cnt = 0
69
+
70
+ for i, ch in enumerate(text):
71
+ if ch not in ignore:
72
+ cnt += 1
73
+ if cnt >= n:
74
+ return text[: i + 1]
75
+
76
+ return text
77
+
78
+
79
def truncate_half_lines(text: str) -> str:
    """
    Keep the first half of the non-empty lines (rounded up).

    Blank lines that occur before the last kept line are retained, so the
    paragraph spacing of the kept part is unchanged. Lines are split with
    ``str.splitlines`` and re-joined with ``"\\n"``.

    :param text: Full input text
    :return: Text up to and including the ceil(k / 2)-th non-empty line,
             where k is the number of non-empty lines; "" when the input
             has no non-empty line.
    """
    lines = text.splitlines()
    # Count content lines without materialising a throwaway list.
    keep_count = math.ceil(sum(1 for line in lines if line.strip()) / 2)

    # Nothing but blank lines (or empty input): there is no content to keep,
    # so return an empty string instead of a stray run of blank lines.
    if keep_count == 0:
        return ""

    seen = 0
    for idx, line in enumerate(lines):
        if line.strip():
            seen += 1
            if seen == keep_count:
                # Cut right after the last content line we want to keep.
                return "\n".join(lines[: idx + 1])

    # Not reachable (keep_count never exceeds the number of content lines);
    # kept as a safe fallback.
    return "\n".join(lines)
100
+
101
+
53
102
  __all__ = [
54
103
  "clean_chapter_title",
55
104
  "is_promotional_line",
105
+ "content_prefix",
106
+ "truncate_half_lines",
56
107
  ]