novel-downloader 1.5.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248) hide show
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +1 -3
  3. novel_downloader/cli/clean.py +21 -88
  4. novel_downloader/cli/config.py +26 -21
  5. novel_downloader/cli/download.py +79 -66
  6. novel_downloader/cli/export.py +17 -21
  7. novel_downloader/cli/main.py +1 -1
  8. novel_downloader/cli/search.py +62 -65
  9. novel_downloader/cli/ui.py +156 -0
  10. novel_downloader/config/__init__.py +8 -5
  11. novel_downloader/config/adapter.py +206 -209
  12. novel_downloader/config/{loader.py → file_io.py} +53 -26
  13. novel_downloader/core/__init__.py +5 -5
  14. novel_downloader/core/archived/deqixs/fetcher.py +115 -0
  15. novel_downloader/core/archived/deqixs/parser.py +132 -0
  16. novel_downloader/core/archived/deqixs/searcher.py +89 -0
  17. novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
  18. novel_downloader/core/archived/wanbengo/searcher.py +98 -0
  19. novel_downloader/core/archived/xshbook/searcher.py +93 -0
  20. novel_downloader/core/downloaders/__init__.py +3 -24
  21. novel_downloader/core/downloaders/base.py +49 -23
  22. novel_downloader/core/downloaders/common.py +191 -137
  23. novel_downloader/core/downloaders/qianbi.py +187 -146
  24. novel_downloader/core/downloaders/qidian.py +187 -141
  25. novel_downloader/core/downloaders/registry.py +4 -2
  26. novel_downloader/core/downloaders/signals.py +46 -0
  27. novel_downloader/core/exporters/__init__.py +3 -20
  28. novel_downloader/core/exporters/base.py +33 -37
  29. novel_downloader/core/exporters/common/__init__.py +1 -2
  30. novel_downloader/core/exporters/common/epub.py +15 -10
  31. novel_downloader/core/exporters/common/main_exporter.py +19 -12
  32. novel_downloader/core/exporters/common/txt.py +17 -12
  33. novel_downloader/core/exporters/epub_util.py +59 -29
  34. novel_downloader/core/exporters/linovelib/__init__.py +1 -0
  35. novel_downloader/core/exporters/linovelib/epub.py +23 -25
  36. novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
  37. novel_downloader/core/exporters/linovelib/txt.py +20 -14
  38. novel_downloader/core/exporters/qidian.py +2 -8
  39. novel_downloader/core/exporters/registry.py +4 -2
  40. novel_downloader/core/exporters/txt_util.py +7 -7
  41. novel_downloader/core/fetchers/__init__.py +54 -48
  42. novel_downloader/core/fetchers/aaatxt.py +83 -0
  43. novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
  44. novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
  45. novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
  46. novel_downloader/core/fetchers/dxmwx.py +110 -0
  47. novel_downloader/core/fetchers/eightnovel.py +139 -0
  48. novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
  49. novel_downloader/core/fetchers/guidaye.py +85 -0
  50. novel_downloader/core/fetchers/hetushu.py +92 -0
  51. novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
  52. novel_downloader/core/fetchers/ixdzs8.py +113 -0
  53. novel_downloader/core/fetchers/jpxs123.py +101 -0
  54. novel_downloader/core/fetchers/lewenn.py +83 -0
  55. novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
  56. novel_downloader/core/fetchers/piaotia.py +105 -0
  57. novel_downloader/core/fetchers/qbtr.py +101 -0
  58. novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
  59. novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +56 -64
  60. novel_downloader/core/fetchers/quanben5.py +92 -0
  61. novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
  62. novel_downloader/core/fetchers/registry.py +5 -16
  63. novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
  64. novel_downloader/core/fetchers/shencou.py +106 -0
  65. novel_downloader/core/fetchers/shuhaige.py +84 -0
  66. novel_downloader/core/fetchers/tongrenquan.py +84 -0
  67. novel_downloader/core/fetchers/ttkan.py +95 -0
  68. novel_downloader/core/fetchers/wanbengo.py +83 -0
  69. novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
  70. novel_downloader/core/fetchers/xiguashuwu.py +177 -0
  71. novel_downloader/core/fetchers/xs63b.py +171 -0
  72. novel_downloader/core/fetchers/xshbook.py +85 -0
  73. novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
  74. novel_downloader/core/fetchers/yibige.py +114 -0
  75. novel_downloader/core/interfaces/__init__.py +1 -9
  76. novel_downloader/core/interfaces/downloader.py +6 -2
  77. novel_downloader/core/interfaces/exporter.py +7 -7
  78. novel_downloader/core/interfaces/fetcher.py +6 -19
  79. novel_downloader/core/interfaces/parser.py +7 -8
  80. novel_downloader/core/interfaces/searcher.py +9 -1
  81. novel_downloader/core/parsers/__init__.py +49 -12
  82. novel_downloader/core/parsers/aaatxt.py +132 -0
  83. novel_downloader/core/parsers/b520.py +116 -0
  84. novel_downloader/core/parsers/base.py +64 -12
  85. novel_downloader/core/parsers/biquyuedu.py +133 -0
  86. novel_downloader/core/parsers/dxmwx.py +162 -0
  87. novel_downloader/core/parsers/eightnovel.py +224 -0
  88. novel_downloader/core/parsers/esjzone.py +64 -69
  89. novel_downloader/core/parsers/guidaye.py +128 -0
  90. novel_downloader/core/parsers/hetushu.py +139 -0
  91. novel_downloader/core/parsers/i25zw.py +137 -0
  92. novel_downloader/core/parsers/ixdzs8.py +186 -0
  93. novel_downloader/core/parsers/jpxs123.py +137 -0
  94. novel_downloader/core/parsers/lewenn.py +142 -0
  95. novel_downloader/core/parsers/linovelib.py +48 -64
  96. novel_downloader/core/parsers/piaotia.py +189 -0
  97. novel_downloader/core/parsers/qbtr.py +136 -0
  98. novel_downloader/core/parsers/qianbi.py +48 -50
  99. novel_downloader/core/parsers/qidian/main_parser.py +756 -48
  100. novel_downloader/core/parsers/qidian/utils/__init__.py +3 -21
  101. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
  102. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +4 -4
  103. novel_downloader/core/parsers/quanben5.py +103 -0
  104. novel_downloader/core/parsers/registry.py +5 -16
  105. novel_downloader/core/parsers/sfacg.py +38 -45
  106. novel_downloader/core/parsers/shencou.py +215 -0
  107. novel_downloader/core/parsers/shuhaige.py +111 -0
  108. novel_downloader/core/parsers/tongrenquan.py +116 -0
  109. novel_downloader/core/parsers/ttkan.py +132 -0
  110. novel_downloader/core/parsers/wanbengo.py +191 -0
  111. novel_downloader/core/parsers/xiaoshuowu.py +173 -0
  112. novel_downloader/core/parsers/xiguashuwu.py +429 -0
  113. novel_downloader/core/parsers/xs63b.py +161 -0
  114. novel_downloader/core/parsers/xshbook.py +134 -0
  115. novel_downloader/core/parsers/yamibo.py +87 -131
  116. novel_downloader/core/parsers/yibige.py +166 -0
  117. novel_downloader/core/searchers/__init__.py +34 -3
  118. novel_downloader/core/searchers/aaatxt.py +107 -0
  119. novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
  120. novel_downloader/core/searchers/base.py +112 -36
  121. novel_downloader/core/searchers/dxmwx.py +105 -0
  122. novel_downloader/core/searchers/eightnovel.py +84 -0
  123. novel_downloader/core/searchers/esjzone.py +43 -25
  124. novel_downloader/core/searchers/hetushu.py +92 -0
  125. novel_downloader/core/searchers/i25zw.py +93 -0
  126. novel_downloader/core/searchers/ixdzs8.py +107 -0
  127. novel_downloader/core/searchers/jpxs123.py +107 -0
  128. novel_downloader/core/searchers/piaotia.py +100 -0
  129. novel_downloader/core/searchers/qbtr.py +106 -0
  130. novel_downloader/core/searchers/qianbi.py +74 -40
  131. novel_downloader/core/searchers/quanben5.py +144 -0
  132. novel_downloader/core/searchers/registry.py +24 -8
  133. novel_downloader/core/searchers/shuhaige.py +124 -0
  134. novel_downloader/core/searchers/tongrenquan.py +110 -0
  135. novel_downloader/core/searchers/ttkan.py +92 -0
  136. novel_downloader/core/searchers/xiaoshuowu.py +122 -0
  137. novel_downloader/core/searchers/xiguashuwu.py +95 -0
  138. novel_downloader/core/searchers/xs63b.py +104 -0
  139. novel_downloader/locales/en.json +34 -85
  140. novel_downloader/locales/zh.json +35 -86
  141. novel_downloader/models/__init__.py +21 -22
  142. novel_downloader/models/book.py +44 -0
  143. novel_downloader/models/config.py +4 -37
  144. novel_downloader/models/login.py +1 -1
  145. novel_downloader/models/search.py +5 -0
  146. novel_downloader/resources/config/settings.toml +8 -70
  147. novel_downloader/resources/json/xiguashuwu.json +718 -0
  148. novel_downloader/utils/__init__.py +13 -24
  149. novel_downloader/utils/chapter_storage.py +5 -5
  150. novel_downloader/utils/constants.py +4 -31
  151. novel_downloader/utils/cookies.py +38 -35
  152. novel_downloader/utils/crypto_utils/__init__.py +7 -0
  153. novel_downloader/utils/crypto_utils/aes_util.py +90 -0
  154. novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
  155. novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
  156. novel_downloader/utils/crypto_utils/rc4.py +54 -0
  157. novel_downloader/utils/epub/__init__.py +3 -4
  158. novel_downloader/utils/epub/builder.py +6 -6
  159. novel_downloader/utils/epub/constants.py +62 -21
  160. novel_downloader/utils/epub/documents.py +95 -201
  161. novel_downloader/utils/epub/models.py +8 -22
  162. novel_downloader/utils/epub/utils.py +73 -106
  163. novel_downloader/utils/file_utils/__init__.py +2 -23
  164. novel_downloader/utils/file_utils/io.py +53 -188
  165. novel_downloader/utils/file_utils/normalize.py +1 -7
  166. novel_downloader/utils/file_utils/sanitize.py +4 -15
  167. novel_downloader/utils/fontocr/__init__.py +5 -14
  168. novel_downloader/utils/fontocr/core.py +216 -0
  169. novel_downloader/utils/fontocr/loader.py +50 -0
  170. novel_downloader/utils/logger.py +81 -65
  171. novel_downloader/utils/network.py +17 -41
  172. novel_downloader/utils/state.py +4 -90
  173. novel_downloader/utils/text_utils/__init__.py +1 -7
  174. novel_downloader/utils/text_utils/diff_display.py +5 -7
  175. novel_downloader/utils/text_utils/text_cleaner.py +39 -30
  176. novel_downloader/utils/text_utils/truncate_utils.py +3 -14
  177. novel_downloader/utils/time_utils/__init__.py +5 -11
  178. novel_downloader/utils/time_utils/datetime_utils.py +20 -29
  179. novel_downloader/utils/time_utils/sleep_utils.py +55 -49
  180. novel_downloader/web/__init__.py +13 -0
  181. novel_downloader/web/components/__init__.py +11 -0
  182. novel_downloader/web/components/navigation.py +35 -0
  183. novel_downloader/web/main.py +66 -0
  184. novel_downloader/web/pages/__init__.py +17 -0
  185. novel_downloader/web/pages/download.py +78 -0
  186. novel_downloader/web/pages/progress.py +147 -0
  187. novel_downloader/web/pages/search.py +329 -0
  188. novel_downloader/web/services/__init__.py +17 -0
  189. novel_downloader/web/services/client_dialog.py +164 -0
  190. novel_downloader/web/services/cred_broker.py +113 -0
  191. novel_downloader/web/services/cred_models.py +35 -0
  192. novel_downloader/web/services/task_manager.py +264 -0
  193. novel_downloader-2.0.1.dist-info/METADATA +172 -0
  194. novel_downloader-2.0.1.dist-info/RECORD +206 -0
  195. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/entry_points.txt +1 -1
  196. novel_downloader/core/downloaders/biquge.py +0 -29
  197. novel_downloader/core/downloaders/esjzone.py +0 -29
  198. novel_downloader/core/downloaders/linovelib.py +0 -29
  199. novel_downloader/core/downloaders/sfacg.py +0 -29
  200. novel_downloader/core/downloaders/yamibo.py +0 -29
  201. novel_downloader/core/exporters/biquge.py +0 -22
  202. novel_downloader/core/exporters/esjzone.py +0 -22
  203. novel_downloader/core/exporters/qianbi.py +0 -22
  204. novel_downloader/core/exporters/sfacg.py +0 -22
  205. novel_downloader/core/exporters/yamibo.py +0 -22
  206. novel_downloader/core/fetchers/base/__init__.py +0 -14
  207. novel_downloader/core/fetchers/base/browser.py +0 -422
  208. novel_downloader/core/fetchers/biquge/__init__.py +0 -14
  209. novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
  210. novel_downloader/core/fetchers/esjzone/browser.py +0 -209
  211. novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
  212. novel_downloader/core/fetchers/linovelib/browser.py +0 -198
  213. novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
  214. novel_downloader/core/fetchers/qidian/__init__.py +0 -14
  215. novel_downloader/core/fetchers/qidian/browser.py +0 -326
  216. novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
  217. novel_downloader/core/fetchers/sfacg/browser.py +0 -194
  218. novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
  219. novel_downloader/core/fetchers/yamibo/browser.py +0 -234
  220. novel_downloader/core/parsers/biquge.py +0 -139
  221. novel_downloader/core/parsers/qidian/book_info_parser.py +0 -90
  222. novel_downloader/core/parsers/qidian/chapter_encrypted.py +0 -528
  223. novel_downloader/core/parsers/qidian/chapter_normal.py +0 -157
  224. novel_downloader/core/parsers/qidian/chapter_router.py +0 -68
  225. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -114
  226. novel_downloader/models/chapter.py +0 -25
  227. novel_downloader/models/types.py +0 -13
  228. novel_downloader/tui/__init__.py +0 -7
  229. novel_downloader/tui/app.py +0 -32
  230. novel_downloader/tui/main.py +0 -17
  231. novel_downloader/tui/screens/__init__.py +0 -14
  232. novel_downloader/tui/screens/home.py +0 -198
  233. novel_downloader/tui/screens/login.py +0 -74
  234. novel_downloader/tui/styles/home_layout.tcss +0 -79
  235. novel_downloader/tui/widgets/richlog_handler.py +0 -24
  236. novel_downloader/utils/cache.py +0 -24
  237. novel_downloader/utils/crypto_utils.py +0 -71
  238. novel_downloader/utils/fontocr/hash_store.py +0 -280
  239. novel_downloader/utils/fontocr/hash_utils.py +0 -103
  240. novel_downloader/utils/fontocr/model_loader.py +0 -69
  241. novel_downloader/utils/fontocr/ocr_v1.py +0 -315
  242. novel_downloader/utils/fontocr/ocr_v2.py +0 -764
  243. novel_downloader/utils/fontocr/ocr_v3.py +0 -744
  244. novel_downloader-1.5.0.dist-info/METADATA +0 -196
  245. novel_downloader-1.5.0.dist-info/RECORD +0 -164
  246. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/WHEEL +0 -0
  247. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/licenses/LICENSE +0 -0
  248. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/top_level.txt +0 -0
@@ -8,35 +8,71 @@ Main parser class for handling Qidian HTML
8
8
 
9
9
  from __future__ import annotations
10
10
 
11
+ import json
11
12
  import logging
13
+ import re
14
+ from contextlib import suppress
15
+ from html import unescape
12
16
  from pathlib import Path
13
- from typing import TYPE_CHECKING, Any
17
+ from typing import Any, TypedDict
18
+
19
+ from lxml import html
14
20
 
15
21
  from novel_downloader.core.parsers.base import BaseParser
16
22
  from novel_downloader.core.parsers.registry import register_parser
17
- from novel_downloader.models import ChapterDict, ParserConfig
18
- from novel_downloader.utils import find_cookie_value
23
+ from novel_downloader.models import (
24
+ BookInfoDict,
25
+ ChapterDict,
26
+ ChapterInfoDict,
27
+ ParserConfig,
28
+ VolumeInfoDict,
29
+ )
30
+ from novel_downloader.utils import (
31
+ download,
32
+ truncate_half_lines,
33
+ )
19
34
  from novel_downloader.utils.constants import DATA_DIR
35
+ from novel_downloader.utils.cookies import get_cookie_value
36
+ from novel_downloader.utils.fontocr import get_font_ocr
20
37
 
21
- from .book_info_parser import parse_book_info
22
- from .chapter_router import parse_chapter
23
- from .utils import is_encrypted
38
+ from .utils import (
39
+ get_decryptor,
40
+ )
24
41
 
25
42
  logger = logging.getLogger(__name__)
26
43
 
27
- if TYPE_CHECKING:
28
- from novel_downloader.utils.fontocr import FontOCR
44
+
45
+ class Rule(TypedDict, total=False):
46
+ delete_all: bool
47
+ delete_first: bool
48
+ transform_flip_x: bool
49
+ append_start_char: str
50
+ append_end_char: str
51
+ append_start_attr: str
52
+ append_end_attr: str
53
+
54
+
55
+ class Rules(TypedDict):
56
+ # e.g., orders = ["i", "em", "span"]
57
+ orders: list[str]
58
+ # e.g., sy["sy-3"] -> Rule
59
+ sy: dict[str, Rule]
60
+ # e.g., p_rules["p3"]["i"] -> Rule
61
+ p_rules: dict[str, dict[str, Rule]]
29
62
 
30
63
 
31
64
  @register_parser(
32
65
  site_keys=["qidian", "qd"],
33
- backends=["session", "browser"],
34
66
  )
35
67
  class QidianParser(BaseParser):
36
68
  """
37
- Parser for Qidian site.
69
+ Parser for 起点中文网 site.
38
70
  """
39
71
 
72
+ _RE_P_DELIM = re.compile(r"(?i)<\s*p\s*>")
73
+ _RE_ATTR = re.compile(r"attr\(\s*([^)]+?)\s*\)", re.I)
74
+ _RE_SCALEX = re.compile(r"scalex\(\s*-?1\s*\)", re.I)
75
+
40
76
  def __init__(
41
77
  self,
42
78
  config: ParserConfig,
@@ -49,47 +85,20 @@ class QidianParser(BaseParser):
49
85
  """
50
86
  super().__init__(config)
51
87
 
52
- # Extract and store parser flags from config
53
- self._use_truncation = config.use_truncation
54
- self._decode_font: bool = config.decode_font
55
-
56
88
  self._fixed_font_dir: Path = self._base_cache_dir / "fixed_fonts"
57
89
  self._fixed_font_dir.mkdir(parents=True, exist_ok=True)
58
90
  self._debug_dir: Path = Path.cwd() / "debug"
59
91
 
60
92
  state_files = [
61
- DATA_DIR / "qidian" / "browser_state.cookies",
62
93
  DATA_DIR / "qidian" / "session_state.cookies",
63
94
  ]
64
- self._fuid: str = fuid or find_cookie_value(state_files, "ywguid")
65
-
66
- self._font_ocr: FontOCR | None = None
67
- if self._decode_font:
68
- try:
69
- from novel_downloader.utils.fontocr import FontOCR
70
- except ImportError:
71
- logger.warning(
72
- "[QidianParser] FontOCR not available, font decoding will skip"
73
- )
74
- else:
75
- self._font_ocr = FontOCR(
76
- cache_dir=self._base_cache_dir,
77
- use_freq=config.use_freq,
78
- use_ocr=config.use_ocr,
79
- use_vec=config.use_vec,
80
- batch_size=config.batch_size,
81
- gpu_mem=config.gpu_mem,
82
- gpu_id=config.gpu_id,
83
- ocr_weight=config.ocr_weight,
84
- vec_weight=config.vec_weight,
85
- font_debug=config.save_font_debug,
86
- )
95
+ self._fuid: str = fuid or get_cookie_value(state_files, "ywguid")
87
96
 
88
97
  def parse_book_info(
89
98
  self,
90
99
  html_list: list[str],
91
100
  **kwargs: Any,
92
- ) -> dict[str, Any]:
101
+ ) -> BookInfoDict | None:
93
102
  """
94
103
  Parse a book info page and extract metadata and chapter structure.
95
104
 
@@ -97,8 +106,65 @@ class QidianParser(BaseParser):
97
106
  :return: Parsed metadata and chapter structure as a dictionary.
98
107
  """
99
108
  if not html_list:
100
- return {}
101
- return parse_book_info(html_list[0])
109
+ return None
110
+
111
+ doc = html.fromstring(html_list[0])
112
+
113
+ book_name = self._first_str(doc.xpath('//h1[@id="bookName"]/text()'))
114
+ author = self._first_str(doc.xpath('//a[@class="writer-name"]/text()'))
115
+
116
+ book_id = doc.xpath('//a[@id="bookImg"]/@data-bid')[0]
117
+ cover_url = f"https://bookcover.yuewen.com/qdbimg/349573/{book_id}/600.webp"
118
+
119
+ update_time = self._first_str(
120
+ doc.xpath('//span[@class="update-time"]/text()'),
121
+ replaces=[("更新时间:", "")],
122
+ )
123
+ serial_status = self._first_str(
124
+ doc.xpath('//p[@class="book-attribute"]/span[1]/text()')
125
+ )
126
+
127
+ tags = [
128
+ t.strip()
129
+ for t in doc.xpath('//p[contains(@class,"all-label")]//a/text()')
130
+ if t.strip()
131
+ ]
132
+
133
+ word_count = self._first_str(doc.xpath('//p[@class="count"]/em[1]/text()'))
134
+ summary_brief = self._first_str(doc.xpath('//p[@class="intro"]/text()'))
135
+
136
+ raw_lines = [
137
+ s.strip()
138
+ for s in doc.xpath('//p[@id="book-intro-detail"]//text()')
139
+ if s.strip()
140
+ ]
141
+ summary = "\n".join(raw_lines)
142
+
143
+ volumes: list[VolumeInfoDict] = []
144
+ for vol in doc.xpath('//div[@id="allCatalog"]//div[@class="catalog-volume"]'):
145
+ vol_name = self._first_str(vol.xpath('.//h3[@class="volume-name"]/text()'))
146
+ vol_name = vol_name.split(chr(183))[0].strip()
147
+ chapters: list[ChapterInfoDict] = []
148
+ for li in vol.xpath('.//ul[contains(@class,"volume-chapters")]/li'):
149
+ title = self._first_str(li.xpath('.//a[@class="chapter-name"]/text()'))
150
+ url = self._first_str(li.xpath('.//a[@class="chapter-name"]/@href'))
151
+ cid = url.rstrip("/").split("/")[-1] if url else ""
152
+ chapters.append({"title": title, "url": url, "chapterId": cid})
153
+ volumes.append({"volume_name": vol_name, "chapters": chapters})
154
+
155
+ return {
156
+ "book_name": book_name,
157
+ "author": author,
158
+ "cover_url": cover_url,
159
+ "update_time": update_time,
160
+ "word_count": word_count,
161
+ "serial_status": serial_status,
162
+ "tags": tags,
163
+ "summary_brief": summary_brief,
164
+ "summary": summary,
165
+ "volumes": volumes,
166
+ "extra": {},
167
+ }
102
168
 
103
169
  def parse_chapter(
104
170
  self,
@@ -113,16 +179,658 @@ class QidianParser(BaseParser):
113
179
  """
114
180
  if not html_list:
115
181
  return None
116
- return parse_chapter(self, html_list[0], chapter_id)
182
+ try:
183
+ ssr_data = self._find_ssr_page_context(html_list[0])
184
+ chapter_info = self._extract_chapter_info(ssr_data)
185
+ if not chapter_info:
186
+ logger.warning(
187
+ "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
188
+ )
189
+ return None
190
+
191
+ if not self._can_view_chapter(chapter_info):
192
+ logger.warning(
193
+ "[Parser] Chapter '%s' is not purchased or inaccessible.",
194
+ chapter_id,
195
+ )
196
+ return None
197
+
198
+ if self._is_encrypted(ssr_data):
199
+ if not self._decode_font:
200
+ return None
201
+ return self.parse_encrypted_chapter(chapter_info, chapter_id)
202
+
203
+ return self.parse_normal_chapter(chapter_info, chapter_id)
204
+
205
+ except Exception as e:
206
+ logger.warning("[Parser] parse error for chapter '%s': %s", chapter_id, e)
207
+ return None
208
+
209
+ def parse_normal_chapter(
210
+ self,
211
+ chapter_info: dict[str, Any],
212
+ chapter_id: str,
213
+ ) -> ChapterDict | None:
214
+ """
215
+ Extract structured chapter info from a normal Qidian page.
216
+
217
+ :param chapter_info: Parsed chapter info block from ssr data.
218
+ :param chapter_id: Chapter identifier (string).
219
+ :return: a dictionary with keys like 'id', 'title', 'content', etc.
220
+ """
221
+ duplicated = self._is_duplicated(chapter_info)
222
+
223
+ title = chapter_info.get("chapterName", "Untitled")
224
+ raw_html = chapter_info.get("content", "")
225
+ chapter_id = chapter_info.get("chapterId", chapter_id)
226
+ fkp = chapter_info.get("fkp", "")
227
+ author_say = chapter_info.get("authorSay", "").strip()
228
+ update_time = chapter_info.get("updateTime", "")
229
+ update_timestamp = chapter_info.get("updateTimestamp", 0)
230
+ modify_time = chapter_info.get("modifyTime", 0)
231
+ word_count = chapter_info.get("actualWords", 0)
232
+ seq = chapter_info.get("seq")
233
+ volume = chapter_info.get("extra", {}).get("volumeName", "")
234
+
235
+ if self._is_vip(chapter_info):
236
+ decryptor = get_decryptor()
237
+ raw_html = decryptor.decrypt(raw_html, chapter_id, fkp, self._fuid)
238
+
239
+ parts = self._RE_P_DELIM.split(raw_html)
240
+ paragraphs = [unescape(p).strip() for p in parts if p.strip()]
241
+ chapter_text = "\n".join(paragraphs)
242
+ if not chapter_text:
243
+ return None
244
+
245
+ if self._use_truncation and duplicated:
246
+ chapter_text = truncate_half_lines(chapter_text)
247
+
248
+ return {
249
+ "id": str(chapter_id),
250
+ "title": title,
251
+ "content": chapter_text,
252
+ "extra": {
253
+ "author_say": author_say,
254
+ "updated_at": update_time,
255
+ "update_timestamp": update_timestamp,
256
+ "modify_time": modify_time,
257
+ "word_count": word_count,
258
+ "duplicated": duplicated,
259
+ "seq": seq,
260
+ "volume": volume,
261
+ "encrypted": False,
262
+ },
263
+ }
264
+
265
+ def parse_encrypted_chapter(
266
+ self,
267
+ chapter_info: dict[str, Any],
268
+ chapter_id: str,
269
+ ) -> ChapterDict | None:
270
+ """
271
+ Extract and return the formatted textual content of an encrypted chapter.
272
+
273
+ Steps:
274
+ 1. Decode and save randomFont bytes; download fixedFont via download().
275
+ 2. Parse CSS rules and save debug JSON.
276
+ 3. Render encrypted paragraphs, then run OCR font-mapping.
277
+ 4. Extract paragraph texts and format them.
278
+
279
+ :param chapter_info: Parsed chapter info block from ssr data.
280
+ :return: Chapter dict with 'id', 'title', 'content' and 'extra', or None if not parsable.
281
+ """
282
+ debug_dir = self._debug_dir / "qidian" / "font_debug" / chapter_id
283
+ if self._save_font_debug:
284
+ debug_dir.mkdir(parents=True, exist_ok=True)
285
+
286
+ duplicated = self._is_duplicated(chapter_info)
287
+
288
+ css_str = chapter_info["css"]
289
+ randomFont_str = chapter_info["randomFont"]
290
+ fixedFontWoff2_url = chapter_info["fixedFontWoff2"]
291
+
292
+ title = chapter_info.get("chapterName", "Untitled")
293
+ raw_html = chapter_info.get("content", "")
294
+ chapter_id = chapter_info.get("chapterId", chapter_id)
295
+ fkp = chapter_info.get("fkp", "")
296
+ author_say = chapter_info.get("authorSay", "").strip()
297
+ update_time = chapter_info.get("updateTime", "")
298
+ update_timestamp = chapter_info.get("updateTimestamp", 0)
299
+ modify_time = chapter_info.get("modifyTime", 0)
300
+ word_count = chapter_info.get("actualWords", 0)
301
+ seq = chapter_info.get("seq")
302
+ volume = chapter_info.get("extra", {}).get("volumeName", "")
303
+
304
+ # extract + save font
305
+ rf = json.loads(randomFont_str)
306
+ rand_path = self._base_cache_dir / "randomFont.ttf"
307
+ rand_path.parent.mkdir(parents=True, exist_ok=True)
308
+ rand_path.write_bytes(bytes(rf["data"]))
309
+
310
+ fixed_path = download(
311
+ url=fixedFontWoff2_url,
312
+ target_dir=self._fixed_font_dir,
313
+ )
314
+ if fixed_path is None:
315
+ logger.warning(
316
+ "[Parser] failed to download fixedfont for chapter '%s'", chapter_id
317
+ )
318
+ return None
319
+
320
+ # Extract and render paragraphs from HTML with CSS rules
321
+ if self._is_vip(chapter_info):
322
+ decryptor = get_decryptor()
323
+ raw_html = decryptor.decrypt(
324
+ raw_html,
325
+ chapter_id,
326
+ fkp,
327
+ self._fuid,
328
+ )
329
+
330
+ css_rules = self._parse_css_rules(css_str)
331
+ paragraphs_str, refl_list = self._render_visible_text(raw_html, css_rules)
332
+ if self._save_font_debug:
333
+ paragraphs_str_path = debug_dir / f"{chapter_id}_debug.txt"
334
+ paragraphs_str_path.write_text(paragraphs_str, encoding="utf-8")
335
+
336
+ # Run OCR + fallback mapping
337
+ char_set = {c for c in paragraphs_str if c not in {" ", "\n", "\u3000"}}
338
+ refl_set = set(refl_list)
339
+ char_set = char_set - refl_set
340
+ if self._save_font_debug:
341
+ char_sets_path = debug_dir / "char_set_debug.txt"
342
+ temp = f"char_set:\n{char_set}\n\nrefl_set:\n{refl_set}"
343
+ char_sets_path.write_text(
344
+ temp,
345
+ encoding="utf-8",
346
+ )
347
+
348
+ mapping_result = self._generate_font_map(
349
+ fixed_font_path=fixed_path,
350
+ random_font_path=rand_path,
351
+ char_set=char_set,
352
+ refl_set=refl_set,
353
+ cache_dir=self._base_cache_dir,
354
+ batch_size=self._config.batch_size,
355
+ )
356
+ if not mapping_result:
357
+ return None
358
+
359
+ if self._save_font_debug:
360
+ mapping_json_path = debug_dir / "font_mapping.json"
361
+ mapping_json_path.write_text(
362
+ json.dumps(mapping_result, ensure_ascii=False, indent=2),
363
+ encoding="utf-8",
364
+ )
365
+
366
+ # Reconstruct final readable text
367
+ original_text = self._apply_font_mapping(
368
+ text=paragraphs_str,
369
+ font_map=mapping_result,
370
+ )
371
+
372
+ final_paragraphs_str = "\n".join(
373
+ line.strip() for line in original_text.splitlines() if line.strip()
374
+ )
375
+ if self._use_truncation and duplicated:
376
+ final_paragraphs_str = truncate_half_lines(final_paragraphs_str)
377
+
378
+ return {
379
+ "id": str(chapter_id),
380
+ "title": str(title),
381
+ "content": final_paragraphs_str,
382
+ "extra": {
383
+ "author_say": author_say,
384
+ "updated_at": update_time,
385
+ "update_timestamp": update_timestamp,
386
+ "modify_time": modify_time,
387
+ "word_count": word_count,
388
+ "duplicated": duplicated,
389
+ "seq": seq,
390
+ "volume": volume,
391
+ "encrypted": True,
392
+ },
393
+ }
394
+
395
+ @staticmethod
396
+ def _find_ssr_page_context(html_str: str) -> dict[str, Any]:
397
+ """
398
+ Extract SSR JSON from <script id="vite-plugin-ssr_pageContext">.
399
+ """
400
+ tree = html.fromstring(html_str)
401
+ script = tree.xpath('//script[@id="vite-plugin-ssr_pageContext"]/text()')
402
+ return json.loads(script[0].strip()) if script else {}
403
+
404
+ @staticmethod
405
+ def _extract_chapter_info(ssr_data: dict[str, Any]) -> dict[str, Any]:
406
+ """
407
+ Extract the 'chapterInfo' dictionary from the SSR page context.
408
+
409
+ This handles nested key access and returns an empty dict if missing.
410
+
411
+ :param ssr_data: The full SSR data object from _find_ssr_page_context().
412
+ :return: A dict with chapter metadata such as chapterName, authorSay, etc.
413
+ """
414
+ page_context = ssr_data.get("pageContext", {})
415
+ page_props = page_context.get("pageProps", {})
416
+ page_data = page_props.get("pageData", {})
417
+ chapter_info = page_data.get("chapterInfo", {})
418
+ return chapter_info if isinstance(chapter_info, dict) else {}
117
419
 
118
- def is_encrypted(self, html_str: str) -> bool:
420
+ @staticmethod
421
+ def _is_restricted_page(html_str: str) -> bool:
422
+ """
423
+ Return True if page content indicates access restriction
424
+ (e.g. not subscribed/purchased).
425
+
426
+ :param html_str: Raw HTML string.
427
+ """
428
+ markers = ["这是VIP章节", "需要订阅", "订阅后才能阅读"]
429
+ return any(m in html_str for m in markers)
430
+
431
+ @classmethod
432
+ def _is_vip(cls, chapter_info: dict[str, Any]) -> bool:
433
+ """
434
+ :return: True if VIP, False otherwise.
435
+ """
436
+ vip_flag = chapter_info.get("vipStatus", 0)
437
+ fens_flag = chapter_info.get("fEnS", 0)
438
+ return bool(vip_flag == 1 and fens_flag != 0)
439
+
440
+ @classmethod
441
+ def _can_view_chapter(cls, chapter_info: dict[str, Any]) -> bool:
442
+ """
443
+ A chapter is not viewable if it is marked as VIP
444
+ and has not been purchased.
445
+
446
+ :return: True if viewable, False otherwise.
447
+ """
448
+ is_buy = chapter_info.get("isBuy", 0)
449
+ vip_status = chapter_info.get("vipStatus", 0)
450
+ return not (vip_status == 1 and is_buy == 0)
451
+
452
+ @classmethod
453
+ def _is_duplicated(cls, chapter_info: dict[str, Any]) -> bool:
454
+ """
455
+ Check if chapter is marked as duplicated (eFW = 1).
456
+ """
457
+ efw_flag = chapter_info.get("eFW", 0)
458
+ return bool(efw_flag == 1)
459
+
460
+ @classmethod
461
+ def _is_encrypted(cls, content: str | dict[str, Any]) -> bool:
119
462
  """
120
463
  Return True if content is encrypted.
121
464
 
122
- :param html: Raw HTML of the chapter page.
465
+ Chapter Encryption Status (cES):
466
+ * 0: 内容是'明文'
467
+ * 2: 字体加密
468
+
469
+ :param content: Raw HTML string, or the already-parsed SSR page-context dict.
470
+ :return: True if encrypted marker is found, else False.
471
+ """
472
+ ssr_data = (
473
+ cls._find_ssr_page_context(content) if isinstance(content, str) else content
474
+ )
475
+ chapter_info = cls._extract_chapter_info(ssr_data)
476
+ return int(chapter_info.get("cES", 0)) == 2
477
+
478
+ @staticmethod
479
def _generate_font_map(
    fixed_font_path: Path,
    random_font_path: Path,
    char_set: set[str],
    refl_set: set[str],
    cache_dir: Path,
    batch_size: int = 32,
) -> dict[str, str]:
    """
    Build a mapping from scrambled font chars to real chars.

    Uses OCR to decode and generate mapping from a fixed obfuscated font
    and a random obfuscated font. Results are cached in JSON at
    ``cache_dir / "fixed_font_map" / "<fixed_font_path.stem>.json"``.

    Characters already present in the cache are answered from it and are
    not rendered again. The remaining characters are rendered with
    whichever font contains them (the fixed font is checked first) and
    sent to the OCR helper in one ``predict`` call. Characters found in
    neither font are skipped.

    The caller's ``char_set`` / ``refl_set`` objects are not mutated; the
    names are only rebound locally to the not-yet-cached subsets.

    :param fixed_font_path: fixed font file.
    :param random_font_path: random font file.
    :param char_set: Characters to match directly.
    :param refl_set: Characters to match in flipped form.
    :param cache_dir: Directory to save/load cached results.
    :param batch_size: How many chars to OCR per batch.

    :return: { obf_char: real_char, ... }; an empty dict when
        ``get_font_ocr()`` returns a falsy value.
    """
    font_ocr = get_font_ocr()
    if not font_ocr:
        # No OCR backend available: nothing can be decoded.
        return {}

    mapping_result: dict[str, str] = {}
    fixed_map_file = cache_dir / "fixed_font_map" / f"{fixed_font_path.stem}.json"
    fixed_map_file.parent.mkdir(parents=True, exist_ok=True)

    # load existing cache
    try:
        with open(fixed_map_file, encoding="utf-8") as f:
            fixed_map = json.load(f)
        cached_chars = set(fixed_map.keys())
        mapping_result.update(
            {ch: fixed_map[ch] for ch in char_set if ch in fixed_map}
        )
        mapping_result.update(
            {ch: fixed_map[ch] for ch in refl_set if ch in fixed_map}
        )
        # Only characters missing from the cache still need OCR.
        char_set = char_set - cached_chars
        refl_set = refl_set - cached_chars
    except Exception:
        # Any failure (missing file, invalid JSON, unexpected structure)
        # is treated as "no cache": start from an empty map.
        fixed_map = {}
        cached_chars = set()

    # prepare font renderers and cmap sets
    fixed_chars = font_ocr.extract_font_charset(fixed_font_path)
    random_chars = font_ocr.extract_font_charset(random_font_path)
    fixed_font = font_ocr.load_render_font(fixed_font_path)
    random_font = font_ocr.load_render_font(random_font_path)

    # process normal and reflected sets together
    rendered = []
    for chars, reflect in [(char_set, False), (refl_set, True)]:
        for ch in chars:
            # Prefer the fixed font; fall back to the random font;
            # skip characters that neither font provides.
            if ch in fixed_chars:
                font = fixed_font
            elif ch in random_chars:
                font = random_font
            else:
                continue
            rendered.append(
                (ch, font_ocr.render_char_image_array(ch, font, reflect))
            )

    if rendered:
        # query OCR+vec simultaneously
        imgs_to_query = [img for _, img in rendered]
        fused = font_ocr.predict(imgs_to_query, batch_size=batch_size)

        # Keep every non-empty prediction; each one is unpacked as a pair
        # whose first item is the recognized character (the second item is
        # ignored here - presumably a score; confirm against font_ocr).
        for (ch, _), preds in zip(rendered, fused, strict=False):
            if not preds:
                continue
            real_char, _ = preds
            mapping_result[ch] = real_char
            # NOTE(review): results from the random font are stored in the
            # cache keyed by the fixed font's stem as well - confirm that
            # random-font code points cannot collide between fonts.
            fixed_map[ch] = real_char

    # persist updated fixed_map (best effort: a write failure is only logged)
    try:
        with open(fixed_map_file, "w", encoding="utf-8") as f:
            json.dump(fixed_map, f, ensure_ascii=False, indent=2)
    except Exception as e:
        logger.error("[FontOCR] Failed to save fixed map: %s", e)

    return mapping_result
568
+
569
+ @staticmethod
570
+ def _apply_font_mapping(text: str, font_map: dict[str, str]) -> str:
571
+ """
572
+ Replace each character in `text` using `font_map`,
573
+ leaving unmapped characters unchanged.
574
+
575
+ :param text: The input string, possibly containing obfuscated font chars.
576
+ :param font_map: A dict mapping obfuscated chars to real chars.
577
+ :return: The de-obfuscated text.
578
+ """
579
+ return "".join(font_map.get(ch, ch) for ch in text)
580
+
581
+ @staticmethod
582
+ def _only_tag(selector: str) -> str | None:
583
+ """
584
+ Normalize a selector into just its tag name for ordering.
585
+
586
+ Handles forms like 'i', 'em::before', '.p3 i', '.p2 span::after'.
587
+
588
+ Returns None if can't extract a tag.
589
+ """
590
+ sel = selector.strip()
591
+ # If it has spaces, take the rightmost simple selector
592
+ last = sel.split()[-1]
593
+ # Drop ::pseudo
594
+ last = last.split("::", 1)[0]
595
+ # If it's like 'span[attr=..]' keep 'span'
596
+ last = last.split("[", 1)[0]
597
+ # If it starts with '.', it's not a tag
598
+ if not last or last.startswith("."):
599
+ return None
600
+ return last
601
+
602
+ @staticmethod
603
+ def _parse_decls(block: str) -> list[tuple[str, str]]:
604
+ """
605
+ Parse 'name:value;...' inside a block. Tolerates quotes and attr().
606
+ """
607
+ decls: list[tuple[str, str]] = []
608
+ i = 0
609
+ n = len(block)
610
+ name: list[str] = []
611
+ val: list[str] = []
612
+ in_name = True
613
+ quote = None # track ' or "
614
+ while i < n:
615
+ c = block[i]
616
+ if quote:
617
+ # inside quotes
618
+ if c == "\\" and i + 1 < n:
619
+ # keep escaped char
620
+ (name if in_name else val).append(c)
621
+ i += 1
622
+ (name if in_name else val).append(block[i])
623
+ elif c == quote:
624
+ (name if in_name else val).append(c)
625
+ quote = None
626
+ else:
627
+ (name if in_name else val).append(c)
628
+ else:
629
+ if c in ("'", '"'):
630
+ (name if in_name else val).append(c)
631
+ quote = c
632
+ elif in_name and c == ":":
633
+ in_name = False
634
+ elif c == ";":
635
+ nm = "".join(name).strip().lower()
636
+ vl = "".join(val).strip()
637
+ if nm:
638
+ decls.append((nm, vl))
639
+ name.clear()
640
+ val.clear()
641
+ in_name = True
642
+ else:
643
+ (name if in_name else val).append(c)
644
+ i += 1
645
+
646
+ if name or val:
647
+ nm = "".join(name).strip().lower()
648
+ vl = "".join(val).strip()
649
+ if nm:
650
+ decls.append((nm, vl))
651
+ return decls
652
+
653
+ @classmethod
654
def _parse_css_rules(cls, css_str: str) -> Rules:
    """
    Parse a CSS string into normalized rendering Rules.

    Produces normalized Rules with:
      * orders: list[str] of tag names sorted by numeric 'order'
        (first occurrence wins when a tag appears more than once)
      * sy: '.sy-*' class rules, keyed by the bare class name
      * p_rules: '.p* <tag>' rules, indexed by p-class then tag

    Each rule is scanned as ``selector { declarations }`` by locating the
    next '{' and the following '}'; nested blocks are not interpreted.
    Selectors are lower-cased before classification. Scanning stops at
    the first unterminated block.

    Recognized declarations:
      * ``font-size: 0`` -> ``delete_first`` for '::first-letter'
        selectors, otherwise ``delete_all``
      * ``transform`` whose value (spaces removed) matches
        ``cls._RE_SCALEX`` -> ``transform_flip_x``
      * ``order: <int>`` -> contributes to ``orders``
      * ``content`` on '::before' / '::after' selectors -> an attribute
        name captured by ``cls._RE_ATTR``, or else the literal text with
        surrounding quotes removed

    :param css_str: Raw CSS text.
    :return: The populated Rules mapping.
    """
    rules: Rules = {"orders": [], "sy": {}, "p_rules": {}}
    order_pairs: list[tuple[str, int]] = []

    i = 0
    while True:
        b1 = css_str.find("{", i)
        if b1 == -1:
            break
        selector = css_str[i:b1].strip().lower()
        b2 = css_str.find("}", b1 + 1)
        if b2 == -1:
            break
        block = css_str[b1 + 1 : b2]
        i = b2 + 1

        decls = cls._parse_decls(block)

        new_rule: Rule = {}
        order_val: int | None = None

        for name, value in decls:
            v = value.strip()
            if name == "font-size" and v == "0":
                if "::first-letter" in selector:
                    new_rule["delete_first"] = True
                else:
                    new_rule["delete_all"] = True
            elif name == "transform":
                if cls._RE_SCALEX.search(v.replace(" ", "")):
                    new_rule["transform_flip_x"] = True
            elif name == "order":
                # a non-numeric order is ignored rather than fatal
                with suppress(ValueError, TypeError):
                    order_val = int(v)
            elif name == "content":
                if "::after" in selector:
                    m = cls._RE_ATTR.search(v)
                    if m:
                        new_rule["append_end_attr"] = m.group(1)
                    else:
                        # literal content: remove outer quotes
                        s = v.strip().strip("\"'")
                        new_rule["append_end_char"] = s
                elif "::before" in selector:
                    m = cls._RE_ATTR.search(v)
                    if m:
                        new_rule["append_start_attr"] = m.group(1)
                    else:
                        # literal content: remove outer quotes
                        s = v.strip().strip("\"'")
                        new_rule["append_start_char"] = s

        # classification
        if selector.startswith(".sy-"):
            # Key by the bare class name. Without dropping the pseudo
            # element, '.sy-1::before' would be stored as 'sy-1::before'
            # and could never be merged with '.sy-1' or found by a lookup
            # on the element's class name.
            key = selector.lstrip(".").split("::", 1)[0].strip()
            old = rules["sy"].get(key)
            rules["sy"][key] = {**old, **new_rule} if old else (new_rule or {})

        elif selector.startswith(".p") and " " in selector:
            p_cls, right = selector.split(" ", 1)
            p_cls = p_cls.lstrip(".")
            tag = cls._only_tag(right)
            if tag:
                prev = rules["p_rules"].setdefault(p_cls, {}).get(tag)
                rules["p_rules"][p_cls][tag] = (
                    {**prev, **new_rule} if prev else (new_rule or {})
                )

        # An empty selector (e.g. '{order:1}') has no tag to order.
        if order_val is not None and selector:
            tag_for_order = cls._only_tag(selector)
            if tag_for_order:
                order_pairs.append((tag_for_order, order_val))

    # normalize orders: stable sort by order value, then de-duplicate
    order_pairs.sort(key=lambda t: t[1])
    seen = set()
    orders: list[str] = []
    for tag, _ in order_pairs:
        if tag not in seen:
            seen.add(tag)
            orders.append(tag)
    rules["orders"] = orders
    return rules
742
+
743
+ @staticmethod
744
def _render_visible_text(html_str: str, rules: Rules) -> tuple[str, list[str]]:
    """
    Render the HTML using pre-parsed Rules.

    Every ``<p>`` element is turned into one output paragraph:

    * If one of its classes starting with "p" has an entry in
      ``rules["p_rules"]``, only children whose tag is listed in
      ``rules["orders"]`` contribute, and their rendered text is emitted
      in that global order (document order is kept within one tag).
    * Otherwise the paragraph text, each child's text and each child's
      tail are emitted in document order; ``<y class="sy-*">`` children
      with a matching ``rules["sy"]`` entry are rendered through it.

    :param html_str: HTML fragment containing the paragraphs.
    :param rules: Rules produced from the accompanying CSS.
    :return: ``(text, refl_list)`` where ``text`` joins the non-empty
        paragraphs with newlines and ``refl_list`` collects every
        non-empty string rendered by a rule flagged ``transform_flip_x``.
    """
    root = html.fromstring(html_str)
    tag_order = rules.get("orders") or []
    para_rules = rules.get("p_rules") or {}
    span_rules = rules.get("sy") or {}
    refl_list: list[str] = []
    out_paragraphs: list[str] = []

    def _class_list(el: html.HtmlElement) -> list[str]:
        raw = el.get("class")
        if not raw:
            return []
        return raw.split()

    def _apply_rule(el: html.HtmlElement, rule: Rule) -> str:
        # A fully hidden element contributes nothing (and is never flipped).
        if rule.get("delete_all"):
            return ""

        body = el.text or ""
        if rule.get("delete_first") and body:
            body = body[1:]

        rendered = ""
        if "append_start_char" in rule:
            rendered += rule["append_start_char"]
        if "append_start_attr" in rule:
            rendered += el.get(rule["append_start_attr"], "")
        rendered += body
        if "append_end_char" in rule:
            rendered += rule["append_end_char"]
        if "append_end_attr" in rule:
            rendered += el.get(rule["append_end_attr"], "")

        if rule.get("transform_flip_x") and rendered:
            refl_list.append(rendered)
        return rendered

    for para_el in root.findall(".//p"):
        p_key = next((c for c in _class_list(para_el) if c.startswith("p")), None)
        pieces: list[str] = []

        if p_key in para_rules:
            # Ordered paragraph: bucket children by tag, then flush the
            # buckets following the global order, duplicates preserved.
            buckets: dict[str, list[str]] = {}
            for child in para_el:
                tag = str(child.tag)
                if tag not in tag_order:
                    continue
                tag_rule = para_rules[p_key].get(tag, {})
                buckets.setdefault(tag, []).append(_apply_rule(child, tag_rule))
            for tag in tag_order:
                if tag in buckets:
                    pieces.extend(buckets[tag])
        else:
            # Plain paragraph: text and tails are kept in document order.
            if para_el.text:
                pieces.append(para_el.text)
            for child in para_el:
                sy_cls = None
                if str(child.tag) == "y":
                    sy_cls = next(
                        (c for c in _class_list(child) if c.startswith("sy-")), None
                    )
                if sy_cls and sy_cls in span_rules:
                    pieces.append(_apply_rule(child, span_rules[sy_cls]))
                else:
                    pieces.append(child.text or "")
                if child.tail:
                    pieces.append(child.tail)

        paragraph = "".join(pieces)
        if paragraph:
            out_paragraphs.append(paragraph)

    return "\n".join(out_paragraphs), refl_list