PyPI - novel-downloader - Versions diffs - 1.5.0__py3-none-any.whl → 2.0.0__py3-none-any.whl - Mend

novel-downloader 1.5.0 (py3-none-any.whl) → 2.0.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (241)

novel_downloader/__init__.py +1 -1
novel_downloader/cli/__init__.py +1 -3
novel_downloader/cli/clean.py +21 -88
novel_downloader/cli/config.py +26 -21
novel_downloader/cli/download.py +77 -64
novel_downloader/cli/export.py +16 -20
novel_downloader/cli/main.py +1 -1
novel_downloader/cli/search.py +62 -65
novel_downloader/cli/ui.py +156 -0
novel_downloader/config/__init__.py +8 -5
novel_downloader/config/adapter.py +65 -105
novel_downloader/config/{loader.py → file_io.py} +53 -26
novel_downloader/core/__init__.py +1 -0
novel_downloader/core/archived/deqixs/fetcher.py +115 -0
novel_downloader/core/archived/deqixs/parser.py +132 -0
novel_downloader/core/archived/deqixs/searcher.py +89 -0
novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
novel_downloader/core/archived/wanbengo/searcher.py +98 -0
novel_downloader/core/archived/xshbook/searcher.py +93 -0
novel_downloader/core/downloaders/__init__.py +3 -24
novel_downloader/core/downloaders/base.py +49 -23
novel_downloader/core/downloaders/common.py +191 -137
novel_downloader/core/downloaders/qianbi.py +187 -146
novel_downloader/core/downloaders/qidian.py +187 -141
novel_downloader/core/downloaders/registry.py +4 -2
novel_downloader/core/downloaders/signals.py +46 -0
novel_downloader/core/exporters/__init__.py +3 -20
novel_downloader/core/exporters/base.py +33 -37
novel_downloader/core/exporters/common/__init__.py +1 -2
novel_downloader/core/exporters/common/epub.py +15 -10
novel_downloader/core/exporters/common/main_exporter.py +19 -12
novel_downloader/core/exporters/common/txt.py +14 -9
novel_downloader/core/exporters/epub_util.py +59 -29
novel_downloader/core/exporters/linovelib/__init__.py +1 -0
novel_downloader/core/exporters/linovelib/epub.py +23 -25
novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
novel_downloader/core/exporters/linovelib/txt.py +17 -11
novel_downloader/core/exporters/qidian.py +2 -8
novel_downloader/core/exporters/registry.py +4 -2
novel_downloader/core/exporters/txt_util.py +7 -7
novel_downloader/core/fetchers/__init__.py +54 -48
novel_downloader/core/fetchers/aaatxt.py +83 -0
novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
novel_downloader/core/fetchers/dxmwx.py +110 -0
novel_downloader/core/fetchers/eightnovel.py +139 -0
novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
novel_downloader/core/fetchers/guidaye.py +85 -0
novel_downloader/core/fetchers/hetushu.py +92 -0
novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
novel_downloader/core/fetchers/ixdzs8.py +113 -0
novel_downloader/core/fetchers/jpxs123.py +101 -0
novel_downloader/core/fetchers/lewenn.py +83 -0
novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
novel_downloader/core/fetchers/piaotia.py +105 -0
novel_downloader/core/fetchers/qbtr.py +101 -0
novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +46 -39
novel_downloader/core/fetchers/quanben5.py +92 -0
novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
novel_downloader/core/fetchers/registry.py +5 -16
novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
novel_downloader/core/fetchers/shencou.py +106 -0
novel_downloader/core/fetchers/shuhaige.py +84 -0
novel_downloader/core/fetchers/tongrenquan.py +84 -0
novel_downloader/core/fetchers/ttkan.py +95 -0
novel_downloader/core/fetchers/wanbengo.py +83 -0
novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
novel_downloader/core/fetchers/xiguashuwu.py +177 -0
novel_downloader/core/fetchers/xs63b.py +171 -0
novel_downloader/core/fetchers/xshbook.py +85 -0
novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
novel_downloader/core/fetchers/yibige.py +114 -0
novel_downloader/core/interfaces/__init__.py +1 -9
novel_downloader/core/interfaces/downloader.py +6 -2
novel_downloader/core/interfaces/exporter.py +7 -7
novel_downloader/core/interfaces/fetcher.py +4 -17
novel_downloader/core/interfaces/parser.py +5 -6
novel_downloader/core/interfaces/searcher.py +9 -1
novel_downloader/core/parsers/__init__.py +49 -12
novel_downloader/core/parsers/aaatxt.py +132 -0
novel_downloader/core/parsers/b520.py +116 -0
novel_downloader/core/parsers/base.py +63 -12
novel_downloader/core/parsers/biquyuedu.py +133 -0
novel_downloader/core/parsers/dxmwx.py +162 -0
novel_downloader/core/parsers/eightnovel.py +224 -0
novel_downloader/core/parsers/esjzone.py +61 -66
novel_downloader/core/parsers/guidaye.py +128 -0
novel_downloader/core/parsers/hetushu.py +139 -0
novel_downloader/core/parsers/i25zw.py +137 -0
novel_downloader/core/parsers/ixdzs8.py +186 -0
novel_downloader/core/parsers/jpxs123.py +137 -0
novel_downloader/core/parsers/lewenn.py +142 -0
novel_downloader/core/parsers/linovelib.py +48 -64
novel_downloader/core/parsers/piaotia.py +189 -0
novel_downloader/core/parsers/qbtr.py +136 -0
novel_downloader/core/parsers/qianbi.py +48 -50
novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
novel_downloader/core/parsers/qidian/chapter_encrypted.py +272 -330
novel_downloader/core/parsers/qidian/chapter_normal.py +24 -55
novel_downloader/core/parsers/qidian/main_parser.py +11 -38
novel_downloader/core/parsers/qidian/utils/__init__.py +1 -0
novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
novel_downloader/core/parsers/quanben5.py +103 -0
novel_downloader/core/parsers/registry.py +5 -16
novel_downloader/core/parsers/sfacg.py +38 -45
novel_downloader/core/parsers/shencou.py +215 -0
novel_downloader/core/parsers/shuhaige.py +111 -0
novel_downloader/core/parsers/tongrenquan.py +116 -0
novel_downloader/core/parsers/ttkan.py +132 -0
novel_downloader/core/parsers/wanbengo.py +191 -0
novel_downloader/core/parsers/xiaoshuowu.py +173 -0
novel_downloader/core/parsers/xiguashuwu.py +435 -0
novel_downloader/core/parsers/xs63b.py +161 -0
novel_downloader/core/parsers/xshbook.py +134 -0
novel_downloader/core/parsers/yamibo.py +87 -131
novel_downloader/core/parsers/yibige.py +166 -0
novel_downloader/core/searchers/__init__.py +34 -3
novel_downloader/core/searchers/aaatxt.py +107 -0
novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
novel_downloader/core/searchers/base.py +112 -36
novel_downloader/core/searchers/dxmwx.py +105 -0
novel_downloader/core/searchers/eightnovel.py +84 -0
novel_downloader/core/searchers/esjzone.py +43 -25
novel_downloader/core/searchers/hetushu.py +92 -0
novel_downloader/core/searchers/i25zw.py +93 -0
novel_downloader/core/searchers/ixdzs8.py +107 -0
novel_downloader/core/searchers/jpxs123.py +107 -0
novel_downloader/core/searchers/piaotia.py +100 -0
novel_downloader/core/searchers/qbtr.py +106 -0
novel_downloader/core/searchers/qianbi.py +74 -40
novel_downloader/core/searchers/quanben5.py +144 -0
novel_downloader/core/searchers/registry.py +24 -8
novel_downloader/core/searchers/shuhaige.py +124 -0
novel_downloader/core/searchers/tongrenquan.py +110 -0
novel_downloader/core/searchers/ttkan.py +92 -0
novel_downloader/core/searchers/xiaoshuowu.py +122 -0
novel_downloader/core/searchers/xiguashuwu.py +95 -0
novel_downloader/core/searchers/xs63b.py +104 -0
novel_downloader/locales/en.json +31 -82
novel_downloader/locales/zh.json +32 -83
novel_downloader/models/__init__.py +21 -22
novel_downloader/models/book.py +44 -0
novel_downloader/models/config.py +4 -37
novel_downloader/models/login.py +1 -1
novel_downloader/models/search.py +5 -0
novel_downloader/resources/config/settings.toml +8 -70
novel_downloader/resources/json/xiguashuwu.json +718 -0
novel_downloader/utils/__init__.py +13 -22
novel_downloader/utils/chapter_storage.py +3 -2
novel_downloader/utils/constants.py +4 -29
novel_downloader/utils/cookies.py +6 -18
novel_downloader/utils/crypto_utils/__init__.py +13 -0
novel_downloader/utils/crypto_utils/aes_util.py +90 -0
novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
novel_downloader/utils/epub/__init__.py +1 -1
novel_downloader/utils/epub/constants.py +57 -16
novel_downloader/utils/epub/documents.py +88 -194
novel_downloader/utils/epub/models.py +0 -14
novel_downloader/utils/epub/utils.py +63 -96
novel_downloader/utils/file_utils/__init__.py +2 -23
novel_downloader/utils/file_utils/io.py +3 -113
novel_downloader/utils/file_utils/sanitize.py +0 -4
novel_downloader/utils/fontocr.py +207 -0
novel_downloader/utils/logger.py +8 -16
novel_downloader/utils/network.py +2 -2
novel_downloader/utils/state.py +4 -90
novel_downloader/utils/text_utils/__init__.py +1 -7
novel_downloader/utils/text_utils/diff_display.py +5 -7
novel_downloader/utils/time_utils/__init__.py +5 -11
novel_downloader/utils/time_utils/datetime_utils.py +20 -29
novel_downloader/utils/time_utils/sleep_utils.py +4 -8
novel_downloader/web/__init__.py +13 -0
novel_downloader/web/components/__init__.py +11 -0
novel_downloader/web/components/navigation.py +35 -0
novel_downloader/web/main.py +66 -0
novel_downloader/web/pages/__init__.py +17 -0
novel_downloader/web/pages/download.py +78 -0
novel_downloader/web/pages/progress.py +147 -0
novel_downloader/web/pages/search.py +329 -0
novel_downloader/web/services/__init__.py +17 -0
novel_downloader/web/services/client_dialog.py +164 -0
novel_downloader/web/services/cred_broker.py +113 -0
novel_downloader/web/services/cred_models.py +35 -0
novel_downloader/web/services/task_manager.py +264 -0
novel_downloader-2.0.0.dist-info/METADATA +171 -0
novel_downloader-2.0.0.dist-info/RECORD +210 -0
{novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
novel_downloader/core/downloaders/biquge.py +0 -29
novel_downloader/core/downloaders/esjzone.py +0 -29
novel_downloader/core/downloaders/linovelib.py +0 -29
novel_downloader/core/downloaders/sfacg.py +0 -29
novel_downloader/core/downloaders/yamibo.py +0 -29
novel_downloader/core/exporters/biquge.py +0 -22
novel_downloader/core/exporters/esjzone.py +0 -22
novel_downloader/core/exporters/qianbi.py +0 -22
novel_downloader/core/exporters/sfacg.py +0 -22
novel_downloader/core/exporters/yamibo.py +0 -22
novel_downloader/core/fetchers/base/__init__.py +0 -14
novel_downloader/core/fetchers/base/browser.py +0 -422
novel_downloader/core/fetchers/biquge/__init__.py +0 -14
novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
novel_downloader/core/fetchers/esjzone/browser.py +0 -209
novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
novel_downloader/core/fetchers/linovelib/browser.py +0 -198
novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
novel_downloader/core/fetchers/qidian/__init__.py +0 -14
novel_downloader/core/fetchers/qidian/browser.py +0 -326
novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
novel_downloader/core/fetchers/sfacg/browser.py +0 -194
novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
novel_downloader/core/fetchers/yamibo/browser.py +0 -234
novel_downloader/core/parsers/biquge.py +0 -139
novel_downloader/models/chapter.py +0 -25
novel_downloader/models/types.py +0 -13
novel_downloader/tui/__init__.py +0 -7
novel_downloader/tui/app.py +0 -32
novel_downloader/tui/main.py +0 -17
novel_downloader/tui/screens/__init__.py +0 -14
novel_downloader/tui/screens/home.py +0 -198
novel_downloader/tui/screens/login.py +0 -74
novel_downloader/tui/styles/home_layout.tcss +0 -79
novel_downloader/tui/widgets/richlog_handler.py +0 -24
novel_downloader/utils/cache.py +0 -24
novel_downloader/utils/fontocr/__init__.py +0 -22
novel_downloader/utils/fontocr/hash_store.py +0 -280
novel_downloader/utils/fontocr/hash_utils.py +0 -103
novel_downloader/utils/fontocr/model_loader.py +0 -69
novel_downloader/utils/fontocr/ocr_v1.py +0 -315
novel_downloader/utils/fontocr/ocr_v2.py +0 -764
novel_downloader/utils/fontocr/ocr_v3.py +0 -744
novel_downloader-1.5.0.dist-info/METADATA +0 -196
novel_downloader-1.5.0.dist-info/RECORD +0 -164
{novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
{novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
{novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0

novel_downloader/core/parsers/qidian/chapter_encrypted.py (changed)

@@ -11,9 +11,10 @@ from __future__ import annotations
 import json
 import logging
-from typing import TYPE_CHECKING, Any
+import re
+from contextlib import suppress
+from typing import TYPE_CHECKING, TypedDict
-import tinycss2
 from lxml import html
 from novel_downloader.models import ChapterDict
@@ -29,13 +30,36 @@ from .utils import (
     is_duplicated,
     vip_status,
 )
+from .utils.fontmap_recover import (
+    apply_font_mapping,
+    generate_font_map,
+)
 if TYPE_CHECKING:
     from .main_parser import QidianParser
 logger = logging.getLogger(__name__)
-IGNORED_CLASS_LISTS = {"title", "review"}
-NON_CONTENT_KEYWORDS = {"旧版", "反馈", "扫码"}
+_RE_ATTR = re.compile(r"attr\(\s*([^)]+?)\s*\)", re.I)
+_RE_SCALEX = re.compile(r"scalex\(\s*-?1\s*\)", re.I)
+class Rule(TypedDict, total=False):
+    delete_all: bool
+    delete_first: bool
+    transform_flip_x: bool
+    append_start_char: str
+    append_end_char: str
+    append_start_attr: str
+    append_end_attr: str
+class Rules(TypedDict):
+    # e.g., orders = ["i", "em", "span"]
+    orders: list[str]
+    # e.g., sy["sy-3"] -> Rule
+    sy: dict[str, Rule]
+    # e.g., p_rules["p3"]["i"] -> Rule
+    p_rules: dict[str, dict[str, Rule]]
 def parse_encrypted_chapter(
@@ -58,7 +82,7 @@ def parse_encrypted_chapter(
     :return: Formatted chapter text or empty string if not parsable.
     """
     try:
-        if not (parser._decode_font and parser._font_ocr):
+        if not parser._decode_font:
             return None
         ssr_data = find_ssr_page_context(html_str)
         chapter_info = extract_chapter_info(ssr_data)
@@ -104,47 +128,21 @@ def parse_encrypted_chapter(
             raise ValueError("fixed_path is None: failed to download font")
         # Extract and render paragraphs from HTML with CSS rules
-        main_paragraphs = extract_paragraphs_recursively(html_str, chapter_id)
-        if not main_paragraphs or contains_keywords(
-            main_paragraphs, NON_CONTENT_KEYWORDS
-        ):
-            if vip_status(ssr_data):
-                try:
-                    decryptor = get_decryptor()
-                    raw_html = decryptor.decrypt(
-                        raw_html,
-                        chapter_id,
-                        fkp,
-                        parser._fuid,
-                    )
-                except Exception as e:
-                    logger.error(
-                        "[Parser] decryption failed for '%s': %s", chapter_id, e
-                    )
-                    return None
-            main_paragraphs = extract_paragraphs_recursively(raw_html, chapter_id)
-        if parser.save_font_debug:
-            main_paragraphs_path = debug_dir / "main_paragraphs_debug.json"
-            main_paragraphs_path.write_text(
-                json.dumps(main_paragraphs, ensure_ascii=False, indent=2),
-                encoding="utf-8",
-            )
-        paragraphs_rules = parse_rule(css_str)
-        if parser.save_font_debug:
-            paragraphs_rules_path = debug_dir / "paragraphs_rules_debug.json"
-            paragraphs_rules_path.write_text(
-                json.dumps(paragraphs_rules, ensure_ascii=False, indent=2),
-                encoding="utf-8",
-            )
-        end_number = parse_end_number(main_paragraphs, paragraphs_rules)
-        paragraphs_str, refl_list = render_paragraphs(
-            main_paragraphs,
-            paragraphs_rules,
-            end_number,
-        )
+        if vip_status(ssr_data):
+            try:
+                decryptor = get_decryptor()
+                raw_html = decryptor.decrypt(
+                    raw_html,
+                    chapter_id,
+                    fkp,
+                    parser._fuid,
+                )
+            except Exception as e:
+                logger.error("[Parser] decryption failed for '%s': %s", chapter_id, e)
+                return None
+        css_rules = parse_css_rules(css_str)
+        paragraphs_str, refl_list = render_visible_text(raw_html, css_rules)
         if parser.save_font_debug:
             paragraphs_str_path = debug_dir / f"{chapter_id}_debug.txt"
             paragraphs_str_path.write_text(paragraphs_str, encoding="utf-8")
@@ -161,13 +159,17 @@ def parse_encrypted_chapter(
                 encoding="utf-8",
             )
-        mapping_result = parser._font_ocr.generate_font_map(
+        mapping_result = generate_font_map(
             fixed_font_path=fixed_path,
             random_font_path=rand_path,
             char_set=char_set,
             refl_set=refl_set,
-            chapter_id=chapter_id,
+            cache_dir=parser._base_cache_dir,
+            batch_size=parser._config.batch_size,
         )
+        if not mapping_result:
+            return None
         if parser.save_font_debug:
             mapping_json_path = debug_dir / "font_mapping.json"
             mapping_json_path.write_text(
@@ -176,12 +178,12 @@ def parse_encrypted_chapter(
             )
         # Reconstruct final readable text
-        original_text = parser._font_ocr.apply_font_mapping(
+        original_text = apply_font_mapping(
             text=paragraphs_str,
             font_map=mapping_result,
         )
-        final_paragraphs_str = "\n\n".join(
+        final_paragraphs_str = "\n".join(
             line.strip() for line in original_text.splitlines() if line.strip()
         )
         if parser._use_truncation and duplicated:
@@ -211,318 +213,258 @@ def parse_encrypted_chapter(
     return None
-def extract_paragraphs_recursively(
-    html_str: str,
-    chapter_id: str,
-) -> list[dict[str, Any]]:
-    def parse_element(elem: html.HtmlElement) -> dict[str, Any]:
-        class_attr = elem.attrib.get("class", "")
-        class_list = class_attr.split() if isinstance(class_attr, str) else class_attr
-        if "review" in class_list:
-            return {}
-        # Build attrs with class as list
-        attrs = {k: v.split() if k == "class" else v for k, v in elem.attrib.items()}
-        node: dict[str, Any] = {
-            "tag": elem.tag,
-            "attrs": attrs,
-            "data": [],
-        }
-        # Append entire elem.text if present (no splitting)
-        if elem.text:
-            node["data"].append(elem.text)
-        # Recurse into children
-        for child in elem.iterchildren(tag=None):
-            child_dict = parse_element(child)
-            if child_dict:
-                node["data"].append(child_dict)
-            # Append entire tail string (no split)
-            if child.tail:
-                node["data"].append(child.tail)
-        return node
-    tree = html.fromstring(html_str)
-    # Try to find <main id="c-{chapter_id}">
-    main_elem = tree.xpath(f'//main[@id="c-{chapter_id}"]')
-    search_root = main_elem[0] if main_elem else tree
-    return [parse_element(p) for p in search_root.findall(".//p")]
-def parse_rule(css_str: str) -> dict[str, Any]:
+def _only_tag(selector: str) -> str | None:
     """
-    Parse a CSS string and extract style rules for rendering.
+    Normalize a selector into just its tag name for ordering.
-    Handles:
-    - font-size:0 (mark for deletion)
-    - scaleX(-1) (mark as mirrored)
-    - ::before / ::after with content or attr()
-    - class + tag selector mapping
-    - custom rendering order via 'order'
+    Handles forms like 'i', 'em::before', '.p3 i', '.p2 span::after'.
-    :param css_str: Raw CSS stylesheet string.
-    :return: Dict with "rules" and "orders" for rendering.
+    Returns None if can't extract a tag.
     """
-    rules: dict[str, Any] = {}
-    orders = []
-    stylesheet = tinycss2.parse_stylesheet(
-        css_str, skip_comments=True, skip_whitespace=True
-    )
-    for rule in stylesheet:
-        if rule.type != "qualified-rule":
-            continue
-        selector = tinycss2.serialize(rule.prelude).strip()
-        declarations = tinycss2.parse_declaration_list(rule.content)
-        parsed = {}
-        order_val = None
-        for decl in declarations:
-            if decl.type != "declaration":
-                continue
-            name = decl.lower_name
-            value = tinycss2.serialize(decl.value).strip()
-            if name == "font-size" and value == "0":
+    sel = selector.strip()
+    # If it has spaces, take the rightmost simple selector
+    last = sel.split()[-1]
+    # Drop ::pseudo
+    last = last.split("::", 1)[0]
+    # If it's like 'span[attr=..]' keep 'span'
+    last = last.split("[", 1)[0]
+    # If it starts with '.', it's not a tag
+    if not last or last.startswith("."):
+        return None
+    return last
+def _parse_decls(block: str) -> list[tuple[str, str]]:
+    """
+    Parse 'name:value;...' inside a block. Tolerates quotes and attr().
+    """
+    decls: list[tuple[str, str]] = []
+    i = 0
+    n = len(block)
+    name: list[str] = []
+    val: list[str] = []
+    in_name = True
+    quote = None  # track ' or "
+    while i < n:
+        c = block[i]
+        if quote:
+            # inside quotes
+            if c == "\\" and i + 1 < n:
+                # keep escaped char
+                (name if in_name else val).append(c)
+                i += 1
+                (name if in_name else val).append(block[i])
+            elif c == quote:
+                (name if in_name else val).append(c)
+                quote = None
+            else:
+                (name if in_name else val).append(c)
+        else:
+            if c in ("'", '"'):
+                (name if in_name else val).append(c)
+                quote = c
+            elif in_name and c == ":":
+                in_name = False
+            elif c == ";":
+                nm = "".join(name).strip().lower()
+                vl = "".join(val).strip()
+                if nm:
+                    decls.append((nm, vl))
+                name.clear()
+                val.clear()
+                in_name = True
+            else:
+                (name if in_name else val).append(c)
+        i += 1
+    if name or val:
+        nm = "".join(name).strip().lower()
+        vl = "".join(val).strip()
+        if nm:
+            decls.append((nm, vl))
+    return decls
+def parse_css_rules(css_str: str) -> Rules:
+    """
+    Produces normalized Rules with:
+      - orders: list[str] of tag names sorted by numeric 'order'
+      - sy: '.sy-*' class rules
+      - p_rules: '.p* <tag>' rules, indexed by p-class then tag
+    """
+    rules: Rules = {"orders": [], "sy": {}, "p_rules": {}}
+    order_pairs: list[tuple[str, int]] = []
+    i = 0
+    while True:
+        b1 = css_str.find("{", i)
+        if b1 == -1:
+            break
+        selector = css_str[i:b1].strip().lower()
+        b2 = css_str.find("}", b1 + 1)
+        if b2 == -1:
+            break
+        block = css_str[b1 + 1 : b2]
+        i = b2 + 1
+        decls = _parse_decls(block)
+        new_rule: Rule = {}
+        order_val: int | None = None
+        for name, value in decls:
+            v = value.strip()
+            if name == "font-size" and v == "0":
                 if "::first-letter" in selector:
-                    parsed["delete-first"] = True
+                    new_rule["delete_first"] = True
                 else:
-                    parsed["delete-all"] = True
-            elif name == "transform" and value.lower() == "scalex(-1)":
-                parsed["transform-x_-1"] = True
+                    new_rule["delete_all"] = True
+            elif name == "transform":
+                if _RE_SCALEX.search(v.replace(" ", "")):
+                    new_rule["transform_flip_x"] = True
             elif name == "order":
-                order_val = value
+                with suppress(ValueError, TypeError):
+                    order_val = int(v)
             elif name == "content":
+                # normalize: remove outer quotes
                 if "::after" in selector:
-                    if "attr(" in value:
-                        parsed["append-end-attr"] = value.split("attr(")[1].split(")")[
-                            0
-                        ]
+                    m = _RE_ATTR.search(v)
+                    if m:
+                        new_rule["append_end_attr"] = m.group(1)
                     else:
-                        parsed["append-end-char"] = value.strip("\"'")
+                        s = v.strip().strip("\"'")
+                        new_rule["append_end_char"] = s
                 elif "::before" in selector:
-                    if "attr(" in value:
-                        parsed["append-start-attr"] = value.split("attr(")[1].split(
-                            ")"
-                        )[0]
+                    m = _RE_ATTR.search(v)
+                    if m:
+                        new_rule["append_start_attr"] = m.group(1)
                     else:
-                        parsed["append-start-char"] = value.strip("\"'")
+                        s = v.strip().strip("\"'")
+                        new_rule["append_start_char"] = s
-        # Store in structure
+        # classification
         if selector.startswith(".sy-"):
-            rules.setdefault("sy", {})[selector[1:]] = parsed
+            key = selector.lstrip(".")
+            old = rules["sy"].get(key)
+            rules["sy"][key] = {**old, **new_rule} if old else (new_rule or {})
         elif selector.startswith(".p") and " " in selector:
-            class_str, tag_part = selector.split(" ", 1)
-            class_str = class_str.lstrip(".")
-            tag_part = tag_part.split("::")[0]
-            rules.setdefault(class_str, {}).setdefault(tag_part, {}).update(parsed)
+            p_cls, right = selector.split(" ", 1)
+            p_cls = p_cls.lstrip(".")
+            tag = _only_tag(right)
+            if tag:
+                prev = rules["p_rules"].setdefault(p_cls, {}).get(tag)
+                rules["p_rules"][p_cls][tag] = (
+                    {**prev, **new_rule} if prev else (new_rule or {})
+                )
+        if order_val is not None:
+            tag_for_order = _only_tag(selector)
+            if tag_for_order:
+                order_pairs.append((tag_for_order, order_val))
+    # normalize orders
+    order_pairs.sort(key=lambda t: t[1])
+    seen = set()
+    orders: list[str] = []
+    for tag, _num in order_pairs:
+        if tag not in seen:
+            seen.add(tag)
+            orders.append(tag)
+    rules["orders"] = orders
+    return rules
+def render_visible_text(html_str: str, rules: Rules) -> tuple[str, list[str]]:
+    """
+    Renderer the HTML using pre-parsed Rules.
+    """
+    tree = html.fromstring(html_str)
+    paragraphs_out: list[str] = []
+    refl_list: list[str] = []
+    orders = rules.get("orders") or []
+    p_rules = rules.get("p_rules") or {}
+    sy_rules = rules.get("sy") or {}
-        if order_val:
-            orders.append((selector, order_val))
+    def _class_list(el: html.HtmlElement) -> list[str]:
+        cls = el.get("class")
+        return cls.split() if cls else []
-    orders.sort(key=lambda x: int(x[1]))
-    return {"rules": rules, "orders": orders}
+    def _apply_rule(el: html.HtmlElement, rule: Rule) -> str:
+        if rule.get("delete_all"):
+            return ""
+        parts: list[str] = []
+        if "append_start_char" in rule:
+            parts.append(rule["append_start_char"])
+        if "append_start_attr" in rule:
+            parts.append(el.get(rule["append_start_attr"], ""))
-def render_paragraphs(
-    main_paragraphs: list[dict[str, Any]],
-    rules: dict[str, Any],
-    end_number: str = "",
-) -> tuple[str, list[str]]:
-    """
-    Applies the parsed CSS rules to the paragraph structure and
-    reconstructs the visible text.
+        text = el.text or ""
+        if rule.get("delete_first") and text:
+            text = text[1:]
+        parts.append(text)
-    Handles special class styles like .sy-*, text order control,
-    mirrored characters, etc.
+        if "append_end_char" in rule:
+            parts.append(rule["append_end_char"])
+        if "append_end_attr" in rule:
+            parts.append(el.get(rule["append_end_attr"], ""))
-    :param main_paragraphs: A list of paragraph dictionaries, each with 'attrs'
-                            and 'data' fields representing structured content.
-    :param rules: A dictionary with keys 'orders' and 'rules', parsed from CSS.
-                  - rules['orders']: List of (selector, id) tuples.
-                  - rules['rules']: Nested dict containing transformation rules.
+        s = "".join(parts)
-    :return:
-        - A reconstructed paragraph string with line breaks.
-        - A list of mirrored (reflected) characters for later OCR processing.
-    """
-    orders: list[tuple[str, str]] = rules.get("orders", [])
-    rules = rules.get("rules", {})
-    refl_list: list[str] = []
+        if rule.get("transform_flip_x") and s:
+            refl_list.append(s)
-    def apply_rule(data: dict[str, Any], rule: dict[str, Any]) -> str:
-        if rule.get("delete-all", False):
-            return ""
+        return s
-        curr_str = ""
-        if isinstance(data.get("data"), list) and data["data"]:
-            first_data = data["data"][0]
-            if isinstance(first_data, str):
-                curr_str += first_data
+    for p in tree.findall(".//p"):
+        p_classes = _class_list(p)
+        p_key = next((c for c in p_classes if c.startswith("p")), None)
+        has_ordered_rules = p_key in p_rules
-        if rule.get("delete-first", False):
-            curr_str = "" if len(curr_str) <= 1 else curr_str[1:]
+        buf_parts: list[str] = []
-        curr_str += rule.get("append-end-char", "")
+        if p.text and not has_ordered_rules:
+            buf_parts.append(p.text)
-        attr_name = rule.get("append-end-attr", "")
-        if attr_name:
-            curr_str += data.get("attrs", {}).get(f"{attr_name}{end_number}", "")
+        ordered_cache: dict[str, list[str]] = {}
-        curr_str = rule.get("append-start-char", "") + curr_str
+        for child in p:
+            tag = str(child.tag)
-        attr_name = rule.get("append-start-attr", "")
-        if attr_name:
-            curr_str = (
-                data.get("attrs", {}).get(f"{attr_name}{end_number}", "") + curr_str
-            )
+            # Handle inline <y class="sy-*"> spans
+            if tag == "y" and not has_ordered_rules:
+                y_cls = next(
+                    (c for c in _class_list(child) if c.startswith("sy-")), None
+                )
+                if y_cls and y_cls in sy_rules:
+                    buf_parts.append(_apply_rule(child, sy_rules[y_cls]))
+                else:
+                    buf_parts.append(child.text or "")
+                if child.tail:
+                    buf_parts.append(child.tail)
+                continue
-        if rule.get("transform-x_-1", False):
-            refl_list.append(curr_str)
-        return curr_str
-    paragraphs_str = ""
-    for paragraph in main_paragraphs:
-        class_list = paragraph.get("attrs", {}).get("class", [])
-        p_class_str = next((c for c in class_list if c.startswith("p")), None)
-        curr_datas = paragraph.get("data", [])
-        ordered_cache = {}
-        for data in curr_datas:
-            # 文本节点直接加
-            if isinstance(data, str):
-                paragraphs_str += data
+            # Handle ordered paragraphs: only cache tags that appear in `orders`
+            if p_key and has_ordered_rules and tag in orders:
+                rule = p_rules[p_key].get(tag, {})
+                ordered_cache.setdefault(tag, []).append(_apply_rule(child, rule))
                 continue
-            if isinstance(data, dict):
-                tag = data.get("tag", "")
-                attrs = data.get("attrs", {})
-                # 跳过 span.review
-                if tag == "span" and "class" in attrs and "review" in attrs["class"]:
-                    continue
-                # sy 类型标签处理
-                if tag == "y":
-                    tag_class_list = attrs.get("class", [])
-                    tag_class = next(
-                        (c for c in tag_class_list if c.startswith("sy-")), None
-                    )
-                    if tag_class in rules.get("sy", {}):
-                        curr_rule = rules["sy"][tag_class]
-                        paragraphs_str += apply_rule(data, curr_rule)
-                    continue
-                if not p_class_str:
-                    if any(cls in IGNORED_CLASS_LISTS for cls in class_list):
-                        continue
-                    logger.debug(f"[parser] not find p_class_str: {class_list}")
-                    continue
-                # 普通标签处理，根据 orders 顺序匹配
-                for ord_selector, _ in orders:
-                    tag_name = f"{ord_selector}{end_number}"
-                    if data.get("tag") != tag_name:
-                        continue
-                    curr_rule = rules.get(p_class_str, {}).get(ord_selector)
-                    curr_rule = curr_rule if curr_rule else {}
-                    ordered_cache[ord_selector] = apply_rule(data, curr_rule)
-                    break
-        # 最后按 orders 顺序拼接
-        for ord_selector, _ in orders:
-            if ord_selector in ordered_cache:
-                paragraphs_str += ordered_cache[ord_selector]
-        paragraphs_str += "\n\n"
-    return paragraphs_str, refl_list
-def parse_paragraph_names(rules: dict[str, Any]) -> set[str]:
-    """
-    Extract all paragraph selector names from parsed rules, excluding "sy".
-    """
-    paragraph_names = set()
-    for group, group_rules in rules.get("rules", {}).items():
-        if group == "sy":
-            continue
-        paragraph_names.update(group_rules.keys())
-    return paragraph_names
-def parse_end_number(
-    main_paragraphs: list[dict[str, Any]],
-    rules: dict[str, Any],
-) -> str:
-    """
-    Find the most frequent numeric suffix from tag names
-    matched by given paragraph prefixes.
-    """
-    paragraph_names = parse_paragraph_names(rules)
-    end_numbers: dict[int, int] = {}
-    prefix_hits = 0
-    sorted_names = sorted(paragraph_names, key=len, reverse=True)
-    def rec_parse(item: list[Any] | dict[str, Any]) -> None:
-        nonlocal prefix_hits
-        if isinstance(item, list):
-            for element in item:
-                rec_parse(element)
-        elif isinstance(item, dict):
-            tag = item.get("tag")
-            if isinstance(tag, str):
-                for prefix in sorted_names:
-                    if tag.startswith(prefix):
-                        prefix_hits += 1
-                        remain = tag[len(prefix) :]
-                        if remain.isdigit():
-                            num = int(remain)
-                            end_numbers[num] = end_numbers.get(num, 0) + 1
-                        break
-            for val in item.values():
-                if isinstance(val, (list | dict)):
-                    rec_parse(val)
-    rec_parse(main_paragraphs)
-    if not end_numbers:
-        logger.debug("[Parser] No valid ending numbers found")
-        return ""
-    sorted_numbers = sorted(
-        end_numbers.items(), key=lambda x: (x[1], x[0]), reverse=True
-    )
-    logger.debug(
-        "[Parser] Top 3 end numbers:\n%s",
-        "\n".join(f"{n}: {c}" for n, c in sorted_numbers[:3]),
-    )
-    most_common_number, most_common_count = sorted_numbers[0]
-    if most_common_count <= prefix_hits / 2:
-        logger.debug(
-            "[Parser] Top number (%s) does not exceed 50%% threshold: %d of %d",
-            most_common_number,
-            most_common_count,
-            prefix_hits,
-        )
-        return ""
+            # Non-ordered, non-<y> nodes: include text + tails as-is
+            if not has_ordered_rules:
+                buf_parts.append(child.text or "")
+                if child.tail:
+                    buf_parts.append(child.tail)
-    return str(most_common_number)
+        # If ordered, flush in global orders with all duplicates preserved
+        if has_ordered_rules:
+            for tag in orders:
+                if tag in ordered_cache:
+                    buf_parts.extend(ordered_cache[tag])
+        para = "".join(buf_parts)
+        if para:
+            paragraphs_out.append(para)
-def contains_keywords(paragraphs: list[dict[str, Any]], keywords: set[str]) -> bool:
-    for para in paragraphs:
-        data = para.get("data", [])
-        for item in data:
-            if isinstance(item, str) and any(kw in item for kw in keywords):
-                return True
-    return False
+    return "\n".join(paragraphs_out), refl_list

novel-downloader 1.5.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

novel-downloader 1.5.0__py3-none-any.whl → 2.0.0__py3-none-any.whl