PyPI - novel-downloader - Versions diffs - 1.1.0__py3-none-any.whl - Mend

novel-downloader 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115) hide show

novel_downloader/__init__.py +14 -0
novel_downloader/cli/__init__.py +14 -0
novel_downloader/cli/clean.py +134 -0
novel_downloader/cli/download.py +132 -0
novel_downloader/cli/interactive.py +67 -0
novel_downloader/cli/main.py +45 -0
novel_downloader/cli/settings.py +177 -0
novel_downloader/config/__init__.py +52 -0
novel_downloader/config/adapter.py +153 -0
novel_downloader/config/loader.py +177 -0
novel_downloader/config/models.py +173 -0
novel_downloader/config/site_rules.py +97 -0
novel_downloader/core/__init__.py +25 -0
novel_downloader/core/downloaders/__init__.py +22 -0
novel_downloader/core/downloaders/base_async_downloader.py +157 -0
novel_downloader/core/downloaders/base_downloader.py +187 -0
novel_downloader/core/downloaders/common_asynb_downloader.py +207 -0
novel_downloader/core/downloaders/common_downloader.py +191 -0
novel_downloader/core/downloaders/qidian_downloader.py +208 -0
novel_downloader/core/factory/__init__.py +33 -0
novel_downloader/core/factory/downloader_factory.py +149 -0
novel_downloader/core/factory/parser_factory.py +62 -0
novel_downloader/core/factory/requester_factory.py +106 -0
novel_downloader/core/factory/saver_factory.py +49 -0
novel_downloader/core/interfaces/__init__.py +32 -0
novel_downloader/core/interfaces/async_downloader_protocol.py +37 -0
novel_downloader/core/interfaces/async_requester_protocol.py +68 -0
novel_downloader/core/interfaces/downloader_protocol.py +37 -0
novel_downloader/core/interfaces/parser_protocol.py +40 -0
novel_downloader/core/interfaces/requester_protocol.py +65 -0
novel_downloader/core/interfaces/saver_protocol.py +61 -0
novel_downloader/core/parsers/__init__.py +28 -0
novel_downloader/core/parsers/base_parser.py +96 -0
novel_downloader/core/parsers/common_parser/__init__.py +14 -0
novel_downloader/core/parsers/common_parser/helper.py +321 -0
novel_downloader/core/parsers/common_parser/main_parser.py +86 -0
novel_downloader/core/parsers/qidian_parser/__init__.py +20 -0
novel_downloader/core/parsers/qidian_parser/browser/__init__.py +13 -0
novel_downloader/core/parsers/qidian_parser/browser/chapter_encrypted.py +498 -0
novel_downloader/core/parsers/qidian_parser/browser/chapter_normal.py +97 -0
novel_downloader/core/parsers/qidian_parser/browser/chapter_router.py +70 -0
novel_downloader/core/parsers/qidian_parser/browser/main_parser.py +110 -0
novel_downloader/core/parsers/qidian_parser/session/__init__.py +13 -0
novel_downloader/core/parsers/qidian_parser/session/chapter_encrypted.py +451 -0
novel_downloader/core/parsers/qidian_parser/session/chapter_normal.py +119 -0
novel_downloader/core/parsers/qidian_parser/session/chapter_router.py +67 -0
novel_downloader/core/parsers/qidian_parser/session/main_parser.py +113 -0
novel_downloader/core/parsers/qidian_parser/session/node_decryptor.py +164 -0
novel_downloader/core/parsers/qidian_parser/shared/__init__.py +38 -0
novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +95 -0
novel_downloader/core/parsers/qidian_parser/shared/helpers.py +133 -0
novel_downloader/core/requesters/__init__.py +31 -0
novel_downloader/core/requesters/base_async_session.py +297 -0
novel_downloader/core/requesters/base_browser.py +210 -0
novel_downloader/core/requesters/base_session.py +243 -0
novel_downloader/core/requesters/common_requester/__init__.py +18 -0
novel_downloader/core/requesters/common_requester/common_async_session.py +96 -0
novel_downloader/core/requesters/common_requester/common_session.py +126 -0
novel_downloader/core/requesters/qidian_requester/__init__.py +22 -0
novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +377 -0
novel_downloader/core/requesters/qidian_requester/qidian_session.py +202 -0
novel_downloader/core/savers/__init__.py +20 -0
novel_downloader/core/savers/base_saver.py +169 -0
novel_downloader/core/savers/common_saver/__init__.py +13 -0
novel_downloader/core/savers/common_saver/common_epub.py +232 -0
novel_downloader/core/savers/common_saver/common_txt.py +176 -0
novel_downloader/core/savers/common_saver/main_saver.py +86 -0
novel_downloader/core/savers/epub_utils/__init__.py +27 -0
novel_downloader/core/savers/epub_utils/css_builder.py +68 -0
novel_downloader/core/savers/epub_utils/initializer.py +98 -0
novel_downloader/core/savers/epub_utils/text_to_html.py +132 -0
novel_downloader/core/savers/epub_utils/volume_intro.py +61 -0
novel_downloader/core/savers/qidian_saver.py +22 -0
novel_downloader/locales/en.json +91 -0
novel_downloader/locales/zh.json +91 -0
novel_downloader/resources/config/rules.toml +196 -0
novel_downloader/resources/config/settings.yaml +73 -0
novel_downloader/resources/css_styles/main.css +104 -0
novel_downloader/resources/css_styles/volume-intro.css +56 -0
novel_downloader/resources/images/volume_border.png +0 -0
novel_downloader/resources/js_scripts/qidian_decrypt_node.js +82 -0
novel_downloader/resources/json/replace_word_map.json +4 -0
novel_downloader/resources/text/blacklist.txt +22 -0
novel_downloader/utils/__init__.py +0 -0
novel_downloader/utils/cache.py +24 -0
novel_downloader/utils/constants.py +158 -0
novel_downloader/utils/crypto_utils.py +144 -0
novel_downloader/utils/file_utils/__init__.py +43 -0
novel_downloader/utils/file_utils/io.py +252 -0
novel_downloader/utils/file_utils/normalize.py +68 -0
novel_downloader/utils/file_utils/sanitize.py +77 -0
novel_downloader/utils/fontocr/__init__.py +23 -0
novel_downloader/utils/fontocr/ocr_v1.py +304 -0
novel_downloader/utils/fontocr/ocr_v2.py +658 -0
novel_downloader/utils/hash_store.py +288 -0
novel_downloader/utils/hash_utils.py +103 -0
novel_downloader/utils/i18n.py +41 -0
novel_downloader/utils/logger.py +104 -0
novel_downloader/utils/model_loader.py +72 -0
novel_downloader/utils/network.py +287 -0
novel_downloader/utils/state.py +156 -0
novel_downloader/utils/text_utils/__init__.py +27 -0
novel_downloader/utils/text_utils/chapter_formatting.py +46 -0
novel_downloader/utils/text_utils/diff_display.py +75 -0
novel_downloader/utils/text_utils/font_mapping.py +31 -0
novel_downloader/utils/text_utils/text_cleaning.py +57 -0
novel_downloader/utils/time_utils/__init__.py +22 -0
novel_downloader/utils/time_utils/datetime_utils.py +146 -0
novel_downloader/utils/time_utils/sleep_utils.py +49 -0
novel_downloader-1.1.0.dist-info/METADATA +157 -0
novel_downloader-1.1.0.dist-info/RECORD +115 -0
novel_downloader-1.1.0.dist-info/WHEEL +5 -0
novel_downloader-1.1.0.dist-info/entry_points.txt +2 -0
novel_downloader-1.1.0.dist-info/licenses/LICENSE +21 -0
novel_downloader-1.1.0.dist-info/top_level.txt +1 -0

novel_downloader/core/parsers/qidian_parser/browser/main_parser.py ADDED Viewed

@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+novel_downloader.core.parsers.qidian_parser.browser.main_parser
+---------------------------------------------------------------
+Main parser class for handling Qidian chapters rendered via a browser environment.
+This module defines `QidianBrowserParser`, a parser implementation that supports
+content extracted from dynamically rendered Qidian HTML pages.
+"""
+from __future__ import annotations
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, Optional
+from novel_downloader.config.models import ParserConfig
+from novel_downloader.core.parsers.base_parser import BaseParser
+from ..shared import (
+    is_encrypted,
+    parse_book_info,
+)
+from .chapter_router import parse_chapter
+if TYPE_CHECKING:
+    from novel_downloader.utils.fontocr import FontOCR
+class QidianBrowserParser(BaseParser):
+    """
+    Parser for Qidian site using a browser-rendered HTML workflow.
+    """
+    def __init__(self, config: ParserConfig):
+        """
+        Initialize the QidianBrowserParser with the given configuration.
+        :param config: ParserConfig object controlling:
+        """
+        super().__init__(config)
+        # Extract and store parser flags from config
+        self._decode_font: bool = config.decode_font
+        self._save_font_debug: bool = config.save_font_debug
+        self._fixed_font_dir: Path = self._base_cache_dir / "fixed_fonts"
+        self._fixed_font_dir.mkdir(parents=True, exist_ok=True)
+        self._font_debug_dir: Optional[Path] = None
+        self._font_ocr: Optional[FontOCR] = None
+        if self._decode_font:
+            from novel_downloader.utils.fontocr import FontOCR
+            self._font_ocr = FontOCR(
+                cache_dir=self._base_cache_dir,
+                use_freq=config.use_freq,
+                ocr_version=config.ocr_version,
+                use_ocr=config.use_ocr,
+                use_vec=config.use_vec,
+                batch_size=config.batch_size,
+                ocr_weight=config.ocr_weight,
+                vec_weight=config.vec_weight,
+                font_debug=config.save_font_debug,
+            )
+            self._font_debug_dir = self._base_cache_dir / "font_debug"
+            self._font_debug_dir.mkdir(parents=True, exist_ok=True)
+    def parse_book_info(self, html: str) -> Dict[str, Any]:
+        """
+        Parse a book info page and extract metadata and chapter structure.
+        :param html: Raw HTML of the book info page.
+        :return: Parsed metadata and chapter structure as a dictionary.
+        """
+        return parse_book_info(html)
+    def parse_chapter(self, html_str: str, chapter_id: str) -> Dict[str, Any]:
+        """
+        :param html: Raw HTML of the chapter page.
+        :param chapter_id: Identifier of the chapter being parsed.
+        :return: Cleaned chapter content as plain text.
+        """
+        return parse_chapter(self, html_str, chapter_id)
+    def is_encrypted(self, html_str: str) -> bool:
+        """
+        Return True if content is encrypted.
+        :param html: Raw HTML of the chapter page.
+        """
+        return is_encrypted(html_str)
+    def _init_cache_folders(self) -> None:
+        """
+        Prepare cache folders for plain/encrypted HTML and font debug data.
+        Folders are only created if corresponding debug/save flags are enabled.
+        """
+        base = self._base_cache_dir
+        # Font debug folder
+        if self._save_font_debug and self.book_id:
+            self._font_debug_dir = base / self.book_id / "font_debug"
+            self._font_debug_dir.mkdir(parents=True, exist_ok=True)
+        else:
+            self._font_debug_dir = None
+    def _on_book_id_set(self) -> None:
+        self._init_cache_folders()

novel_downloader/core/parsers/qidian_parser/session/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+novel_downloader.core.parsers.qidian_parser.session
+---------------------------------------------------------------
+This package provides parsing components for handling Qidian
+pages that have been rendered by a session.
+"""
+from .main_parser import QidianSessionParser
+__all__ = ["QidianSessionParser"]

novel_downloader/core/parsers/qidian_parser/session/chapter_encrypted.py ADDED Viewed

@@ -0,0 +1,451 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+novel_downloader.core.parsers.qidian_parser.session.chapter_encrypted
+---------------------------------------------------------------------
+Support for parsing encrypted chapters from Qidian using font OCR mapping,
+CSS rules, and custom rendering logic.
+Includes:
+- Font downloading and caching
+- Encrypted paragraph extraction
+- Custom CSS parsing and layout restoration
+- Font-based OCR decryption and mapping
+"""
+from __future__ import annotations
+import json
+import logging
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+import tinycss2
+from bs4 import BeautifulSoup, Tag
+from novel_downloader.utils.network import download_font_file
+from novel_downloader.utils.text_utils import apply_font_mapping
+from ..shared import (
+    extract_chapter_info,
+    find_ssr_page_context,
+    html_to_soup,
+    vip_status,
+)
+from .node_decryptor import QidianNodeDecryptor
+if TYPE_CHECKING:
+    from .main_parser import QidianSessionParser
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# Paragraph classes that render_paragraphs() skips without a debug log entry
+# when the paragraph carries no class starting with "p".
+IGNORED_CLASS_LISTS = {"title", "review"}
+# Shared decryptor instance; stays None until _get_decryptor() first creates it.
+_decryptor: Optional[QidianNodeDecryptor] = None
+def _get_decryptor() -> QidianNodeDecryptor:
+    """
+    Return the singleton QidianNodeDecryptor, initializing it on first use.
+    """
+    global _decryptor
+    if _decryptor is None:
+        _decryptor = QidianNodeDecryptor()
+    return _decryptor
+def parse_encrypted_chapter(
+    parser: QidianSessionParser,
+    soup: BeautifulSoup,
+    chapter_id: str,
+    fuid: str,
+) -> Dict[str, Any]:
+    """
+    Extract and return the formatted textual content of an encrypted chapter.
+    Steps:
+    1. Load SSR JSON context for CSS, fonts, and metadata.
+    3. Decode and save randomFont bytes; download fixedFont via download_font().
+    4. Extract paragraph structures and save debug JSON.
+    5. Parse CSS rules and save debug JSON.
+    6. Render encrypted paragraphs, then run OCR font-mapping.
+    7. Extracts paragraph texts and formats them.
+    :param html_str: Raw HTML content of the chapter page.
+    :return: Formatted chapter text or empty string if not parsable.
+    """
+    try:
+        if not (parser._decode_font and parser._font_ocr):
+            return {}
+        ssr_data = find_ssr_page_context(soup)
+        chapter_info = extract_chapter_info(ssr_data)
+        if not chapter_info:
+            logger.warning(
+                "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
+            )
+            return {}
+        debug_base_dir: Optional[Path] = None
+        if parser._font_debug_dir:
+            debug_base_dir = parser._font_debug_dir / chapter_id
+            debug_base_dir.mkdir(parents=True, exist_ok=True)
+        css_str = chapter_info["css"]
+        randomFont_str = chapter_info["randomFont"]
+        fixedFontWoff2_url = chapter_info["fixedFontWoff2"]
+        title = chapter_info.get("chapterName", "Untitled")
+        raw_html = chapter_info.get("content", "")
+        chapter_id = chapter_info.get("chapterId", "")
+        fkp = chapter_info.get("fkp", "")
+        author_say = chapter_info.get("authorSay", "")
+        update_time = chapter_info.get("updateTime", "")
+        update_timestamp = chapter_info.get("updateTimestamp", 0)
+        modify_time = chapter_info.get("modifyTime", 0)
+        word_count = chapter_info.get("wordsCount", 0)
+        vip = bool(chapter_info.get("vipStatus", 0))
+        is_buy = bool(chapter_info.get("isBuy", 0))
+        seq = chapter_info.get("seq", None)
+        order = chapter_info.get("chapterOrder", None)
+        volume = chapter_info.get("extra", {}).get("volumeName", "")
+        if not raw_html:
+            logger.warning("[Parser] raw_html not found for chapter '%s'", chapter_id)
+            return {}
+        # extract + save font
+        rf = json.loads(randomFont_str)
+        rand_path = parser._base_cache_dir / "randomFont.ttf"
+        rand_path.parent.mkdir(parents=True, exist_ok=True)
+        rand_path.write_bytes(bytes(rf["data"]))
+        fixed_path = download_font_file(
+            url=fixedFontWoff2_url, target_folder=parser._fixed_font_dir
+        )
+        if fixed_path is None:
+            raise ValueError("fixed_path is None: failed to download font")
+        # Extract and render paragraphs from HTML with CSS rules
+        if vip_status(soup):
+            try:
+                decryptor = _get_decryptor()
+                raw_html = decryptor.decrypt(
+                    raw_html,
+                    chapter_id,
+                    fkp,
+                    fuid,
+                )
+            except Exception as e:
+                logger.error("[Parser] decryption failed for '%s': %s", chapter_id, e)
+                return {}
+        main_paragraphs = extract_paragraphs_recursively(html_to_soup(raw_html))
+        if debug_base_dir:
+            main_paragraphs_path = debug_base_dir / "main_paragraphs_debug.json"
+            main_paragraphs_path.write_text(
+                json.dumps(main_paragraphs, ensure_ascii=False, indent=2),
+                encoding="utf-8",
+            )
+        paragraphs_rules = parse_rule(css_str)
+        if debug_base_dir:
+            paragraphs_rules_path = debug_base_dir / "paragraphs_rules_debug.json"
+            paragraphs_rules_path.write_text(
+                json.dumps(paragraphs_rules, ensure_ascii=False, indent=2),
+                encoding="utf-8",
+            )
+        paragraphs_str, refl_list = render_paragraphs(main_paragraphs, paragraphs_rules)
+        if debug_base_dir:
+            paragraphs_str_path = debug_base_dir / f"{chapter_id}_debug.txt"
+            paragraphs_str_path.write_text(paragraphs_str, encoding="utf-8")
+        # Run OCR + fallback mapping
+        char_set = set(c for c in paragraphs_str if c not in {" ", "\n", "\u3000"})
+        refl_set = set(refl_list)
+        char_set = char_set - refl_set
+        if debug_base_dir:
+            char_sets_path = debug_base_dir / "char_set_debug.txt"
+            temp = f"char_set:\n{char_set}\n\nrefl_set:\n{refl_set}"
+            char_sets_path.write_text(
+                temp,
+                encoding="utf-8",
+            )
+        mapping_result = parser._font_ocr.generate_font_map(
+            fixed_font_path=fixed_path,
+            random_font_path=rand_path,
+            char_set=char_set,
+            refl_set=refl_set,
+            chapter_id=chapter_id,
+        )
+        if debug_base_dir:
+            mapping_json_path = debug_base_dir / "font_mapping.json"
+            mapping_json_path.write_text(
+                json.dumps(mapping_result, ensure_ascii=False, indent=2),
+                encoding="utf-8",
+            )
+        # Reconstruct final readable text
+        original_text = apply_font_mapping(paragraphs_str, mapping_result)
+        final_paragraphs_str = "\n\n".join(
+            line.strip() for line in original_text.splitlines() if line.strip()
+        )
+        chapter_info = {
+            "id": str(chapter_id),
+            "title": title,
+            "content": final_paragraphs_str,
+            "author_say": author_say.strip() if author_say else "",
+            "updated_at": update_time,
+            "update_timestamp": update_timestamp,
+            "modify_time": modify_time,
+            "word_count": word_count,
+            "vip": vip,
+            "purchased": is_buy,
+            "order": order,
+            "seq": seq,
+            "volume": volume,
+        }
+        return chapter_info
+    except Exception as e:
+        logger.warning(
+            "[Parser] parse error for encrypted chapter '%s': %s", chapter_id, e
+        )
+    return {}
+def extract_paragraphs_recursively(
+    soup: BeautifulSoup, chapter_id: int = -1
+) -> List[Dict[str, Any]]:
+    """
+    Extracts paragraph elements under <main id="c-{chapter_id}"> from HTML
+    and converts them to a nested data structure for further processing.
+    :param html_str: Full HTML content.
+    :param chapter_id: ID used to locate <main id="c-{chapter_id}">.
+    :return list: List of parsed <p> paragraph data.
+    """
+    def parse_element(elem: Any) -> Union[Dict[str, Any], None]:
+        if not isinstance(elem, Tag):
+            return None
+        result = {"tag": elem.name, "attrs": dict(elem.attrs), "data": []}
+        for child in elem.contents:
+            if isinstance(child, Tag):
+                parsed = parse_element(child)
+                if parsed:
+                    result["data"].append(parsed)
+            else:
+                text = child
+                if text:
+                    result["data"].append(text)
+        return result
+    if chapter_id > 0:
+        main_id = f"c-{chapter_id}"
+        main_tag = soup.find("main", id=main_id)
+        if not main_tag:
+            return []
+    else:
+        main_tag = soup
+    result = []
+    for p in main_tag.find_all("p"):
+        parsed_p = parse_element(p)
+        if parsed_p:
+            result.append(parsed_p)
+    return result
+def parse_rule(css_str: str) -> Dict[str, Any]:
+    """
+    Parse a CSS string and extract style rules for rendering.
+    Handles:
+    - font-size:0 (mark for deletion)
+    - scaleX(-1) (mark as mirrored)
+    - ::before / ::after with content or attr()
+    - class + tag selector mapping
+    - custom rendering order via 'order'
+    :param css_str: Raw CSS stylesheet string.
+    :return: Dict with "rules" and "orders" for rendering.
+    """
+    rules: Dict[str, Any] = {}
+    orders = []
+    stylesheet = tinycss2.parse_stylesheet(
+        css_str, skip_comments=True, skip_whitespace=True
+    )
+    for rule in stylesheet:
+        if rule.type != "qualified-rule":
+            continue
+        selector = tinycss2.serialize(rule.prelude).strip()
+        declarations = tinycss2.parse_declaration_list(rule.content)
+        parsed = {}
+        order_val = None
+        for decl in declarations:
+            if decl.type != "declaration":
+                continue
+            name = decl.lower_name
+            value = tinycss2.serialize(decl.value).strip()
+            if name == "font-size" and value == "0":
+                if "::first-letter" in selector:
+                    parsed["delete-first"] = True
+                else:
+                    parsed["delete-all"] = True
+            elif name == "transform" and value.lower() == "scalex(-1)":
+                parsed["transform-x_-1"] = True
+            elif name == "order":
+                order_val = value
+            elif name == "content":
+                if "::after" in selector:
+                    if "attr(" in value:
+                        parsed["append-end-attr"] = value.split("attr(")[1].split(")")[
+                            0
+                        ]
+                    else:
+                        parsed["append-end-char"] = value.strip("\"'")
+                elif "::before" in selector:
+                    if "attr(" in value:
+                        parsed["append-start-attr"] = value.split("attr(")[1].split(
+                            ")"
+                        )[0]
+                    else:
+                        parsed["append-start-char"] = value.strip("\"'")
+        # Store in structure
+        if selector.startswith(".sy-"):
+            rules.setdefault("sy", {})[selector[1:]] = parsed
+        elif selector.startswith(".p") and " " in selector:
+            class_str, tag_part = selector.split(" ", 1)
+            class_str = class_str.lstrip(".")
+            tag_part = tag_part.split("::")[0]
+            rules.setdefault(class_str, {}).setdefault(tag_part, {}).update(parsed)
+        if order_val:
+            orders.append((selector, order_val))
+    orders.sort(key=lambda x: int(x[1]))
+    return {"rules": rules, "orders": orders}
+def render_paragraphs(
+    main_paragraphs: List[Dict[str, Any]], rules: Dict[str, Any]
+) -> Tuple[str, List[str]]:
+    """
+    Applies the parsed CSS rules to the paragraph structure and
+    reconstructs the visible text.
+    Handles special class styles like .sy-*, text order control,
+    mirrored characters, etc.
+    :param main_paragraphs: A list of paragraph dictionaries, each with 'attrs'
+                            and 'data' fields representing structured content.
+    :param rules: A dictionary with keys 'orders' and 'rules', parsed from CSS.
+                  - rules['orders']: List of (selector, id) tuples.
+                  - rules['rules']: Nested dict containing transformation rules.
+    :return:
+        - A reconstructed paragraph string with line breaks.
+        - A list of mirrored (reflected) characters for later OCR processing.
+    """
+    orders: List[Tuple[str, str]] = rules.get("orders", [])
+    rules = rules.get("rules", {})
+    refl_list: List[str] = []
+    def apply_rule(data: Dict[str, Any], rule: Dict[str, Any]) -> str:
+        if rule.get("delete-all", False):
+            return ""
+        curr_str = ""
+        if isinstance(data.get("data"), list) and data["data"]:
+            first_data = data["data"][0]
+            if isinstance(first_data, str):
+                curr_str += first_data
+        if rule.get("delete-first", False):
+            if len(curr_str) <= 1:
+                curr_str = ""
+            else:
+                curr_str = curr_str[1:]
+        curr_str += rule.get("append-end-char", "")
+        attr_name = rule.get("append-end-attr", "")
+        if attr_name:
+            curr_str += data.get("attrs", {}).get(attr_name, "")
+        curr_str = rule.get("append-start-char", "") + curr_str
+        attr_name = rule.get("append-start-attr", "")
+        if attr_name:
+            curr_str = data.get("attrs", {}).get(attr_name, "") + curr_str
+        if rule.get("transform-x_-1", False):
+            refl_list.append(curr_str)
+        return curr_str
+    paragraphs_str = ""
+    for paragraph in main_paragraphs:
+        class_list = paragraph.get("attrs", {}).get("class", [])
+        p_class_str = next((c for c in class_list if c.startswith("p")), None)
+        curr_datas = paragraph.get("data", [])
+        ordered_cache = {}
+        for data in curr_datas:
+            # 文本节点直接加
+            if isinstance(data, str):
+                paragraphs_str += data
+                continue
+            if isinstance(data, dict):
+                tag = data.get("tag", "")
+                attrs = data.get("attrs", {})
+                # 跳过 span.review
+                if tag == "span" and "class" in attrs and "review" in attrs["class"]:
+                    continue
+                # sy 类型标签处理
+                if tag == "y":
+                    tag_class_list = attrs.get("class", [])
+                    tag_class = next(
+                        (c for c in tag_class_list if c.startswith("sy-")), None
+                    )
+                    if tag_class in rules.get("sy", {}):
+                        curr_rule = rules["sy"][tag_class]
+                        paragraphs_str += apply_rule(data, curr_rule)
+                    continue
+                if not p_class_str:
+                    if any(cls in IGNORED_CLASS_LISTS for cls in class_list):
+                        continue
+                    logger.debug(f"[parser] not find p_class_str: {class_list}")
+                    continue
+                # 普通标签处理，根据 orders 顺序匹配
+                for ord_selector, ord_id in orders:
+                    tag_name = f"{ord_selector}"
+                    if data.get("tag") != tag_name:
+                        continue
+                    curr_rule = rules.get(p_class_str, {}).get(ord_selector)
+                    curr_rule = curr_rule if curr_rule else {}
+                    ordered_cache[ord_selector] = apply_rule(data, curr_rule)
+                    break
+        # 最后按 orders 顺序拼接
+        for ord_selector, ord_id in orders:
+            if ord_selector in ordered_cache:
+                paragraphs_str += ordered_cache[ord_selector]
+        paragraphs_str += "\n\n"
+    return paragraphs_str, refl_list

novel_downloader/core/parsers/qidian_parser/session/chapter_normal.py ADDED Viewed

@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+novel_downloader.core.parsers.qidian_parser.session.chapter_normal
+------------------------------------------------------------------
+Provides `parse_normal_chapter`, which will:
+  1. Extract SSR context from a “normal” (non-VIP) chapter page and format it.
+  2. Detect VIP/encrypted chapters and fall back to Node-based decryption
+     via `QidianNodeDecryptor`.
+"""
+import logging
+from typing import Any, Dict, Optional
+from bs4 import BeautifulSoup
+from ..shared import (
+    extract_chapter_info,
+    find_ssr_page_context,
+    html_to_soup,
+    vip_status,
+)
+from .node_decryptor import QidianNodeDecryptor
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# Shared decryptor instance; stays None until _get_decryptor() first creates it.
+_decryptor: Optional[QidianNodeDecryptor] = None
+def _get_decryptor() -> QidianNodeDecryptor:
+    """
+    Return the singleton QidianNodeDecryptor, initializing it on first use.
+    """
+    global _decryptor
+    if _decryptor is None:
+        _decryptor = QidianNodeDecryptor()
+    return _decryptor
+def parse_normal_chapter(
+    soup: BeautifulSoup,
+    chapter_id: str,
+    fuid: str,
+) -> Dict[str, Any]:
+    """
+    Extract structured chapter info from a normal Qidian page.
+    :param soup:      A BeautifulSoup of the chapter HTML.
+    :param chapter_id: Chapter identifier (string).
+    :param fuid:      Fock user ID parameter from the page.
+    :return: a dictionary with keys like 'id', 'title', 'content', etc.
+    """
+    try:
+        ssr_data = find_ssr_page_context(soup)
+        chapter_info = extract_chapter_info(ssr_data)
+        if not chapter_info:
+            logger.warning(
+                "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
+            )
+            return {}
+        title = chapter_info.get("chapterName", "Untitled")
+        raw_html = chapter_info.get("content", "")
+        chapter_id = chapter_info.get("chapterId", "")
+        fkp = chapter_info.get("fkp", "")
+        author_say = chapter_info.get("authorSay", "")
+        update_time = chapter_info.get("updateTime", "")
+        update_timestamp = chapter_info.get("updateTimestamp", 0)
+        modify_time = chapter_info.get("modifyTime", 0)
+        word_count = chapter_info.get("wordsCount", 0)
+        vip = bool(chapter_info.get("vipStatus", 0))
+        is_buy = bool(chapter_info.get("isBuy", 0))
+        seq = chapter_info.get("seq", None)
+        order = chapter_info.get("chapterOrder", None)
+        volume = chapter_info.get("extra", {}).get("volumeName", "")
+        if not raw_html:
+            logger.warning("[Parser] raw_html not found for chapter '%s'", chapter_id)
+            return {}
+        if vip_status(soup):
+            try:
+                decryptor = _get_decryptor()
+                raw_html = decryptor.decrypt(
+                    raw_html,
+                    chapter_id,
+                    fkp,
+                    fuid,
+                )
+            except Exception as e:
+                logger.error("[Parser] decryption failed for '%s': %s", chapter_id, e)
+                return {}
+        paras_soup = html_to_soup(raw_html)
+        paras = [p.get_text(strip=True) for p in paras_soup.find_all("p")]
+        chapter_text = "\n\n".join(paras)
+        return {
+            "id": str(chapter_id),
+            "title": title,
+            "content": chapter_text,
+            "author_say": author_say.strip() if author_say else "",
+            "updated_at": update_time,
+            "update_timestamp": update_timestamp,
+            "modify_time": modify_time,
+            "word_count": word_count,
+            "vip": vip,
+            "purchased": is_buy,
+            "order": order,
+            "seq": seq,
+            "volume": volume,
+        }
+    except Exception as e:
+        logger.warning(
+            "[Parser] parse error for normal chapter '%s': %s", chapter_id, e
+        )
+        return {}