PyPI - novel-downloader - Versions diffs - 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl - Mend

novel-downloader 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (129) hide show

novel_downloader/core/parsers/{common_parser → common}/helper.py RENAMED Viewed

@@ -1,15 +1,15 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
-novel_downloader.core.parsers.common_parser.helpers
----------------------------------------------------
+novel_downloader.core.parsers.common.helpers
+--------------------------------------------
 Shared utility functions for parsing Common pages.
 """
 import logging
 import re
-from typing import Any, Dict, Iterable, Iterator, List, Optional, cast
+from collections.abc import Iterable, Iterator
+from typing import Any, cast
 from bs4 import BeautifulSoup, Tag
@@ -47,7 +47,7 @@ class HTMLExtractor:
         self._html = html
         self._soup = html_to_soup(html)
-    def extract_book_info(self, rules: BookInfoRules) -> Dict[str, Any]:
+    def extract_book_info(self, rules: BookInfoRules) -> dict[str, Any]:
         """
         Extract structured book information from HTML according to the given rules.
@@ -56,7 +56,7 @@ class HTMLExtractor:
         :param rules: Extraction configuration specifying how to extract.
         :return: A dictionary containing extracted book information.
         """
-        book_info: Dict[str, Any] = {}
+        book_info: dict[str, Any] = {}
         for field_name, field_rules in rules.items():
             if field_rules is None:
@@ -72,7 +72,7 @@ class HTMLExtractor:
         return book_info
-    def extract_field(self, steps: List[RuleStep]) -> str:
+    def extract_field(self, steps: list[RuleStep]) -> str:
         """
         Execute a list of extraction steps on the given HTML.
@@ -188,7 +188,7 @@ class HTMLExtractor:
                     current = sep.join(current)
             elif t == "attr":
-                name = step.get("attr")
+                name = step.get("attr") or ""
                 if isinstance(current, list):
                     current = [elem.get(name, "") for elem in current]
                 elif isinstance(current, Tag):
@@ -209,16 +209,16 @@ class HTMLExtractor:
             return str(current.get_text().strip())
         return str(current or "").strip()
-    def extract_mixed_volumes(self, volume_rule: VolumesRules) -> List[Dict[str, Any]]:
+    def extract_mixed_volumes(self, volume_rule: VolumesRules) -> list[dict[str, Any]]:
         """
         Special mode: mixed <volume> and <chapter> under same parent.
         (e.g., dt / dd pattern in BiQuGe)
         """
         list_selector = volume_rule.get("list_selector")
         volume_selector = volume_rule.get("volume_selector")
-        chapter_selector = volume_rule.get("chapter_selector")
         volume_name_steps = volume_rule.get("volume_name_steps")
-        chapter_steps_list = volume_rule.get("chapter_steps")
+        chapter_selector = volume_rule["chapter_selector"]
+        chapter_steps_list = volume_rule["chapter_steps"]
         if not (
             list_selector and volume_selector and chapter_selector and volume_name_steps
@@ -228,8 +228,8 @@ class HTMLExtractor:
                 "chapter_selector 和 volume_name_steps"
             )
-        volumes: List[Dict[str, Any]] = []
-        current_volume: Optional[Dict[str, Any]] = None
+        volumes: list[dict[str, Any]] = []
+        current_volume: dict[str, Any] | None = None
         if not chapter_steps_list:
             chapter_steps_list = []
         chapter_info_steps = {item["key"]: item["steps"] for item in chapter_steps_list}
@@ -241,6 +241,8 @@ class HTMLExtractor:
         for elem in list_area.find_all(
             [volume_selector, chapter_selector], recursive=True
         ):
+            if not isinstance(elem, Tag):
+                continue
             if elem.name == volume_selector:
                 extractor = HTMLExtractor(str(elem))
                 volume_name = extractor.extract_field(volume_name_steps)
@@ -256,10 +258,10 @@ class HTMLExtractor:
         return volumes
-    def extract_volume_blocks(self, volume_rule: VolumesRules) -> List[Dict[str, Any]]:
-        volume_selector = volume_rule["volume_selector"]
+    def extract_volume_blocks(self, volume_rule: VolumesRules) -> list[dict[str, Any]]:
+        volume_selector = volume_rule.get("volume_selector")
+        volume_name_steps = volume_rule.get("volume_name_steps")
         chapter_selector = volume_rule["chapter_selector"]
-        volume_name_steps = volume_rule["volume_name_steps"]
         chapter_steps_list = volume_rule["chapter_steps"]
         if not (volume_selector and volume_name_steps):
             raise ValueError(
@@ -283,7 +285,7 @@ class HTMLExtractor:
         return volumes
-    def extract_flat_chapters(self, volume_rule: VolumesRules) -> List[Dict[str, Any]]:
+    def extract_flat_chapters(self, volume_rule: VolumesRules) -> list[dict[str, Any]]:
         chapter_selector = volume_rule["chapter_selector"]
         chapter_steps_list = volume_rule["chapter_steps"]
         volume_selector = volume_rule.get("volume_selector")
@@ -310,7 +312,7 @@ class HTMLExtractor:
     def extract_volumes_structure(
         self, volume_rule: VolumesRules
-    ) -> List[Dict[str, Any]]:
+    ) -> list[dict[str, Any]]:
         volume_mode = volume_rule.get("volume_mode", "normal")
         if volume_mode == "mixed":
             return self.extract_mixed_volumes(volume_rule)

novel_downloader/core/parsers/{common_parser → common}/main_parser.py RENAMED Viewed

@@ -1,18 +1,18 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
-novel_downloader.core.parsers.common_parser.main_parser
--------------------------------------------------------
+novel_downloader.core.parsers.common.main_parser
+------------------------------------------------
 This package provides parsing components for handling
 Common pages.
 """
-from typing import Any, Dict
+from typing import Any
 from novel_downloader.config import ParserConfig, SiteRules
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.utils.chapter_storage import ChapterDict
-from ..base_parser import BaseParser
 from .helper import HTMLExtractor
@@ -35,7 +35,7 @@ class CommonParser(BaseParser):
         self._site = site
         self._site_rule = site_rule
-    def parse_book_info(self, html_str: str) -> Dict[str, Any]:
+    def parse_book_info(self, html_str: str) -> dict[str, Any]:
         """
         Parse a book info page and extract metadata and chapter structure.
@@ -46,7 +46,11 @@ class CommonParser(BaseParser):
         rules = self._site_rule["book_info"]
         return extractor.extract_book_info(rules)
-    def parse_chapter(self, html_str: str, chapter_id: str) -> Dict[str, Any]:
+    def parse_chapter(
+        self,
+        html_str: str,
+        chapter_id: str,
+    ) -> ChapterDict | None:
         """
         Parse a single chapter page and extract clean text or simplified HTML.
@@ -66,13 +70,15 @@ class CommonParser(BaseParser):
         title = extractor.extract_field(title_steps["steps"]) if title_steps else ""
         content = extractor.extract_field(content_steps["steps"])
         if not content:
-            return {}
+            return None
         return {
             "id": chapter_id,
             "title": title or "Untitled",
             "content": content,
-            "site": self._site,
+            "extra": {
+                "site": self._site,
+            },
         }
     @property

novel_downloader/core/parsers/{qidian_parser → qidian}/__init__.py RENAMED Viewed

@@ -1,8 +1,7 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
-novel_downloader.core.parsers.qidian_parser
--------------------------------------------
+novel_downloader.core.parsers.qidian
+------------------------------------
 This package provides parsing implementations for the Qidian platform.

novel_downloader/core/parsers/{qidian_parser → qidian}/browser/__init__.py RENAMED Viewed

@@ -1,8 +1,7 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
-novel_downloader.core.parsers.qidian_parser.browser
----------------------------------------------------
+novel_downloader.core.parsers.qidian.browser
+--------------------------------------------
 This package provides parsing components for handling Qidian
 pages that have been rendered by a browser engine.

novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_encrypted.py RENAMED Viewed

@@ -1,8 +1,7 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
-novel_downloader.core.parsers.qidian_parser.browser.chapter_encrypted
----------------------------------------------------------------------
+novel_downloader.core.parsers.qidian.browser.chapter_encrypted
+--------------------------------------------------------------
 Support for parsing encrypted chapters from Qidian using font OCR mapping,
 CSS rules, and custom rendering logic.
@@ -19,11 +18,12 @@ from __future__ import annotations
 import json
 import logging
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union
+from typing import TYPE_CHECKING, Any
 import tinycss2
 from bs4 import BeautifulSoup, Tag
+from novel_downloader.utils.chapter_storage import ChapterDict
 from novel_downloader.utils.network import download_font_file
 from novel_downloader.utils.text_utils import apply_font_mapping
@@ -43,7 +43,7 @@ def parse_encrypted_chapter(
     parser: QidianBrowserParser,
     soup: BeautifulSoup,
     chapter_id: str,
-) -> Dict[str, Any]:
+) -> ChapterDict | None:
     """
     Extract and return the formatted textual content of an encrypted chapter.
@@ -61,15 +61,15 @@ def parse_encrypted_chapter(
     """
     try:
         if not (parser._decode_font and parser._font_ocr):
-            return {}
+            return None
         ssr_data = find_ssr_page_context(soup)
         chapter_info = extract_chapter_info(ssr_data)
         if not chapter_info:
             logger.warning(
                 "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
             )
-            return {}
-        debug_base_dir: Optional[Path] = None
+            return None
+        debug_base_dir: Path | None = None
         if parser._font_debug_dir:
             debug_base_dir = parser._font_debug_dir / chapter_id
             debug_base_dir.mkdir(parents=True, exist_ok=True)
@@ -85,10 +85,7 @@ def parse_encrypted_chapter(
         update_timestamp = chapter_info.get("updateTimestamp", 0)
         modify_time = chapter_info.get("modifyTime", 0)
         word_count = chapter_info.get("wordsCount", 0)
-        vip = bool(chapter_info.get("vipStatus", 0))
-        is_buy = bool(chapter_info.get("isBuy", 0))
         seq = chapter_info.get("seq", None)
-        order = chapter_info.get("chapterOrder", None)
         volume = chapter_info.get("extra", {}).get("volumeName", "")
         # extract + save font
@@ -133,7 +130,7 @@ def parse_encrypted_chapter(
             logger.warning(
                 f"[Parser] No end_number found after parsing chapter '{chapter_id}'"
             )
-            return {}
+            return None
         paragraphs_str, refl_list = render_paragraphs(
             main_paragraphs, paragraphs_rules, end_number
@@ -143,7 +140,7 @@ def parse_encrypted_chapter(
             paragraphs_str_path.write_text(paragraphs_str, encoding="utf-8")
         # Run OCR + fallback mapping
-        char_set = set(c for c in paragraphs_str if c not in {" ", "\n", "\u3000"})
+        char_set = {c for c in paragraphs_str if c not in {" ", "\n", "\u3000"}}
         refl_set = set(refl_list)
         char_set = char_set - refl_set
         if debug_base_dir:
@@ -174,33 +171,31 @@ def parse_encrypted_chapter(
         final_paragraphs_str = "\n\n".join(
             line.strip() for line in original_text.splitlines() if line.strip()
         )
-        chapter_info = {
+        return {
             "id": str(chapter_id),
             "title": title,
             "content": final_paragraphs_str,
-            "author_say": author_say.strip() if author_say else "",
-            "updated_at": update_time,
-            "update_timestamp": update_timestamp,
-            "modify_time": modify_time,
-            "word_count": word_count,
-            "vip": vip,
-            "purchased": is_buy,
-            "order": order,
-            "seq": seq,
-            "volume": volume,
+            "extra": {
+                "author_say": author_say.strip() if author_say else "",
+                "updated_at": update_time,
+                "update_timestamp": update_timestamp,
+                "modify_time": modify_time,
+                "word_count": word_count,
+                "seq": seq,
+                "volume": volume,
+            },
         }
-        return chapter_info
     except Exception as e:
         logger.warning(
             "[Parser] parse error for encrypted chapter '%s': %s", chapter_id, e
         )
-    return {}
+    return None
 def extract_paragraphs_recursively(
     soup: BeautifulSoup, chapter_id: str = ""
-) -> List[Dict[str, Any]]:
+) -> list[dict[str, Any]]:
     """
     Extracts paragraph elements under <main id="c-{chapter_id}"> from HTML
     and converts them to a nested data structure for further processing.
@@ -211,7 +206,7 @@ def extract_paragraphs_recursively(
     :return list: List of parsed <p> paragraph data.
     """
-    def parse_element(elem: Any) -> Union[Dict[str, Any], None]:
+    def parse_element(elem: Any) -> dict[str, Any] | None:
         if not isinstance(elem, Tag):
             return None
         result = {"tag": elem.name, "attrs": dict(elem.attrs), "data": []}
@@ -229,7 +224,7 @@ def extract_paragraphs_recursively(
     if chapter_id:
         main_id = f"c-{chapter_id}"
         main_tag = soup.find("main", id=main_id)
-        if not main_tag:
+        if not isinstance(main_tag, Tag):
             return []
     else:
         main_tag = soup
@@ -243,7 +238,7 @@ def extract_paragraphs_recursively(
     return result
-def parse_rule(css_str: str) -> Dict[str, Any]:
+def parse_rule(css_str: str) -> dict[str, Any]:
     """
     Parse a CSS string and extract style rules for rendering.
@@ -258,7 +253,7 @@ def parse_rule(css_str: str) -> Dict[str, Any]:
     :return: Dict with "rules" and "orders" for rendering.
     """
-    rules: Dict[str, Any] = {}
+    rules: dict[str, Any] = {}
     orders = []
     stylesheet = tinycss2.parse_stylesheet(
@@ -322,7 +317,7 @@ def parse_rule(css_str: str) -> Dict[str, Any]:
     return {"rules": rules, "orders": orders}
-def parse_paragraph_names(rules: Dict[str, Any]) -> Set[str]:
+def parse_paragraph_names(rules: dict[str, Any]) -> set[str]:
     """
     Extract all paragraph selector names from parsed rules, excluding "sy".
     """
@@ -335,16 +330,16 @@ def parse_paragraph_names(rules: Dict[str, Any]) -> Set[str]:
 def parse_end_number(
-    main_paragraphs: List[Dict[str, Any]], paragraph_names: Set[str]
-) -> Optional[int]:
+    main_paragraphs: list[dict[str, Any]], paragraph_names: set[str]
+) -> int | None:
     """
     Find the most frequent numeric suffix from tag names
     matched by given paragraph prefixes.
     """
-    end_numbers: Dict[int, int] = {}
+    end_numbers: dict[int, int] = {}
     sorted_names = sorted(paragraph_names, key=len, reverse=True)
-    def rec_parse(item: Union[List[Any], Dict[str, Any]]) -> None:
+    def rec_parse(item: list[Any] | dict[str, Any]) -> None:
         if isinstance(item, list):
             for element in item:
                 rec_parse(element)
@@ -359,7 +354,7 @@ def parse_end_number(
                             end_numbers[num] = end_numbers.get(num, 0) + 1
                         break
             for val in item.values():
-                if isinstance(val, (list, dict)):
+                if isinstance(val, (list | dict)):
                     rec_parse(val)
     rec_parse(main_paragraphs)
@@ -381,10 +376,10 @@ def parse_end_number(
 def render_paragraphs(
-    main_paragraphs: List[Dict[str, Any]],
-    rules: Dict[str, Any],
+    main_paragraphs: list[dict[str, Any]],
+    rules: dict[str, Any],
     end_number: int,
-) -> Tuple[str, List[str]]:
+) -> tuple[str, list[str]]:
     """
     Applies the parsed CSS rules to the paragraph structure and
     reconstructs the visible text.
@@ -403,11 +398,11 @@ def render_paragraphs(
         - A reconstructed paragraph string with line breaks.
         - A list of mirrored (reflected) characters for later OCR processing.
     """
-    orders: List[Tuple[str, str]] = rules.get("orders", [])
+    orders: list[tuple[str, str]] = rules.get("orders", [])
     rules = rules.get("rules", {})
-    refl_list: List[str] = []
+    refl_list: list[str] = []
-    def apply_rule(data: Dict[str, Any], rule: Dict[str, Any]) -> str:
+    def apply_rule(data: dict[str, Any], rule: dict[str, Any]) -> str:
         if rule.get("delete-all", False):
             return ""
@@ -418,10 +413,7 @@ def render_paragraphs(
                 curr_str += first_data
         if rule.get("delete-first", False):
-            if len(curr_str) <= 1:
-                curr_str = ""
-            else:
-                curr_str = curr_str[1:]
+            curr_str = "" if len(curr_str) <= 1 else curr_str[1:]
         curr_str += rule.get("append-end-char", "")
@@ -480,7 +472,7 @@ def render_paragraphs(
                     logger.debug(f"[parser] not find p_class_str: {class_list}")
                     continue
                 # 普通标签处理，根据 orders 顺序匹配
-                for ord_selector, ord_id in orders:
+                for ord_selector, _ in orders:
                     tag_name = f"{ord_selector}{end_number}"
                     if data.get("tag") != tag_name:
                         continue
@@ -489,7 +481,7 @@ def render_paragraphs(
                     ordered_cache[ord_selector] = apply_rule(data, curr_rule)
                     break
         # 最后按 orders 顺序拼接
-        for ord_selector, ord_id in orders:
+        for ord_selector, _ in orders:
             if ord_selector in ordered_cache:
                 paragraphs_str += ordered_cache[ord_selector]

novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_normal.py RENAMED Viewed

@@ -1,18 +1,18 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
-novel_downloader.core.parsers.qidian_parser.browser.chapter_normal
-------------------------------------------------------------------
+novel_downloader.core.parsers.qidian.browser.chapter_normal
+-----------------------------------------------------------
 Parser logic for extracting readable text from Qidian chapters
 that use plain (non-encrypted) browser-rendered HTML.
 """
 import logging
-from typing import Any, Dict
 from bs4 import BeautifulSoup
+from novel_downloader.utils.chapter_storage import ChapterDict
 from ..shared import (
     extract_chapter_info,
     find_ssr_page_context,
@@ -24,7 +24,7 @@ logger = logging.getLogger(__name__)
 def parse_normal_chapter(
     soup: BeautifulSoup,
     chapter_id: str,
-) -> Dict[str, Any]:
+) -> ChapterDict | None:
     """
     Extract and format the chapter text from a normal Qidian page.
     Returns empty string if VIP/encrypted.
@@ -44,7 +44,7 @@ def parse_normal_chapter(
         main = soup.select_one("div#app div#reader-content main")
         if not main:
             logger.warning("[Parser] Main content not found for chapter")
-            return {}
+            return None
         ssr_data = find_ssr_page_context(soup)
         chapter_info = extract_chapter_info(ssr_data)
@@ -52,7 +52,7 @@ def parse_normal_chapter(
             logger.warning(
                 "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
             )
-            return {}
+            return None
         title = chapter_info.get("chapterName", "Untitled")
         chapter_id = chapter_info.get("chapterId", "")
@@ -61,10 +61,7 @@ def parse_normal_chapter(
         update_timestamp = chapter_info.get("updateTimestamp", 0)
         modify_time = chapter_info.get("modifyTime", 0)
         word_count = chapter_info.get("wordsCount", 0)
-        vip = bool(chapter_info.get("vipStatus", 0))
-        is_buy = bool(chapter_info.get("isBuy", 0))
         seq = chapter_info.get("seq", None)
-        order = chapter_info.get("chapterOrder", None)
         volume = chapter_info.get("extra", {}).get("volumeName", "")
         # remove review spans
@@ -78,20 +75,19 @@ def parse_normal_chapter(
             "id": str(chapter_id),
             "title": title,
             "content": chapter_text,
-            "author_say": author_say.strip() if author_say else "",
-            "updated_at": update_time,
-            "update_timestamp": update_timestamp,
-            "modify_time": modify_time,
-            "word_count": word_count,
-            "vip": vip,
-            "purchased": is_buy,
-            "order": order,
-            "seq": seq,
-            "volume": volume,
+            "extra": {
+                "author_say": author_say.strip() if author_say else "",
+                "updated_at": update_time,
+                "update_timestamp": update_timestamp,
+                "modify_time": modify_time,
+                "word_count": word_count,
+                "seq": seq,
+                "volume": volume,
+            },
         }
     except Exception as e:
         logger.warning(
             "[Parser] parse error for normal chapter '%s': %s", chapter_id, e
         )
-        return {}
+    return None

novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_router.py RENAMED Viewed

@@ -1,8 +1,7 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
-novel_downloader.core.parsers.qidian_parser.browser.chapter_router
-------------------------------------------------------------------
+novel_downloader.core.parsers.qidian.browser.chapter_router
+-----------------------------------------------------------
 Routing logic for selecting the correct chapter parser for Qidian browser pages.
@@ -13,7 +12,9 @@ routes the parsing task to either the encrypted or normal chapter parser.
 from __future__ import annotations
 import logging
-from typing import TYPE_CHECKING, Any, Dict
+from typing import TYPE_CHECKING
+from novel_downloader.utils.chapter_storage import ChapterDict
 from ..shared import (
     can_view_chapter,
@@ -32,7 +33,7 @@ def parse_chapter(
     parser: QidianBrowserParser,
     html_str: str,
     chapter_id: str,
-) -> Dict[str, Any]:
+) -> ChapterDict | None:
     """
     Extract and return the formatted textual content of chapter.
@@ -48,11 +49,11 @@ def parse_chapter(
             logger.warning(
                 "[Parser] Chapter '%s' is not purchased or inaccessible.", chapter_id
             )
-            return {}
+            return None
         if is_encrypted(soup):
             if not parser._decode_font:
-                return {}
+                return None
             try:
                 from .chapter_encrypted import parse_encrypted_chapter
@@ -62,9 +63,9 @@ def parse_chapter(
                     "[Parser] Encrypted chapter '%s' requires extra dependencies.",
                     chapter_id,
                 )
-                return {}
+                return None
         return parse_normal_chapter(soup, chapter_id)
     except Exception as e:
         logger.warning("[Parser] parse error for chapter '%s': %s", chapter_id, e)
-        return {}
+    return None

novel-downloader 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

novel-downloader 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl