PyPI - novel-downloader - Versions diffs - 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl - Mend

novel-downloader 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (128) hide show

novel_downloader/core/interfaces/{async_downloader_protocol.py → async_downloader.py} RENAMED Viewed

@@ -1,26 +1,25 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
-novel_downloader.core.interfaces.async_downloader_protocol
+novel_downloader.core.interfaces.async_downloader
 ----------------------------------------------------------
 This module defines the AsyncDownloaderProtocol, a structural interface
 that outlines the expected behavior of any downloader class.
 """
-from typing import List, Protocol
+from typing import Protocol
 class AsyncDownloaderProtocol(Protocol):
     """
-    Protocol for fully‐asynchronous downloader classes.
+    Protocol for fully-asynchronous downloader classes.
     Defines the expected interface for any downloader implementation,
     including both batch and single book downloads,
     as well as optional pre-download hooks.
     """
-    async def download(self, book_ids: List[str]) -> None:
+    async def download(self, book_ids: list[str]) -> None:
         """
         Batch download entry point.

novel_downloader/core/interfaces/{async_requester_protocol.py → async_requester.py} RENAMED Viewed

@@ -1,15 +1,14 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
-novel_downloader.core.interfaces.async_requester_protocol
+novel_downloader.core.interfaces.async_requester
 --------------------------------------------------------
 Defines the AsyncRequesterProtocol interface for fetching raw HTML or JSON
 for book info pages, individual chapters, managing request lifecycle,
-and optionally retrieving a user's authenticated bookcase — all in async style.
+and optionally retrieving a user's authenticated bookcase.
 """
-from typing import Literal, Optional, Protocol, runtime_checkable
+from typing import Any, Literal, Protocol, runtime_checkable
 @runtime_checkable
@@ -24,7 +23,13 @@ class AsyncRequesterProtocol(Protocol):
     def is_async(self) -> Literal[True]:
         ...
-    async def login(self, max_retries: int = 3, manual_login: bool = False) -> bool:
+    async def login(
+        self,
+        username: str = "",
+        password: str = "",
+        manual_login: bool = False,
+        **kwargs: Any,
+    ) -> bool:
         """
         Attempt to log in asynchronously.
         :returns: True if login succeeded.
@@ -32,41 +37,47 @@ class AsyncRequesterProtocol(Protocol):
         ...
     async def get_book_info(
-        self, book_id: str, wait_time: Optional[float] = None
+        self,
+        book_id: str,
+        **kwargs: Any,
     ) -> str:
         """
         Fetch the raw HTML (or JSON) of the book info page asynchronously.
         :param book_id: The book identifier.
-        :param wait_time: Base number of seconds to wait before returning content.
         :return: The page content as a string.
         """
         ...
     async def get_book_chapter(
-        self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
+        self,
+        book_id: str,
+        chapter_id: str,
+        **kwargs: Any,
     ) -> str:
         """
         Fetch the raw HTML (or JSON) of a single chapter asynchronously.
         :param book_id: The book identifier.
         :param chapter_id: The chapter identifier.
-        :param wait_time: Base number of seconds to wait before returning content.
         :return: The chapter content as a string.
         """
         ...
-    async def get_bookcase(self, wait_time: Optional[float] = None) -> str:
+    async def get_bookcase(
+        self,
+        page: int = 1,
+        **kwargs: Any,
+    ) -> str:
         """
         Optional: Retrieve the HTML content of the authenticated
         user's bookcase page asynchronously.
-        :param wait_time: Base number of seconds to wait before returning content.
         :return: The HTML markup of the bookcase page.
         """
         ...
-    async def shutdown(self) -> None:
+    async def close(self) -> None:
         """
         Shutdown and clean up any resources (e.g., close aiohttp session).
         """

novel_downloader/core/interfaces/{parser_protocol.py → parser.py} RENAMED Viewed

@@ -1,14 +1,15 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
-novel_downloader.core.interfaces.parser_protocol
-------------------------------------------------
+novel_downloader.core.interfaces.parser
+---------------------------------------
 Defines the ParserProtocol interface for extracting book metadata,
 parsing individual chapter content, and setting parser context via book_id.
 """
-from typing import Any, Dict, Protocol, runtime_checkable
+from typing import Any, Protocol, runtime_checkable
+from novel_downloader.utils.chapter_storage import ChapterDict
 @runtime_checkable
@@ -20,7 +21,7 @@ class ParserProtocol(Protocol):
       - accept a book_id context for multi-step workflows.
     """
-    def parse_book_info(self, html_str: str) -> Dict[str, Any]:
+    def parse_book_info(self, html_str: str) -> dict[str, Any]:
         """
         Parse and return a dictionary of book information from the raw HTML.
@@ -29,7 +30,11 @@ class ParserProtocol(Protocol):
         """
         ...
-    def parse_chapter(self, html_str: str, chapter_id: str) -> Dict[str, Any]:
+    def parse_chapter(
+        self,
+        html_str: str,
+        chapter_id: str,
+    ) -> ChapterDict | None:
         """
         Parse and return the text content of one chapter.

novel_downloader/core/interfaces/{saver_protocol.py → saver.py} RENAMED Viewed

@@ -1,8 +1,7 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
-novel_downloader.core.interfaces.saver_protocol
-------------------------------------------------
+novel_downloader.core.interfaces.saver
+--------------------------------------
 Defines the SaverProtocol interface for persisting completed books in
 TXT, EPUB, Markdown, and PDF formats.

novel_downloader/core/interfaces/{downloader_protocol.py → sync_downloader.py} RENAMED Viewed

@@ -1,17 +1,16 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
-novel_downloader.core.interfaces.downloader_protocol
-----------------------------------------------------
+novel_downloader.core.interfaces.sync_downloader
+------------------------------------------------
-This module defines the DownloaderProtocol, a structural interface
+This module defines the SyncDownloaderProtocol, a structural interface
 that outlines the expected behavior of any downloader class.
 """
-from typing import List, Protocol
+from typing import Protocol
-class DownloaderProtocol(Protocol):
+class SyncDownloaderProtocol(Protocol):
     """
     Protocol for downloader classes.
@@ -20,7 +19,7 @@ class DownloaderProtocol(Protocol):
     as well as optional pre-download hooks.
     """
-    def download(self, book_ids: List[str]) -> None:
+    def download(self, book_ids: list[str]) -> None:
         """
         Batch download entry point.

novel_downloader/core/interfaces/{requester_protocol.py → sync_requester.py} RENAMED Viewed

@@ -1,19 +1,18 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
-novel_downloader.core.interfaces.requester_protocol
---------------------------------------------------
+novel_downloader.core.interfaces.sync_requester
+-----------------------------------------------
 Defines the RequesterProtocol interface for fetching raw HTML or JSON
 for book info pages, individual chapters, managing request lifecycle,
 and optionally retrieving a user's authenticated bookcase.
 """
-from typing import Literal, Optional, Protocol, runtime_checkable
+from typing import Any, Literal, Protocol, runtime_checkable
 @runtime_checkable
-class RequesterProtocol(Protocol):
+class SyncRequesterProtocol(Protocol):
     """
     A requester must be able to fetch raw HTML/data for:
       - a book's info page,
@@ -23,46 +22,61 @@ class RequesterProtocol(Protocol):
     def is_async(self) -> Literal[False]:
         ...
-    def login(self, max_retries: int = 3, manual_login: bool = False) -> bool:
+    def login(
+        self,
+        username: str = "",
+        password: str = "",
+        manual_login: bool = False,
+        **kwargs: Any,
+    ) -> bool:
         """
         Attempt to log in
         """
         ...
-    def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
+    def get_book_info(
+        self,
+        book_id: str,
+        **kwargs: Any,
+    ) -> str:
         """
         Fetch the raw HTML (or JSON) of the book info page.
         :param book_id: The book identifier.
-        :param wait_time: Base number of seconds to wait before returning content.
         :return: The page content as a string.
         """
         ...
     def get_book_chapter(
-        self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
+        self,
+        book_id: str,
+        chapter_id: str,
+        **kwargs: Any,
     ) -> str:
         """
         Fetch the raw HTML (or JSON) of a single chapter.
         :param book_id: The book identifier.
         :param chapter_id: The chapter identifier.
-        :param wait_time: Base number of seconds to wait before returning content.
         :return: The chapter content as a string.
         """
         ...
-    def shutdown(self) -> None:
+    def get_bookcase(
+        self,
+        page: int = 1,
+        **kwargs: Any,
+    ) -> str:
         """
-        Shutdown and cleans up resources.
+        Optional: Retrieve the HTML content of the authenticated user's bookcase page.
+        :param page: Page idx
+        :return: The HTML markup of the bookcase page.
         """
         ...
-    def get_bookcase(self, wait_time: Optional[float] = None) -> str:
+    def close(self) -> None:
         """
-        Optional: Retrieve the HTML content of the authenticated user's bookcase page.
-        :param wait_time: Base number of seconds to wait before returning content.
-        :return: The HTML markup of the bookcase page.
+        Shutdown and cleans up resources.
         """
         ...

novel_downloader/core/parsers/__init__.py CHANGED Viewed

@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
 novel_downloader.core.parsers
 -----------------------------
@@ -8,20 +7,22 @@ This package defines all site-specific parsing modules
 for the novel_downloader framework.
 Currently supported:
-- Qidian (起点中文网) via browser-rendered page parsing.
+- Qidian (起点中文网)
 Modules:
 - qidian_parser
 - common_parser
 """
-from .common_parser import CommonParser
-from .qidian_parser import (
+from .biquge import BiqugeParser
+from .common import CommonParser
+from .qidian import (
     QidianBrowserParser,
     QidianSessionParser,
 )
 __all__ = [
+    "BiqugeParser",
     "CommonParser",
     "QidianBrowserParser",
     "QidianSessionParser",

novel_downloader/core/parsers/{base_parser.py → base.py} RENAMED Viewed

@@ -1,8 +1,7 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
-novel_downloader.core.parsers.base_parser
------------------------------------------
+novel_downloader.core.parsers.base
+----------------------------------
 This module defines the BaseParser abstract class, which implements the
 ParserProtocol interface and provides a structured foundation for
@@ -16,10 +15,11 @@ a standard parsing interface for:
 import abc
 from pathlib import Path
-from typing import Any, Dict, Optional
+from typing import Any
 from novel_downloader.config import ParserConfig
 from novel_downloader.core.interfaces import ParserProtocol
+from novel_downloader.utils.chapter_storage import ChapterDict
 class BaseParser(ParserProtocol, abc.ABC):
@@ -33,19 +33,23 @@ class BaseParser(ParserProtocol, abc.ABC):
     Subclasses must implement actual parsing logic for specific sites.
     """
-    def __init__(self, config: ParserConfig):
+    def __init__(
+        self,
+        config: ParserConfig,
+    ):
         """
         Initialize the parser with a configuration object.
         :param config: ParserConfig object controlling parsing behavior.
         """
         self._config = config
-        self._book_id: Optional[str] = None
+        self._book_id: str | None = None
         self._base_cache_dir = Path(config.cache_dir)
+        self._cache_dir = self._base_cache_dir
     @abc.abstractmethod
-    def parse_book_info(self, html_str: str) -> Dict[str, Any]:
+    def parse_book_info(self, html_str: str) -> dict[str, Any]:
         """
         Parse a book info page and extract metadata and chapter structure.
@@ -58,7 +62,11 @@ class BaseParser(ParserProtocol, abc.ABC):
         ...
     @abc.abstractmethod
-    def parse_chapter(self, html_str: str, chapter_id: str) -> Dict[str, Any]:
+    def parse_chapter(
+        self,
+        html_str: str,
+        chapter_id: str,
+    ) -> ChapterDict | None:
         """
         Parse a single chapter page and extract clean text or simplified HTML.
@@ -69,7 +77,7 @@ class BaseParser(ParserProtocol, abc.ABC):
         ...
     @property
-    def book_id(self) -> Optional[str]:
+    def book_id(self) -> str | None:
         """
         Current book ID in context.
@@ -85,6 +93,7 @@ class BaseParser(ParserProtocol, abc.ABC):
         :param value: Book identifier.
         """
         self._book_id = value
+        self._cache_dir = self._base_cache_dir / value
         self._on_book_id_set()
     def _on_book_id_set(self) -> None:

novel_downloader/core/parsers/biquge/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.biquge
+------------------------------------
+"""
+from .main_parser import BiqugeParser
+__all__ = ["BiqugeParser"]

novel_downloader/core/parsers/biquge/main_parser.py ADDED Viewed

@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.biquge.main_parser
+------------------------------------------------
+"""
+import re
+from typing import Any
+from lxml import etree
+from lxml.etree import _Element
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.utils.chapter_storage import ChapterDict
+class BiqugeParser(BaseParser):
+    """ """
+    def parse_book_info(self, html_str: str) -> dict[str, Any]:
+        """
+        Parse a book info page and extract metadata and chapter structure.
+        :param html: Raw HTML of the book info page.
+        :return: Parsed metadata and chapter structure as a dictionary.
+        """
+        tree = etree.HTML(html_str, parser=None)
+        result: dict[str, Any] = {}
+        def extract_text(elem: _Element | None) -> str:
+            if elem is None:
+                return ""
+            return "".join(elem.itertext(tag=None)).strip()
+        # 书名
+        book_name_elem = tree.xpath('//div[@id="info"]/h1')
+        result["book_name"] = extract_text(book_name_elem[0]) if book_name_elem else ""
+        # 作者
+        author_elem = tree.xpath('//div[@id="info"]/p[1]')
+        if author_elem:
+            author_text = extract_text(author_elem[0]).replace("\u00a0", "")
+            match = re.search(r"作\s*者[:：]?\s*(\S+)", author_text)
+            result["author"] = match.group(1).strip() if match else ""
+        else:
+            result["author"] = ""
+        # 封面
+        cover_elem = tree.xpath('//div[@id="fmimg"]/img/@src')
+        result["cover_url"] = cover_elem[0].strip() if cover_elem else ""
+        # 最后更新时间
+        update_elem = tree.xpath('//div[@id="info"]/p[3]')
+        if update_elem:
+            update_text = extract_text(update_elem[0])
+            match = re.search(r"最后更新[:：]\s*(\S+)", update_text)
+            result["update_time"] = match.group(1).strip() if match else ""
+        else:
+            result["update_time"] = ""
+        # 简介
+        intro_elem = tree.xpath('//div[@id="intro"]')
+        result["summary"] = extract_text(intro_elem[0]) if intro_elem else ""
+        # 卷和章节
+        chapters = []
+        in_main_volume = False
+        list_dl = tree.xpath('//div[@id="list"]/dl')[0]
+        for elem in list_dl:
+            if elem.tag == "dt":
+                text = "".join(elem.itertext()).strip()
+                in_main_volume = "正文" in text
+            elif in_main_volume and elem.tag == "dd":
+                a: list[_Element] = elem.xpath("./a")
+                if a:
+                    title = "".join(a[0].itertext(tag=None)).strip()
+                    url = a[0].get("href", "").strip()
+                    href_cleaned = url.replace(".html", "")
+                    chapter_id_match = re.search(r"/(\d+)$", href_cleaned)
+                    chapter_id = chapter_id_match.group(1) if chapter_id_match else ""
+                    chapters.append(
+                        {"title": title, "url": url, "chapterId": chapter_id}
+                    )
+        result["volumes"] = [{"volume_name": "正文", "chapters": chapters}]
+        return result
+    def parse_chapter(
+        self,
+        html_str: str,
+        chapter_id: str,
+    ) -> ChapterDict | None:
+        """
+        Parse a single chapter page and extract clean text or simplified HTML.
+        :param html: Raw HTML of the chapter page.
+        :param chapter_id: Identifier of the chapter being parsed.
+        :return: Cleaned chapter content as plain text or minimal HTML.
+        """
+        tree = etree.HTML(html_str, parser=None)
+        # 提取标题
+        title_elem = tree.xpath('//div[@class="bookname"]/h1')
+        title = "".join(title_elem[0].itertext()).strip() if title_elem else ""
+        if not title:
+            title = f"第 {chapter_id} 章"
+        # 提取内容
+        content_elem = tree.xpath('//div[@id="content"]')
+        paragraphs = content_elem[0].xpath(".//p") if content_elem else []
+        paragraph_texts = [
+            "".join(p.itertext()).strip() for p in paragraphs if p is not None
+        ]
+        content = "\n\n".join([p for p in paragraph_texts if p])
+        if not content.strip():
+            return None
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {"site": "biquge"},
+        }

novel_downloader/core/parsers/{common_parser → common}/__init__.py RENAMED Viewed

@@ -1,8 +1,7 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
-novel_downloader.core.parsers.common_parser
--------------------------------------------
+novel_downloader.core.parsers.common
+------------------------------------
 This module provides a CommonParser class that implements
 general-purpose parsing logic for extracting novel metadata

novel_downloader/core/parsers/{common_parser → common}/helper.py RENAMED Viewed

@@ -1,15 +1,15 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
-novel_downloader.core.parsers.common_parser.helpers
----------------------------------------------------
+novel_downloader.core.parsers.common.helpers
+--------------------------------------------
 Shared utility functions for parsing Common pages.
 """
 import logging
 import re
-from typing import Any, Dict, Iterable, Iterator, List, Optional, cast
+from collections.abc import Iterable, Iterator
+from typing import Any, cast
 from bs4 import BeautifulSoup, Tag
@@ -47,7 +47,7 @@ class HTMLExtractor:
         self._html = html
         self._soup = html_to_soup(html)
-    def extract_book_info(self, rules: BookInfoRules) -> Dict[str, Any]:
+    def extract_book_info(self, rules: BookInfoRules) -> dict[str, Any]:
         """
         Extract structured book information from HTML according to the given rules.
@@ -56,7 +56,7 @@ class HTMLExtractor:
         :param rules: Extraction configuration specifying how to extract.
         :return: A dictionary containing extracted book information.
         """
-        book_info: Dict[str, Any] = {}
+        book_info: dict[str, Any] = {}
         for field_name, field_rules in rules.items():
             if field_rules is None:
@@ -72,7 +72,7 @@ class HTMLExtractor:
         return book_info
-    def extract_field(self, steps: List[RuleStep]) -> str:
+    def extract_field(self, steps: list[RuleStep]) -> str:
         """
         Execute a list of extraction steps on the given HTML.
@@ -209,7 +209,7 @@ class HTMLExtractor:
             return str(current.get_text().strip())
         return str(current or "").strip()
-    def extract_mixed_volumes(self, volume_rule: VolumesRules) -> List[Dict[str, Any]]:
+    def extract_mixed_volumes(self, volume_rule: VolumesRules) -> list[dict[str, Any]]:
         """
         Special mode: mixed <volume> and <chapter> under same parent.
         (e.g., dt / dd pattern in BiQuGe)
@@ -228,8 +228,8 @@ class HTMLExtractor:
                 "chapter_selector 和 volume_name_steps"
             )
-        volumes: List[Dict[str, Any]] = []
-        current_volume: Optional[Dict[str, Any]] = None
+        volumes: list[dict[str, Any]] = []
+        current_volume: dict[str, Any] | None = None
         if not chapter_steps_list:
             chapter_steps_list = []
         chapter_info_steps = {item["key"]: item["steps"] for item in chapter_steps_list}
@@ -258,7 +258,7 @@ class HTMLExtractor:
         return volumes
-    def extract_volume_blocks(self, volume_rule: VolumesRules) -> List[Dict[str, Any]]:
+    def extract_volume_blocks(self, volume_rule: VolumesRules) -> list[dict[str, Any]]:
         volume_selector = volume_rule.get("volume_selector")
         volume_name_steps = volume_rule.get("volume_name_steps")
         chapter_selector = volume_rule["chapter_selector"]
@@ -285,7 +285,7 @@ class HTMLExtractor:
         return volumes
-    def extract_flat_chapters(self, volume_rule: VolumesRules) -> List[Dict[str, Any]]:
+    def extract_flat_chapters(self, volume_rule: VolumesRules) -> list[dict[str, Any]]:
         chapter_selector = volume_rule["chapter_selector"]
         chapter_steps_list = volume_rule["chapter_steps"]
         volume_selector = volume_rule.get("volume_selector")
@@ -312,7 +312,7 @@ class HTMLExtractor:
     def extract_volumes_structure(
         self, volume_rule: VolumesRules
-    ) -> List[Dict[str, Any]]:
+    ) -> list[dict[str, Any]]:
         volume_mode = volume_rule.get("volume_mode", "normal")
         if volume_mode == "mixed":
             return self.extract_mixed_volumes(volume_rule)

novel-downloader 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl

novel-downloader 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl