novel-downloader 1.3.0__py3-none-any.whl → 1.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/download.py +1 -1
- novel_downloader/config/adapter.py +3 -0
- novel_downloader/config/models.py +3 -0
- novel_downloader/core/downloaders/__init__.py +23 -1
- novel_downloader/core/downloaders/biquge/__init__.py +2 -0
- novel_downloader/core/downloaders/biquge/biquge_async.py +27 -0
- novel_downloader/core/downloaders/biquge/biquge_sync.py +5 -3
- novel_downloader/core/downloaders/common/common_async.py +5 -3
- novel_downloader/core/downloaders/common/common_sync.py +18 -10
- novel_downloader/core/downloaders/esjzone/__init__.py +14 -0
- novel_downloader/core/downloaders/esjzone/esjzone_async.py +27 -0
- novel_downloader/core/downloaders/esjzone/esjzone_sync.py +27 -0
- novel_downloader/core/downloaders/qianbi/__init__.py +14 -0
- novel_downloader/core/downloaders/qianbi/qianbi_async.py +27 -0
- novel_downloader/core/downloaders/qianbi/qianbi_sync.py +27 -0
- novel_downloader/core/downloaders/qidian/qidian_sync.py +9 -6
- novel_downloader/core/downloaders/sfacg/__init__.py +14 -0
- novel_downloader/core/downloaders/sfacg/sfacg_async.py +27 -0
- novel_downloader/core/downloaders/sfacg/sfacg_sync.py +27 -0
- novel_downloader/core/downloaders/yamibo/__init__.py +14 -0
- novel_downloader/core/downloaders/yamibo/yamibo_async.py +27 -0
- novel_downloader/core/downloaders/yamibo/yamibo_sync.py +27 -0
- novel_downloader/core/factory/downloader.py +35 -7
- novel_downloader/core/factory/parser.py +23 -2
- novel_downloader/core/factory/requester.py +32 -7
- novel_downloader/core/factory/saver.py +14 -2
- novel_downloader/core/interfaces/async_requester.py +3 -3
- novel_downloader/core/interfaces/parser.py +7 -2
- novel_downloader/core/interfaces/sync_requester.py +3 -3
- novel_downloader/core/parsers/__init__.py +15 -5
- novel_downloader/core/parsers/base.py +7 -2
- novel_downloader/core/parsers/biquge/main_parser.py +13 -4
- novel_downloader/core/parsers/common/main_parser.py +13 -4
- novel_downloader/core/parsers/esjzone/__init__.py +10 -0
- novel_downloader/core/parsers/esjzone/main_parser.py +219 -0
- novel_downloader/core/parsers/qianbi/__init__.py +10 -0
- novel_downloader/core/parsers/qianbi/main_parser.py +142 -0
- novel_downloader/core/parsers/qidian/browser/main_parser.py +13 -4
- novel_downloader/core/parsers/qidian/session/main_parser.py +13 -4
- novel_downloader/core/parsers/sfacg/__init__.py +10 -0
- novel_downloader/core/parsers/sfacg/main_parser.py +166 -0
- novel_downloader/core/parsers/yamibo/__init__.py +10 -0
- novel_downloader/core/parsers/yamibo/main_parser.py +194 -0
- novel_downloader/core/requesters/__init__.py +33 -3
- novel_downloader/core/requesters/base/async_session.py +14 -10
- novel_downloader/core/requesters/base/browser.py +4 -7
- novel_downloader/core/requesters/base/session.py +25 -11
- novel_downloader/core/requesters/biquge/__init__.py +2 -0
- novel_downloader/core/requesters/biquge/async_session.py +71 -0
- novel_downloader/core/requesters/biquge/session.py +6 -6
- novel_downloader/core/requesters/common/async_session.py +4 -4
- novel_downloader/core/requesters/common/session.py +6 -6
- novel_downloader/core/requesters/esjzone/__init__.py +13 -0
- novel_downloader/core/requesters/esjzone/async_session.py +211 -0
- novel_downloader/core/requesters/esjzone/session.py +235 -0
- novel_downloader/core/requesters/qianbi/__init__.py +13 -0
- novel_downloader/core/requesters/qianbi/async_session.py +96 -0
- novel_downloader/core/requesters/qianbi/session.py +125 -0
- novel_downloader/core/requesters/qidian/broswer.py +11 -10
- novel_downloader/core/requesters/qidian/session.py +14 -11
- novel_downloader/core/requesters/sfacg/__init__.py +13 -0
- novel_downloader/core/requesters/sfacg/async_session.py +204 -0
- novel_downloader/core/requesters/sfacg/session.py +242 -0
- novel_downloader/core/requesters/yamibo/__init__.py +13 -0
- novel_downloader/core/requesters/yamibo/async_session.py +211 -0
- novel_downloader/core/requesters/yamibo/session.py +237 -0
- novel_downloader/core/savers/__init__.py +15 -3
- novel_downloader/core/savers/base.py +1 -0
- novel_downloader/core/savers/esjzone.py +25 -0
- novel_downloader/core/savers/qianbi.py +25 -0
- novel_downloader/core/savers/sfacg.py +25 -0
- novel_downloader/core/savers/yamibo.py +25 -0
- novel_downloader/locales/en.json +1 -0
- novel_downloader/locales/zh.json +1 -0
- novel_downloader/resources/config/settings.toml +40 -4
- novel_downloader/utils/time_utils/__init__.py +2 -1
- novel_downloader/utils/time_utils/datetime_utils.py +3 -1
- novel_downloader/utils/time_utils/sleep_utils.py +43 -1
- {novel_downloader-1.3.0.dist-info → novel_downloader-1.3.2.dist-info}/METADATA +25 -20
- {novel_downloader-1.3.0.dist-info → novel_downloader-1.3.2.dist-info}/RECORD +85 -47
- {novel_downloader-1.3.0.dist-info → novel_downloader-1.3.2.dist-info}/WHEEL +0 -0
- {novel_downloader-1.3.0.dist-info → novel_downloader-1.3.2.dist-info}/entry_points.txt +0 -0
- {novel_downloader-1.3.0.dist-info → novel_downloader-1.3.2.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.3.0.dist-info → novel_downloader-1.3.2.dist-info}/top_level.txt +0 -0
novel_downloader/core/factory/parser.py
@@ -14,19 +14,40 @@ from novel_downloader.core.interfaces import ParserProtocol
 from novel_downloader.core.parsers import (
     BiqugeParser,
     CommonParser,
+    EsjzoneParser,
+    QianbiParser,
     QidianBrowserParser,
     QidianSessionParser,
+    SfacgParser,
+    YamiboParser,
 )
 
 ParserBuilder = Callable[[ParserConfig], ParserProtocol]
 
 _site_map: dict[str, dict[str, ParserBuilder]] = {
+    "biquge": {
+        "session": BiqugeParser,
+        "async": BiqugeParser,
+    },
+    "esjzone": {
+        "session": EsjzoneParser,
+        "async": EsjzoneParser,
+    },
+    "qianbi": {
+        "session": QianbiParser,
+        "async": QianbiParser,
+    },
     "qidian": {
         "browser": QidianBrowserParser,
         "session": QidianSessionParser,
     },
-    "biquge": {
-        "session": BiqugeParser,
+    "sfacg": {
+        "session": SfacgParser,
+        "async": SfacgParser,
+    },
+    "yamibo": {
+        "session": YamiboParser,
+        "async": YamiboParser,
     },
 }
 
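The nested map keys each site to its per-mode parser builders ("session" and "async" share one class for the new sites, while qidian keeps separate "browser" and "session" implementations). A minimal sketch of the dispatch this enables, assuming a get_parser(site, mode, config) helper of the same shape as the other factories in this release (the real body is outside this hunk and presumably falls back to the rule-driven CommonParser via load_site_rules):

    def get_parser(site: str, mode: str, config: ParserConfig) -> ParserProtocol:
        site_key = site.lower()
        if site_key in _site_map:
            # e.g. _site_map["sfacg"]["async"] -> SfacgParser
            return _site_map[site_key][mode](config)
        raise ValueError(f"Unsupported site: {site!r}")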
novel_downloader/core/factory/requester.py
@@ -15,30 +15,55 @@ from novel_downloader.core.interfaces import (
     SyncRequesterProtocol,
 )
 from novel_downloader.core.requesters import (
+    BiqugeAsyncSession,
     BiqugeSession,
     CommonAsyncSession,
     CommonSession,
+    EsjzoneAsyncSession,
+    EsjzoneSession,
+    QianbiAsyncSession,
+    QianbiSession,
     QidianBrowser,
     QidianSession,
+    SfacgAsyncSession,
+    SfacgSession,
+    YamiboAsyncSession,
+    YamiboSession,
 )
 
 AsyncRequesterBuilder = Callable[[RequesterConfig], AsyncRequesterProtocol]
 SyncRequesterBuilder = Callable[[RequesterConfig], SyncRequesterProtocol]
 
 
-
-
-
+_async_site_map: dict[str, AsyncRequesterBuilder] = {
+    "biquge": BiqugeAsyncSession,
+    "esjzone": EsjzoneAsyncSession,
+    "qianbi": QianbiAsyncSession,
+    "sfacg": SfacgAsyncSession,
+    "yamibo": YamiboAsyncSession,
+}
 _sync_site_map: dict[
     str,
     dict[str, SyncRequesterBuilder],
 ] = {
+    "biquge": {
+        "session": BiqugeSession,
+    },
+    "esjzone": {
+        "session": EsjzoneSession,
+    },
+    "qianbi": {
+        "session": QianbiSession,
+    },
     "qidian": {
         "session": QidianSession,
         "browser": QidianBrowser,
     },
-    "biquge": {
-        "session": BiqugeSession,
+    "sfacg": {
+        "session": SfacgSession,
+    },
+    "yamibo": {
+        "session": YamiboSession,
     },
 }
 
@@ -57,8 +82,8 @@ def get_async_requester(
     site_key = site.lower()
 
     # site-specific
-
-
+    if site_key in _async_site_map:
+        return _async_site_map[site_key](config)
 
     # fallback
     site_rules = load_site_rules()
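With the flat _async_site_map, each of the five mapped sites now resolves in a single lookup before the rule-based fallback. A hedged usage example (assuming RequesterConfig is exported from novel_downloader.config like SaverConfig is, and that it default-constructs):

    from novel_downloader.config import RequesterConfig
    from novel_downloader.core.factory.requester import get_async_requester

    # "sfacg" hits _async_site_map directly instead of the load_site_rules() path
    requester = get_async_requester(site="sfacg", config=RequesterConfig())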
novel_downloader/core/factory/saver.py
@@ -7,17 +7,29 @@ This module implements a factory function for creating saver instances
 based on the site name and parser mode specified in the configuration.
 """
 
+from collections.abc import Callable
+
 from novel_downloader.config import SaverConfig, load_site_rules
 from novel_downloader.core.interfaces import SaverProtocol
 from novel_downloader.core.savers import (
     BiqugeSaver,
     CommonSaver,
+    EsjzoneSaver,
+    QianbiSaver,
     QidianSaver,
+    SfacgSaver,
+    YamiboSaver,
 )
 
-
-
+SaverBuilder = Callable[[SaverConfig], SaverProtocol]
+
+_site_map: dict[str, SaverBuilder] = {
     "biquge": BiqugeSaver,
+    "esjzone": EsjzoneSaver,
+    "qianbi": QianbiSaver,
+    "qidian": QidianSaver,
+    "sfacg": SfacgSaver,
+    "yamibo": YamiboSaver,
 }
 
 
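Saver dispatch likewise collapses to one flat lookup. A minimal sketch of a get_saver body consistent with this map (the actual function is outside the hunk; it presumably keeps a rule-driven fallback via load_site_rules like the other factories):

    def get_saver(site: str, config: SaverConfig) -> SaverProtocol:
        site_key = site.lower()
        if site_key in _site_map:
            return _site_map[site_key](config)
        raise ValueError(f"Unsupported site: {site!r}")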
novel_downloader/core/interfaces/async_requester.py
@@ -40,7 +40,7 @@ class AsyncRequesterProtocol(Protocol):
         self,
         book_id: str,
         **kwargs: Any,
-    ) -> str:
+    ) -> list[str]:
         """
         Fetch the raw HTML (or JSON) of the book info page asynchronously.
 
@@ -54,7 +54,7 @@ class AsyncRequesterProtocol(Protocol):
         book_id: str,
         chapter_id: str,
         **kwargs: Any,
-    ) -> str:
+    ) -> list[str]:
         """
         Fetch the raw HTML (or JSON) of a single chapter asynchronously.
 
@@ -68,7 +68,7 @@ class AsyncRequesterProtocol(Protocol):
         self,
         page: int = 1,
         **kwargs: Any,
-    ) -> str:
+    ) -> list[str]:
         """
         Optional: Retrieve the HTML content of the authenticated
         user's bookcase page asynchronously.
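The str -> list[str] change lets one logical fetch return several physical pages (for example, a chapter spread across pagination), and parsers then receive the whole list. A toy coroutine matching the new annotation (the function name and URL here are hypothetical; the protocol's real method names sit outside these context lines):

    import aiohttp

    async def fetch_chapter_pages(book_id: str, chapter_id: str) -> list[str]:
        # One list entry per physical page of the chapter.
        async with aiohttp.ClientSession() as session:
            async with session.get(
                f"https://example.com/book/{book_id}/{chapter_id}.html"
            ) as resp:
                return [await resp.text()]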
novel_downloader/core/interfaces/parser.py
@@ -21,7 +21,11 @@ class ParserProtocol(Protocol):
     - accept a book_id context for multi-step workflows.
     """
 
-    def parse_book_info(self, html_str: str) -> dict[str, Any]:
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
         """
         Parse and return a dictionary of book information from the raw HTML.
 
@@ -32,8 +36,9 @@ class ParserProtocol(Protocol):
 
     def parse_chapter(
         self,
-        html_str: str,
+        html_str: list[str],
         chapter_id: str,
+        **kwargs: Any,
     ) -> ChapterDict | None:
         """
         Parse and return the text content of one chapter.
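A minimal class that structurally satisfies the updated protocol, for illustration only (the return dict follows the ChapterDict shape used by the parsers later in this diff):

    from typing import Any

    from novel_downloader.utils.chapter_storage import ChapterDict

    class EchoParser:
        def parse_book_info(self, html_str: list[str], **kwargs: Any) -> dict[str, Any]:
            return {"pages": len(html_str)} if html_str else {}

        def parse_chapter(
            self,
            html_str: list[str],
            chapter_id: str,
            **kwargs: Any,
        ) -> ChapterDict | None:
            if not html_str:
                return None
            return {
                "id": chapter_id,
                "title": "",
                "content": "\n\n".join(html_str),
                "extra": {},
            }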
novel_downloader/core/interfaces/sync_requester.py
@@ -38,7 +38,7 @@ class SyncRequesterProtocol(Protocol):
         self,
         book_id: str,
         **kwargs: Any,
-    ) -> str:
+    ) -> list[str]:
         """
         Fetch the raw HTML (or JSON) of the book info page.
 
@@ -52,7 +52,7 @@ class SyncRequesterProtocol(Protocol):
         book_id: str,
         chapter_id: str,
         **kwargs: Any,
-    ) -> str:
+    ) -> list[str]:
         """
         Fetch the raw HTML (or JSON) of a single chapter.
 
@@ -66,7 +66,7 @@ class SyncRequesterProtocol(Protocol):
         self,
         page: int = 1,
         **kwargs: Any,
-    ) -> str:
+    ) -> list[str]:
         """
         Optional: Retrieve the HTML content of the authenticated user's bookcase page.
 
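The sync protocol mirrors the async change; the same contract in blocking form (again with a hypothetical function name and URL):

    import requests

    def fetch_book_info_pages(book_id: str) -> list[str]:
        resp = requests.get(f"https://example.com/book/{book_id}")
        resp.raise_for_status()
        # Single-page sites wrap the one document in a list.
        return [resp.text]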
novel_downloader/core/parsers/__init__.py
@@ -6,24 +6,34 @@ novel_downloader.core.parsers
 This package defines all site-specific parsing modules
 for the novel_downloader framework.
 
-Currently supported:
-- Qidian (起点中文网)
-
 Modules:
--
--
+- biquge (笔趣阁)
+- esjzone (ESJ Zone)
+- qianbi (铅笔小说)
+- qidian (起点中文网)
+- sfacg (SF轻小说)
+- yamibo (百合会)
+- common (通用架构)
 """
 
 from .biquge import BiqugeParser
 from .common import CommonParser
+from .esjzone import EsjzoneParser
+from .qianbi import QianbiParser
 from .qidian import (
     QidianBrowserParser,
     QidianSessionParser,
 )
+from .sfacg import SfacgParser
+from .yamibo import YamiboParser
 
 __all__ = [
     "BiqugeParser",
     "CommonParser",
+    "EsjzoneParser",
+    "QianbiParser",
     "QidianBrowserParser",
     "QidianSessionParser",
+    "SfacgParser",
+    "YamiboParser",
 ]
novel_downloader/core/parsers/base.py
@@ -49,7 +49,11 @@ class BaseParser(ParserProtocol, abc.ABC):
         self._cache_dir = self._base_cache_dir
 
     @abc.abstractmethod
-    def parse_book_info(self, html_str: str) -> dict[str, Any]:
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
         """
         Parse a book info page and extract metadata and chapter structure.
 
@@ -64,8 +68,9 @@ class BaseParser(ParserProtocol, abc.ABC):
     @abc.abstractmethod
     def parse_chapter(
         self,
-        html_str: str,
+        html_str: list[str],
         chapter_id: str,
+        **kwargs: Any,
     ) -> ChapterDict | None:
         """
         Parse a single chapter page and extract clean text or simplified HTML.
novel_downloader/core/parsers/biquge/main_parser.py
@@ -18,14 +18,20 @@ from novel_downloader.utils.chapter_storage import ChapterDict
 class BiqugeParser(BaseParser):
     """ """
 
-    def parse_book_info(self, html_str: str) -> dict[str, Any]:
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
         """
         Parse a book info page and extract metadata and chapter structure.
 
         :param html: Raw HTML of the book info page.
         :return: Parsed metadata and chapter structure as a dictionary.
         """
-        tree = etree.HTML(html_str)
+        if not html_str:
+            return {}
+        tree = etree.HTML(html_str[0])
         result: dict[str, Any] = {}
 
         def extract_text(elem: _Element | None) -> str:
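Call sites that used to pass one HTML string now pass a one-element list, and an empty fetch result returns {} instead of handing None to lxml. Sketch, given an already-constructed BiqugeParser:

    # parser: a BiqugeParser instance; pages: the list a requester returned
    info = parser.parse_book_info(pages)       # e.g. pages == [html]
    assert parser.parse_book_info([]) == {}    # new empty-input guard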
@@ -90,8 +96,9 @@ class BiqugeParser(BaseParser):
 
     def parse_chapter(
         self,
-        html_str: str,
+        html_str: list[str],
         chapter_id: str,
+        **kwargs: Any,
     ) -> ChapterDict | None:
         """
         Parse a single chapter page and extract clean text or simplified HTML.
@@ -100,7 +107,9 @@ class BiqugeParser(BaseParser):
         :param chapter_id: Identifier of the chapter being parsed.
         :return: Cleaned chapter content as plain text or minimal HTML.
         """
-        tree = etree.HTML(html_str, parser=None)
+        if not html_str:
+            return None
+        tree = etree.HTML(html_str[0], parser=None)
 
         # 提取标题
         title_elem = tree.xpath('//div[@class="bookname"]/h1')
novel_downloader/core/parsers/common/main_parser.py
@@ -35,21 +35,28 @@ class CommonParser(BaseParser):
         self._site = site
         self._site_rule = site_rule
 
-    def parse_book_info(self, html_str: str) -> dict[str, Any]:
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
         """
         Parse a book info page and extract metadata and chapter structure.
 
         :param html: Raw HTML of the book info page.
         :return: Parsed metadata and chapter structure as a dictionary.
         """
-        extractor = HTMLExtractor(html_str)
+        if not html_str:
+            return {}
+        extractor = HTMLExtractor(html_str[0])
         rules = self._site_rule["book_info"]
         return extractor.extract_book_info(rules)
 
     def parse_chapter(
         self,
-        html_str: str,
+        html_str: list[str],
         chapter_id: str,
+        **kwargs: Any,
     ) -> ChapterDict | None:
         """
         Parse a single chapter page and extract clean text or simplified HTML.
@@ -58,7 +65,9 @@ class CommonParser(BaseParser):
         :param chapter_id: Identifier of the chapter being parsed.
         :return: Cleaned chapter content as plain text or minimal HTML.
         """
-        extractor = HTMLExtractor(html_str)
+        if not html_str:
+            return None
+        extractor = HTMLExtractor(html_str[0])
         chapter_rules = self._site_rule["chapter"]
 
         # 必须有正文内容
novel_downloader/core/parsers/esjzone/main_parser.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.esjzone.main_parser
+-------------------------------------------------
+
+"""
+
+from typing import Any
+
+from lxml import etree
+from lxml.etree import _Element
+
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.utils.chapter_storage import ChapterDict
+
+
+class EsjzoneParser(BaseParser):
+    """ """
+
+    # Book info XPaths
+    _BOOK_NAME_XPATH = '//h2[contains(@class, "text-normal")]/text()'
+    _AUTHOR_XPATH = '//li[strong[text()="作者:"]]/a/text()'
+    _COVER_URL_XPATH = '//div[contains(@class,"product-gallery")]//img/@src'
+    _UPDATE_TIME_XPATH = '//li[strong[text()="更新日期:"]]/text()'
+    _WORD_COUNT_XPATH = '//span[@id="txt"]/text()'
+    _TYPE_XPATH = '//li[strong[text()="類型:"]]/text()'
+    _ALT_NAME_XPATH = '//li[strong[text()="其他書名:"]]/text()'
+    _WEB_URL_XPATH = '//li[strong[text()="Web生肉:"]]/a/@href'
+    _SUMMARY_XPATH = '//div[@class="description"]/p//text()'
+
+    # Chapter XPaths
+    _CHAPTER_TEXT_XPATH = 'string(//div[contains(@class, "forum-content")])'
+    _CHAPTER_CONTENT_NODES_XPATH = '//div[contains(@class, "forum-content")]/*'
+    _CHAPTER_TIME_XPATHS = [
+        '//i[contains(@class, "icon-clock")]/following-sibling::text()',
+        '//i[contains(@class, "icon-pen-tool")]/following-sibling::text()',
+    ]
+
+    _CHECK_FORUM_XPATH = '//div[@class="page-title"]//ul[@class="breadcrumbs"]/li[not(@class="slash")]//text()'  # noqa: E501
+
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
+        """
+        Parse a book info page and extract metadata and chapter structure.
+
+        注: 由于网站使用了多种不同的分卷格式, 已经尝试兼容常见情况,
+        但仍可能存在未覆盖的 cases
+
+        :param html: Raw HTML of the book info page.
+        :return: Parsed metadata and chapter structure as a dictionary.
+        """
+        if not html_str or self._is_forum_page(html_str):
+            return {}
+        tree = etree.HTML(html_str[0])
+        result: dict[str, Any] = {}
+
+        result["book_name"] = self._get_text(tree, self._BOOK_NAME_XPATH)
+        result["author"] = self._get_text(tree, self._AUTHOR_XPATH)
+        result["cover_url"] = self._get_text(tree, self._COVER_URL_XPATH)
+        result["update_time"] = self._get_text(tree, self._UPDATE_TIME_XPATH)
+        result["word_count"] = self._get_text(
+            tree, self._WORD_COUNT_XPATH, clean_comma=True
+        )
+        result["type"] = self._get_text(tree, self._TYPE_XPATH)
+        result["alt_name"] = self._get_text(tree, self._ALT_NAME_XPATH)
+        result["web_url"] = self._get_text(tree, self._WEB_URL_XPATH)
+        # result["summary"] = self._get_text(tree, self._SUMMARY_XPATH, join=True)
+        paras = tree.xpath('//div[@class="description"]/p')
+        texts = [p.xpath("string()").strip() for p in paras]
+        result["summary"] = "\n".join(texts).strip()
+
+        volumes: list[dict[str, Any]] = []
+        current_vol: dict[str, Any] = {}
+
+        def _start_volume(name: str) -> None:
+            nonlocal current_vol
+            name = name.strip() or "未命名卷"
+            if name == "未命名卷" and current_vol is not None:
+                return
+            current_vol = {"volume_name": name, "chapters": []}
+            volumes.append(current_vol)
+
+        _start_volume("單卷")
+
+        nodes = tree.xpath('//div[@id="chapterList"]/details') + tree.xpath(
+            '//div[@id="chapterList"]/*[not(self::details)]'
+        )
+
+        for node in nodes:
+            tag = node.tag.lower()
+
+            if tag == "details":
+                # ---- DETAILS‐based layout ----
+                summary = node.find("summary")
+                vol_name = summary.text if summary is not None else "未命名卷"
+                _start_volume(vol_name)
+
+                # all chapters inside this details
+                for a in node.findall("a"):
+                    title = "".join(a.xpath(".//p//text()")).strip()
+                    href = a.get("href", "")
+                    chap_id = href.rstrip("/").split("/")[-1].split(".", 1)[0]
+                    current_vol["chapters"].append(
+                        {"title": title, "url": href, "chapterId": chap_id}
+                    )
+
+            elif (
+                tag in ("h2",)
+                or (tag == "p" and node.get("class") == "non")
+                or tag == "summary"
+            ):
+                # Handle possible volume title markers:
+                # - <h2>: standard volume header
+                # - <p class="non">: alternative volume header style
+                # - <summary>: fallback for stray <summary> tags outside <details>
+                _start_volume(node.xpath("string()"))
+
+            elif tag == "a":
+                # ---- chapter link, attach to current volume ----
+                title = "".join(node.xpath(".//p//text()")).strip()
+                href = node.get("href", "")
+                chap_id = href.rstrip("/").split("/")[-1].split(".", 1)[0]
+                current_vol["chapters"].append(
+                    {"title": title, "url": href, "chapterId": chap_id}
+                )
+        volumes = [vol for vol in volumes if vol["chapters"]]
+        result["volumes"] = volumes
+
+        return result
+
+    def parse_chapter(
+        self,
+        html_str: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        """
+        Parse a single chapter page and extract clean text or simplified HTML.
+
+        :param html: Raw HTML of the chapter page.
+        :param chapter_id: Identifier of the chapter being parsed.
+        :return: Cleaned chapter content as plain text or minimal HTML.
+        """
+        if not html_str or self._is_forum_page(html_str):
+            return None
+        tree = etree.HTML(html_str[0], parser=None)
+
+        content_lines: list[str] = []
+        content_nodes = tree.xpath(self._CHAPTER_CONTENT_NODES_XPATH)
+        for node in content_nodes:
+            if node.tag == "p":
+                img_srcs = node.xpath(".//img/@src")
+                if img_srcs:
+                    for src in img_srcs:
+                        content_lines.append(f'<img src="{src}" />')
+                else:
+                    text = "".join(node.xpath(".//text()")).strip()
+                    if text:
+                        content_lines.append(text)
+            elif node.tag == "a":
+                img_srcs = node.xpath(".//img/@src")
+                for src in img_srcs:
+                    content_lines.append(f'<img src="{src}" />')
+
+        content = (
+            "\n\n".join(content_lines).strip()
+            if content_lines
+            else tree.xpath(self._CHAPTER_TEXT_XPATH).strip()
+        )
+        if not content:
+            return None
+
+        title_nodes = tree.xpath("//h2/text()")
+        title = title_nodes[0].strip() if title_nodes else ""
+
+        updated_at = next(
+            (
+                x.strip()
+                for xp in self._CHAPTER_TIME_XPATHS
+                for x in tree.xpath(xp)
+                if x.strip()
+            ),
+            "",
+        )
+
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {"site": "esjzone", "updated_at": updated_at},
+        }
+
+    def _is_forum_page(self, html_str: list[str]) -> bool:
+        if not html_str:
+            return False
+
+        tree = etree.HTML(html_str[0])
+        page_title = tree.xpath('string(//div[@class="page-title"]//h1)').strip()
+        if page_title != "論壇":
+            return False
+        breadcrumb: list[str] = tree.xpath(self._CHECK_FORUM_XPATH)
+        breadcrumb = [s.strip() for s in breadcrumb if s.strip()]
+        return breadcrumb == ["Home", "論壇"]
+
+    @staticmethod
+    def _get_text(
+        tree: _Element,
+        xpath: str,
+        join: bool = False,
+        clean_comma: bool = False,
+    ) -> str:
+        data = tree.xpath(xpath)
+        if not data:
+            return ""
+        text = "\n".join(data) if join else data[0].strip()
+        return text.replace(",", "") if clean_comma else text
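The chapter-ID extraction used in both branches of parse_book_info above is a plain string split; a standalone check (the URL is illustrative only):

    href = "https://example.org/forum/1567/69939.html"
    chap_id = href.rstrip("/").split("/")[-1].split(".", 1)[0]
    assert chap_id == "69939"  # path and ".html" suffix stripped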