novel-downloader 1.5.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/__init__.py +1 -3
- novel_downloader/cli/clean.py +21 -88
- novel_downloader/cli/config.py +26 -21
- novel_downloader/cli/download.py +77 -64
- novel_downloader/cli/export.py +16 -20
- novel_downloader/cli/main.py +1 -1
- novel_downloader/cli/search.py +62 -65
- novel_downloader/cli/ui.py +156 -0
- novel_downloader/config/__init__.py +8 -5
- novel_downloader/config/adapter.py +65 -105
- novel_downloader/config/{loader.py → file_io.py} +53 -26
- novel_downloader/core/__init__.py +1 -0
- novel_downloader/core/archived/deqixs/fetcher.py +115 -0
- novel_downloader/core/archived/deqixs/parser.py +132 -0
- novel_downloader/core/archived/deqixs/searcher.py +89 -0
- novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
- novel_downloader/core/archived/wanbengo/searcher.py +98 -0
- novel_downloader/core/archived/xshbook/searcher.py +93 -0
- novel_downloader/core/downloaders/__init__.py +3 -24
- novel_downloader/core/downloaders/base.py +49 -23
- novel_downloader/core/downloaders/common.py +191 -137
- novel_downloader/core/downloaders/qianbi.py +187 -146
- novel_downloader/core/downloaders/qidian.py +187 -141
- novel_downloader/core/downloaders/registry.py +4 -2
- novel_downloader/core/downloaders/signals.py +46 -0
- novel_downloader/core/exporters/__init__.py +3 -20
- novel_downloader/core/exporters/base.py +33 -37
- novel_downloader/core/exporters/common/__init__.py +1 -2
- novel_downloader/core/exporters/common/epub.py +15 -10
- novel_downloader/core/exporters/common/main_exporter.py +19 -12
- novel_downloader/core/exporters/common/txt.py +14 -9
- novel_downloader/core/exporters/epub_util.py +59 -29
- novel_downloader/core/exporters/linovelib/__init__.py +1 -0
- novel_downloader/core/exporters/linovelib/epub.py +23 -25
- novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
- novel_downloader/core/exporters/linovelib/txt.py +17 -11
- novel_downloader/core/exporters/qidian.py +2 -8
- novel_downloader/core/exporters/registry.py +4 -2
- novel_downloader/core/exporters/txt_util.py +7 -7
- novel_downloader/core/fetchers/__init__.py +54 -48
- novel_downloader/core/fetchers/aaatxt.py +83 -0
- novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
- novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
- novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
- novel_downloader/core/fetchers/dxmwx.py +110 -0
- novel_downloader/core/fetchers/eightnovel.py +139 -0
- novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
- novel_downloader/core/fetchers/guidaye.py +85 -0
- novel_downloader/core/fetchers/hetushu.py +92 -0
- novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
- novel_downloader/core/fetchers/ixdzs8.py +113 -0
- novel_downloader/core/fetchers/jpxs123.py +101 -0
- novel_downloader/core/fetchers/lewenn.py +83 -0
- novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
- novel_downloader/core/fetchers/piaotia.py +105 -0
- novel_downloader/core/fetchers/qbtr.py +101 -0
- novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
- novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +46 -39
- novel_downloader/core/fetchers/quanben5.py +92 -0
- novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
- novel_downloader/core/fetchers/registry.py +5 -16
- novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
- novel_downloader/core/fetchers/shencou.py +106 -0
- novel_downloader/core/fetchers/shuhaige.py +84 -0
- novel_downloader/core/fetchers/tongrenquan.py +84 -0
- novel_downloader/core/fetchers/ttkan.py +95 -0
- novel_downloader/core/fetchers/wanbengo.py +83 -0
- novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
- novel_downloader/core/fetchers/xiguashuwu.py +177 -0
- novel_downloader/core/fetchers/xs63b.py +171 -0
- novel_downloader/core/fetchers/xshbook.py +85 -0
- novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
- novel_downloader/core/fetchers/yibige.py +114 -0
- novel_downloader/core/interfaces/__init__.py +1 -9
- novel_downloader/core/interfaces/downloader.py +6 -2
- novel_downloader/core/interfaces/exporter.py +7 -7
- novel_downloader/core/interfaces/fetcher.py +4 -17
- novel_downloader/core/interfaces/parser.py +5 -6
- novel_downloader/core/interfaces/searcher.py +9 -1
- novel_downloader/core/parsers/__init__.py +49 -12
- novel_downloader/core/parsers/aaatxt.py +132 -0
- novel_downloader/core/parsers/b520.py +116 -0
- novel_downloader/core/parsers/base.py +63 -12
- novel_downloader/core/parsers/biquyuedu.py +133 -0
- novel_downloader/core/parsers/dxmwx.py +162 -0
- novel_downloader/core/parsers/eightnovel.py +224 -0
- novel_downloader/core/parsers/esjzone.py +61 -66
- novel_downloader/core/parsers/guidaye.py +128 -0
- novel_downloader/core/parsers/hetushu.py +139 -0
- novel_downloader/core/parsers/i25zw.py +137 -0
- novel_downloader/core/parsers/ixdzs8.py +186 -0
- novel_downloader/core/parsers/jpxs123.py +137 -0
- novel_downloader/core/parsers/lewenn.py +142 -0
- novel_downloader/core/parsers/linovelib.py +48 -64
- novel_downloader/core/parsers/piaotia.py +189 -0
- novel_downloader/core/parsers/qbtr.py +136 -0
- novel_downloader/core/parsers/qianbi.py +48 -50
- novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +272 -330
- novel_downloader/core/parsers/qidian/chapter_normal.py +24 -55
- novel_downloader/core/parsers/qidian/main_parser.py +11 -38
- novel_downloader/core/parsers/qidian/utils/__init__.py +1 -0
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
- novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
- novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
- novel_downloader/core/parsers/quanben5.py +103 -0
- novel_downloader/core/parsers/registry.py +5 -16
- novel_downloader/core/parsers/sfacg.py +38 -45
- novel_downloader/core/parsers/shencou.py +215 -0
- novel_downloader/core/parsers/shuhaige.py +111 -0
- novel_downloader/core/parsers/tongrenquan.py +116 -0
- novel_downloader/core/parsers/ttkan.py +132 -0
- novel_downloader/core/parsers/wanbengo.py +191 -0
- novel_downloader/core/parsers/xiaoshuowu.py +173 -0
- novel_downloader/core/parsers/xiguashuwu.py +435 -0
- novel_downloader/core/parsers/xs63b.py +161 -0
- novel_downloader/core/parsers/xshbook.py +134 -0
- novel_downloader/core/parsers/yamibo.py +87 -131
- novel_downloader/core/parsers/yibige.py +166 -0
- novel_downloader/core/searchers/__init__.py +34 -3
- novel_downloader/core/searchers/aaatxt.py +107 -0
- novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
- novel_downloader/core/searchers/base.py +112 -36
- novel_downloader/core/searchers/dxmwx.py +105 -0
- novel_downloader/core/searchers/eightnovel.py +84 -0
- novel_downloader/core/searchers/esjzone.py +43 -25
- novel_downloader/core/searchers/hetushu.py +92 -0
- novel_downloader/core/searchers/i25zw.py +93 -0
- novel_downloader/core/searchers/ixdzs8.py +107 -0
- novel_downloader/core/searchers/jpxs123.py +107 -0
- novel_downloader/core/searchers/piaotia.py +100 -0
- novel_downloader/core/searchers/qbtr.py +106 -0
- novel_downloader/core/searchers/qianbi.py +74 -40
- novel_downloader/core/searchers/quanben5.py +144 -0
- novel_downloader/core/searchers/registry.py +24 -8
- novel_downloader/core/searchers/shuhaige.py +124 -0
- novel_downloader/core/searchers/tongrenquan.py +110 -0
- novel_downloader/core/searchers/ttkan.py +92 -0
- novel_downloader/core/searchers/xiaoshuowu.py +122 -0
- novel_downloader/core/searchers/xiguashuwu.py +95 -0
- novel_downloader/core/searchers/xs63b.py +104 -0
- novel_downloader/locales/en.json +31 -82
- novel_downloader/locales/zh.json +32 -83
- novel_downloader/models/__init__.py +21 -22
- novel_downloader/models/book.py +44 -0
- novel_downloader/models/config.py +4 -37
- novel_downloader/models/login.py +1 -1
- novel_downloader/models/search.py +5 -0
- novel_downloader/resources/config/settings.toml +8 -70
- novel_downloader/resources/json/xiguashuwu.json +718 -0
- novel_downloader/utils/__init__.py +13 -22
- novel_downloader/utils/chapter_storage.py +3 -2
- novel_downloader/utils/constants.py +4 -29
- novel_downloader/utils/cookies.py +6 -18
- novel_downloader/utils/crypto_utils/__init__.py +13 -0
- novel_downloader/utils/crypto_utils/aes_util.py +90 -0
- novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
- novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
- novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
- novel_downloader/utils/epub/__init__.py +1 -1
- novel_downloader/utils/epub/constants.py +57 -16
- novel_downloader/utils/epub/documents.py +88 -194
- novel_downloader/utils/epub/models.py +0 -14
- novel_downloader/utils/epub/utils.py +63 -96
- novel_downloader/utils/file_utils/__init__.py +2 -23
- novel_downloader/utils/file_utils/io.py +3 -113
- novel_downloader/utils/file_utils/sanitize.py +0 -4
- novel_downloader/utils/fontocr.py +207 -0
- novel_downloader/utils/logger.py +8 -16
- novel_downloader/utils/network.py +2 -2
- novel_downloader/utils/state.py +4 -90
- novel_downloader/utils/text_utils/__init__.py +1 -7
- novel_downloader/utils/text_utils/diff_display.py +5 -7
- novel_downloader/utils/time_utils/__init__.py +5 -11
- novel_downloader/utils/time_utils/datetime_utils.py +20 -29
- novel_downloader/utils/time_utils/sleep_utils.py +4 -8
- novel_downloader/web/__init__.py +13 -0
- novel_downloader/web/components/__init__.py +11 -0
- novel_downloader/web/components/navigation.py +35 -0
- novel_downloader/web/main.py +66 -0
- novel_downloader/web/pages/__init__.py +17 -0
- novel_downloader/web/pages/download.py +78 -0
- novel_downloader/web/pages/progress.py +147 -0
- novel_downloader/web/pages/search.py +329 -0
- novel_downloader/web/services/__init__.py +17 -0
- novel_downloader/web/services/client_dialog.py +164 -0
- novel_downloader/web/services/cred_broker.py +113 -0
- novel_downloader/web/services/cred_models.py +35 -0
- novel_downloader/web/services/task_manager.py +264 -0
- novel_downloader-2.0.0.dist-info/METADATA +171 -0
- novel_downloader-2.0.0.dist-info/RECORD +210 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
- novel_downloader/core/downloaders/biquge.py +0 -29
- novel_downloader/core/downloaders/esjzone.py +0 -29
- novel_downloader/core/downloaders/linovelib.py +0 -29
- novel_downloader/core/downloaders/sfacg.py +0 -29
- novel_downloader/core/downloaders/yamibo.py +0 -29
- novel_downloader/core/exporters/biquge.py +0 -22
- novel_downloader/core/exporters/esjzone.py +0 -22
- novel_downloader/core/exporters/qianbi.py +0 -22
- novel_downloader/core/exporters/sfacg.py +0 -22
- novel_downloader/core/exporters/yamibo.py +0 -22
- novel_downloader/core/fetchers/base/__init__.py +0 -14
- novel_downloader/core/fetchers/base/browser.py +0 -422
- novel_downloader/core/fetchers/biquge/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/browser.py +0 -209
- novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
- novel_downloader/core/fetchers/linovelib/browser.py +0 -198
- novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/browser.py +0 -326
- novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
- novel_downloader/core/fetchers/sfacg/browser.py +0 -194
- novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
- novel_downloader/core/fetchers/yamibo/browser.py +0 -234
- novel_downloader/core/parsers/biquge.py +0 -139
- novel_downloader/models/chapter.py +0 -25
- novel_downloader/models/types.py +0 -13
- novel_downloader/tui/__init__.py +0 -7
- novel_downloader/tui/app.py +0 -32
- novel_downloader/tui/main.py +0 -17
- novel_downloader/tui/screens/__init__.py +0 -14
- novel_downloader/tui/screens/home.py +0 -198
- novel_downloader/tui/screens/login.py +0 -74
- novel_downloader/tui/styles/home_layout.tcss +0 -79
- novel_downloader/tui/widgets/richlog_handler.py +0 -24
- novel_downloader/utils/cache.py +0 -24
- novel_downloader/utils/fontocr/__init__.py +0 -22
- novel_downloader/utils/fontocr/hash_store.py +0 -280
- novel_downloader/utils/fontocr/hash_utils.py +0 -103
- novel_downloader/utils/fontocr/model_loader.py +0 -69
- novel_downloader/utils/fontocr/ocr_v1.py +0 -315
- novel_downloader/utils/fontocr/ocr_v2.py +0 -764
- novel_downloader/utils/fontocr/ocr_v3.py +0 -744
- novel_downloader-1.5.0.dist-info/METADATA +0 -196
- novel_downloader-1.5.0.dist-info/RECORD +0 -164
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
novel_downloader/core/parsers/dxmwx.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.dxmwx
+-----------------------------------
+
+"""
+
+import re
+from datetime import datetime
+from typing import Any
+
+from lxml import html
+
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.core.parsers.registry import register_parser
+from novel_downloader.models import (
+    BookInfoDict,
+    ChapterDict,
+    ChapterInfoDict,
+    VolumeInfoDict,
+)
+
+
+@register_parser(
+    site_keys=["dxmwx"],
+)
+class DxmwxParser(BaseParser):
+    """
+    Parser for 大熊猫文学网 book pages.
+    """
+
+    _RE_DATE = re.compile(r"\d{4}-\d{2}-\d{2}")
+    _RE_SPACES = re.compile(r"[ \t\u3000]+")
+    _RE_NEWLINES = re.compile(r"\n{2,}")
+    _RE_TITLE_WS = re.compile(r"\s+")
+
+    def parse_book_info(
+        self,
+        html_list: list[str],
+        **kwargs: Any,
+    ) -> BookInfoDict | None:
+        if len(html_list) < 2:
+            return None
+
+        info_tree = html.fromstring(html_list[0])
+        catalog_tree = html.fromstring(html_list[1])
+
+        book_name = self._first_str(
+            info_tree.xpath("//span[contains(@style,'font-size: 24px')]/text()")
+        )
+        author = self._first_str(
+            info_tree.xpath(
+                "//div[contains(@style,'height: 28px') and contains(., '著')]//a/text()"
+            )
+        )
+        tags = [
+            t.strip()
+            for t in info_tree.xpath("//span[@class='typebut']//a/text()")
+            if t.strip()
+        ]
+        cover_url = "https://www.dxmwx.org" + self._first_str(
+            info_tree.xpath("//img[@class='imgwidth']/@src")
+        )
+
+        raw_update = self._first_str(
+            info_tree.xpath(
+                "normalize-space(string(//span[starts-with(normalize-space(.), '更新时间:')]))"  # noqa: E501
+            )
+        )
+        raw_update = raw_update.replace("更新时间:", "").strip()
+        update_time = self._normalize_update_date(raw_update)
+
+        nodes = info_tree.xpath(
+            "//div[contains(@style,'min-height') and "
+            "contains(@style,'padding-left') and contains(@style,'padding-right')][1]"
+        )
+        summary = ""
+        if nodes:
+            texts = [
+                t.replace("\xa0", " ").strip() for t in nodes[0].xpath(".//text()")
+            ]
+            lines = [t for t in texts if t]
+            summary = "\n".join(lines)
+            summary = re.sub(r"^\s*[::]\s*", "", summary)
+            summary = self._clean_spaces(summary)
+
+        chapters: list[ChapterInfoDict] = []
+        for a in catalog_tree.xpath(
+            "//div[contains(@style,'height:40px') and contains(@style,'border-bottom')]//a"  # noqa: E501
+        ):
+            href = a.get("href") or ""
+            title = (a.text_content() or "").strip()
+            if not href or not title:
+                continue
+            # "/read/57215_50197663.html" -> "50197663"
+            chap_id = href.split("read/", 1)[-1].split(".html", 1)[0].split("_")[-1]
+            chapters.append({"title": title, "url": href, "chapterId": chap_id})
+        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]
+
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "update_time": update_time,
+            "tags": tags,
+            "summary": summary,
+            "volumes": volumes,
+            "extra": {},
+        }
+
+    def parse_chapter(
+        self,
+        html_list: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        if not html_list:
+            return None
+
+        tree = html.fromstring(html_list[0])
+
+        title = self._first_str(tree.xpath("//h1[@id='ChapterTitle']/text()"))
+        title = self._RE_TITLE_WS.sub(" ", title).strip()
+        if not title:
+            title = f"第 {chapter_id} 章"
+
+        paragraphs: list[str] = []
+        for p in tree.xpath("//div[@id='Lab_Contents']//p"):
+            text = self._clean_spaces(p.text_content())
+            if not text:
+                continue
+            if "点这里听书" in text or "大熊猫文学" in text:
+                continue
+            paragraphs.append(text)
+
+        content = "\n".join(paragraphs).strip()
+        if not content:
+            return None
+
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {"site": "dxmwx"},
+        }
+
+    @classmethod
+    def _clean_spaces(cls, s: str) -> str:
+        s = s.replace("\xa0", " ")
+        s = cls._RE_SPACES.sub(" ", s)
+        s = cls._RE_NEWLINES.sub("\n", s)
+        return s.strip()
+
+    @classmethod
+    def _normalize_update_date(cls, raw: str) -> str:
+        """Return a YYYY-MM-DD string."""
+        if not raw:
+            return datetime.now().strftime("%Y-%m-%d")
+        m = cls._RE_DATE.search(raw)
+        if m:
+            return m.group(0)
+        return datetime.now().strftime("%Y-%m-%d")
novel_downloader/core/parsers/eightnovel.py
@@ -0,0 +1,224 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.eightnovel
+----------------------------------------
+
+"""
+
+import re
+from typing import Any
+
+from lxml import html
+
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.core.parsers.registry import register_parser
+from novel_downloader.models import (
+    BookInfoDict,
+    ChapterDict,
+    ChapterInfoDict,
+    VolumeInfoDict,
+)
+
+
+@register_parser(
+    site_keys=["eightnovel", "8novel"],
+)
+class EightnovelParser(BaseParser):
+    """
+    Parser for 无限轻小说 book pages.
+    """
+
+    BASE_URL = "https://www.8novel.com"
+    _SPLIT_STR_PATTERN = re.compile(
+        r'["\']([^"\']+)["\']\s*\.split\s*\(\s*["\']\s*,\s*["\']\s*\)', re.DOTALL
+    )
+    _RE_AUTHOR = re.compile(r"作者[::]?\s*")
+    _RE_UPDATE = re.compile(r"更新[::]?\s*")
+
+    def parse_book_info(
+        self,
+        html_list: list[str],
+        **kwargs: Any,
+    ) -> BookInfoDict | None:
+        if not html_list:
+            return None
+
+        tree = html.fromstring(html_list[0])
+
+        # --- Basic metadata ---
+        book_name = self._first_str(tree.xpath("//li[contains(@class,'h2')]/text()"))
+
+        author_raw = self._first_str(
+            tree.xpath("//span[contains(@class,'item-info-author')]/text()")
+        )
+        author = self._RE_AUTHOR.sub("", author_raw)
+
+        cover_url = self.BASE_URL + self._first_str(
+            tree.xpath("//div[contains(@class,'item-cover')]//img/@src")
+        )
+
+        update_raw = self._first_str(
+            tree.xpath("//span[contains(@class,'item-info-date')]/text()")
+        )
+        update_time = self._RE_UPDATE.sub("", update_raw)
+
+        counts = tree.xpath(
+            "//li[@class='small text-gray']//span[contains(@class,'item-info-num')]/text()"  # noqa: E501
+        )
+        word_count = counts[1].strip() + "萬字" if len(counts) >= 2 else ""
+
+        tags = tree.xpath("//meta[@property='og:novel:category']/@content")
+
+        # --- Summary ---
+        summary_nodes = tree.xpath(
+            "//li[contains(@class,'full_text') and contains(@class,'mt-2')]"
+        )
+        if summary_nodes:
+            texts = [t.strip() for t in summary_nodes[0].itertext()]
+            summary = "\n".join(line for line in texts if line)
+        else:
+            summary = ""
+
+        # --- Chapters / Volumes ---
+        volumes: list[VolumeInfoDict] = []
+        for vol_div in tree.xpath("//div[contains(@class,'folder') and @pid]"):
+            # Volume title
+            h3 = vol_div.xpath(".//div[contains(@class,'vol-title')]//h3")
+            vol_name = (
+                h3[0].text_content().split("/")[0].strip() if h3 else "Unnamed Volume"
+            )
+
+            # Chapters
+            chapters: list[ChapterInfoDict] = []
+            for a in vol_div.xpath(
+                ".//a[contains(@class,'episode_li') and contains(@class,'d-block')]"
+            ):
+                title = (a.text_content() or "").strip()
+                href = a.get("href") or ""
+                if not href or not title:
+                    continue
+                url = href if href.startswith("http") else self.BASE_URL + href
+                chapter_id = href.split("?")[-1]  # "/read/3355/?270015" -> "270015"
+                chapters.append({"title": title, "url": url, "chapterId": chapter_id})
+
+            volumes.append({"volume_name": vol_name, "chapters": chapters})
+
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "update_time": update_time,
+            "word_count": word_count,
+            "tags": tags,
+            "summary": summary,
+            "volumes": volumes,
+            "extra": {},
+        }
+
+    def parse_chapter(
+        self,
+        html_list: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        if len(html_list) < 2:
+            return None
+
+        try:
+            id_title_map = self._build_id_title_map(html_list[0])
+            title = id_title_map.get(chapter_id) or ""
+        except Exception:
+            title = ""
+
+        wrapper = html.fromstring(f"<div>{html_list[1]}</div>")
+
+        segments: list[str] = []
+
+        self._append_segment(segments, wrapper.text)
+
+        for node in wrapper:
+            tag = node.tag.lower() if isinstance(node.tag, str) else ""
+
+            # A picture-gallery block
+            if tag == "div" and "content-pics" in (node.get("class") or ""):
+                for img in node.xpath(".//img"):
+                    src = img.get("src")
+                    full = src if not src.startswith("/") else self.BASE_URL + src
+                    segments.append(f'<img src="{full}" />')
+                self._append_segment(segments, node.tail)
+
+            # Standalone <img>
+            elif tag == "img":
+                src = node.get("src")
+                if not src:
+                    continue
+                full = src if not src.startswith("/") else self.BASE_URL + src
+                segments.append(f'<img src="{full}" />')
+                self._append_segment(segments, node.tail)
+
+            # Line break -> text in .tail is next paragraph
+            elif tag == "br":
+                self._append_segment(segments, node.tail)
+
+            # Any other element -> get its text content
+            else:
+                self._append_segment(segments, node.text_content())
+                self._append_segment(segments, node.tail)
+
+        # Remove final ad line if present
+        if segments and segments[-1] and segments[-1][0] in ("8", "⑧", "⒏"):
+            segments.pop()
+
+        content = "\n".join(segments).strip()
+        if not content.strip():
+            return None
+
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {"site": "eightnovel"},
+        }
+
+    @staticmethod
+    def _append_segment(segments: list[str], text: str | None) -> None:
+        """
+        Strip, filter out the '8novel' ad, and append non-empty text to segments.
+        """
+        if not text:
+            return
+        cleaned = text.strip()
+        if cleaned:
+            segments.append(cleaned)
+
+    @classmethod
+    def _build_id_title_map(cls, html_str: str) -> dict[str, str]:
+        """
+        Extracts two comma-split lists from html_str:
+          - A numeric list of IDs (one element longer)
+          - A list of titles
+        """
+        id_list = None
+        title_list = None
+
+        for content in cls._SPLIT_STR_PATTERN.findall(html_str):
+            items = [s.strip() for s in content.split(",")]
+            if items == [""]:
+                # skip bids=""
+                continue
+            if all(item.isdigit() for item in items):
+                id_list = items
+            else:
+                title_list = items
+
+            if id_list and title_list:
+                break
+
+        if not id_list or not title_list:
+            raise ValueError("Could not locate both ID and title lists")
+        if len(id_list) != len(title_list) + 1:
+            raise ValueError(
+                "ID list must be exactly one element longer than title list"
+            )
+
+        return dict(zip(id_list[:-1], title_list, strict=False))
novel_downloader/core/parsers/esjzone.py
@@ -12,26 +12,20 @@ from lxml import html
 
 from novel_downloader.core.parsers.base import BaseParser
 from novel_downloader.core.parsers.registry import register_parser
-from novel_downloader.models import
+from novel_downloader.models import (
+    BookInfoDict,
+    ChapterDict,
+    VolumeInfoDict,
+)
 
 
 @register_parser(
     site_keys=["esjzone"],
-    backends=["session", "browser"],
 )
 class EsjzoneParser(BaseParser):
-    """
-
-
-    _BOOK_NAME_XPATH = '//h2[contains(@class, "text-normal")]/text()'
-    _AUTHOR_XPATH = '//li[strong[text()="作者:"]]/a/text()'
-    _COVER_URL_XPATH = '//div[contains(@class,"product-gallery")]//img/@src'
-    _UPDATE_TIME_XPATH = '//li[strong[text()="更新日期:"]]/text()'
-    _WORD_COUNT_XPATH = '//span[@id="txt"]/text()'
-    _TYPE_XPATH = '//li[strong[text()="類型:"]]/text()'
-    _ALT_NAME_XPATH = '//li[strong[text()="其他書名:"]]/text()'
-    _WEB_URL_XPATH = '//li[strong[text()="Web生肉:"]]/a/@href'
-    _SUMMARY_XPATH = '//div[@class="description"]/p//text()'
+    """
+    Parser for esjzone book pages.
+    """
 
     # Chapter XPaths
     _CHAPTER_TEXT_XPATH = 'string(//div[contains(@class, "forum-content")])'
@@ -40,14 +34,13 @@ class EsjzoneParser(BaseParser):
         '//i[contains(@class, "icon-clock")]/following-sibling::text()',
         '//i[contains(@class, "icon-pen-tool")]/following-sibling::text()',
     ]
-
    _CHECK_FORUM_XPATH = '//div[@class="page-title"]//ul[@class="breadcrumbs"]/li[not(@class="slash")]//text()'  # noqa: E501
 
     def parse_book_info(
         self,
         html_list: list[str],
         **kwargs: Any,
-    ) ->
+    ) -> BookInfoDict | None:
         """
         Parse a book info page and extract metadata and chapter structure.
 
@@ -58,27 +51,40 @@ class EsjzoneParser(BaseParser):
         :return: Parsed metadata and chapter structure as a dictionary.
         """
         if not html_list or self._is_forum_page(html_list):
-            return
+            return None
+
         tree = html.fromstring(html_list[0])
-
-
-
-
-        result["cover_url"] = self._get_text(tree, self._COVER_URL_XPATH)
-        result["update_time"] = self._get_text(tree, self._UPDATE_TIME_XPATH)
-        result["word_count"] = self._get_text(
-            tree, self._WORD_COUNT_XPATH, clean_comma=True
+
+        # --- Basic metadata ---
+        book_name = self._first_str(
+            tree.xpath('//h2[contains(@class,"text-normal")]/text()')
         )
-
-
-
-
+        author = self._first_str(tree.xpath('//li[strong[text()="作者:"]]/a/text()'))
+        cover_url = self._first_str(
+            tree.xpath('//div[contains(@class,"product-gallery")]//img/@src')
+        )
+        update_time = self._first_str(
+            tree.xpath('//li[strong[text()="更新日期:"]]/text()')
+        )  # noqa: E501
+        word_count = self._first_str(
+            tree.xpath('//span[@id="txt"]/text()'), replaces=[(",", "")]
+        )
+        book_type = self._first_str(tree.xpath('//li[strong[text()="類型:"]]/text()'))
+        alt_name = self._first_str(
+            tree.xpath('//li[strong[text()="其他書名:"]]/text()')
+        )  # noqa: E501
+        web_url = self._first_str(tree.xpath('//li[strong[text()="Web生肉:"]]/a/@href'))
+
+        # Summary paragraphs
         paras = tree.xpath('//div[@class="description"]/p')
         texts = [p.xpath("string()").strip() for p in paras]
-
+        summary = "\n".join(t for t in texts if t)
 
-
-
+        current_vol: VolumeInfoDict = {
+            "volume_name": "單卷",
+            "chapters": [],
+        }
+        volumes: list[VolumeInfoDict] = [current_vol]
 
         def _is_garbage_title(name: str) -> bool:
             stripped = name.strip()
@@ -89,25 +95,18 @@ class EsjzoneParser(BaseParser):
             if _is_garbage_title(name):
                 return
             name = name.strip() or "未命名卷"
-            if
+            if current_vol and current_vol["volume_name"] == name:
                 return
             current_vol = {"volume_name": name, "chapters": []}
             volumes.append(current_vol)
 
-        _start_volume("單卷")
-
-        # nodes = tree.xpath('//div[@id="chapterList"]/details') + tree.xpath(
-        #     '//div[@id="chapterList"]/*[not(self::details)]'
-        # )
         nodes = tree.xpath('//div[@id="chapterList"]/*')
-
        for node in nodes:
             tag = node.tag.lower()
 
             if tag == "details":
                 # ---- DETAILS-based layout ----
-
-                vol_name = summary.text if summary is not None else "未命名卷"
+                vol_name = node.xpath("string(./summary)").strip() or "未命名卷"
                 _start_volume(vol_name)
 
                 # all chapters inside this details
@@ -116,7 +115,11 @@ class EsjzoneParser(BaseParser):
                     href = a.get("href", "")
                     chap_id = href.rstrip("/").split("/")[-1].split(".", 1)[0]
                     current_vol["chapters"].append(
-                        {
+                        {
+                            "title": title,
+                            "url": href,
+                            "chapterId": chap_id,
+                        }
                     )
 
             elif (
@@ -139,9 +142,21 @@ class EsjzoneParser(BaseParser):
                     {"title": title, "url": href, "chapterId": chap_id}
                 )
         volumes = [vol for vol in volumes if vol["chapters"]]
-        result["volumes"] = volumes
 
-        return
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "update_time": update_time,
+            "summary": summary,
+            "tags": [book_type],
+            "word_count": word_count,
+            "volumes": volumes,
+            "extra": {
+                "alt_name": alt_name,
+                "web_url": web_url,
+            },
+        }
 
     def parse_chapter(
         self,
@@ -149,16 +164,9 @@ class EsjzoneParser(BaseParser):
         chapter_id: str,
         **kwargs: Any,
     ) -> ChapterDict | None:
-        """
-        Parse a single chapter page and extract clean text or simplified HTML.
-
-        :param html_list: Raw HTML of the chapter page.
-        :param chapter_id: Identifier of the chapter being parsed.
-        :return: Cleaned chapter content as plain text or minimal HTML.
-        """
         if not html_list or self._is_forum_page(html_list):
             return None
-        tree = html.fromstring(html_list[0]
+        tree = html.fromstring(html_list[0])
 
         content_lines: list[str] = []
         content_nodes = tree.xpath(self._CHAPTER_CONTENT_NODES_XPATH)
@@ -178,7 +186,7 @@ class EsjzoneParser(BaseParser):
             content_lines.append(f'<img src="{src}" />')
 
         content = (
-            "\n
+            "\n".join(content_lines).strip()
             if content_lines
             else tree.xpath(self._CHAPTER_TEXT_XPATH).strip()
         )
@@ -216,16 +224,3 @@ class EsjzoneParser(BaseParser):
         breadcrumb: list[str] = tree.xpath(self._CHECK_FORUM_XPATH)
         breadcrumb = [s.strip() for s in breadcrumb if s.strip()]
         return breadcrumb == ["Home", "論壇"]
-
-    @staticmethod
-    def _get_text(
-        tree: html.HtmlElement,
-        xpath: str,
-        join: bool = False,
-        clean_comma: bool = False,
-    ) -> str:
-        data = tree.xpath(xpath)
-        if not data:
-            return ""
-        text = "\n".join(data) if join else data[0].strip()
-        return text.replace(",", "") if clean_comma else text