PyPI - novel-downloader - Versions diffs - 1.5.0__py3-none-any.whl → 2.0.1__py3-none-any.whl - Mend

novel-downloader 1.5.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (248) hide show

novel_downloader/__init__.py +1 -1
novel_downloader/cli/__init__.py +1 -3
novel_downloader/cli/clean.py +21 -88
novel_downloader/cli/config.py +26 -21
novel_downloader/cli/download.py +79 -66
novel_downloader/cli/export.py +17 -21
novel_downloader/cli/main.py +1 -1
novel_downloader/cli/search.py +62 -65
novel_downloader/cli/ui.py +156 -0
novel_downloader/config/__init__.py +8 -5
novel_downloader/config/adapter.py +206 -209
novel_downloader/config/{loader.py → file_io.py} +53 -26
novel_downloader/core/__init__.py +5 -5
novel_downloader/core/archived/deqixs/fetcher.py +115 -0
novel_downloader/core/archived/deqixs/parser.py +132 -0
novel_downloader/core/archived/deqixs/searcher.py +89 -0
novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
novel_downloader/core/archived/wanbengo/searcher.py +98 -0
novel_downloader/core/archived/xshbook/searcher.py +93 -0
novel_downloader/core/downloaders/__init__.py +3 -24
novel_downloader/core/downloaders/base.py +49 -23
novel_downloader/core/downloaders/common.py +191 -137
novel_downloader/core/downloaders/qianbi.py +187 -146
novel_downloader/core/downloaders/qidian.py +187 -141
novel_downloader/core/downloaders/registry.py +4 -2
novel_downloader/core/downloaders/signals.py +46 -0
novel_downloader/core/exporters/__init__.py +3 -20
novel_downloader/core/exporters/base.py +33 -37
novel_downloader/core/exporters/common/__init__.py +1 -2
novel_downloader/core/exporters/common/epub.py +15 -10
novel_downloader/core/exporters/common/main_exporter.py +19 -12
novel_downloader/core/exporters/common/txt.py +17 -12
novel_downloader/core/exporters/epub_util.py +59 -29
novel_downloader/core/exporters/linovelib/__init__.py +1 -0
novel_downloader/core/exporters/linovelib/epub.py +23 -25
novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
novel_downloader/core/exporters/linovelib/txt.py +20 -14
novel_downloader/core/exporters/qidian.py +2 -8
novel_downloader/core/exporters/registry.py +4 -2
novel_downloader/core/exporters/txt_util.py +7 -7
novel_downloader/core/fetchers/__init__.py +54 -48
novel_downloader/core/fetchers/aaatxt.py +83 -0
novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
novel_downloader/core/fetchers/dxmwx.py +110 -0
novel_downloader/core/fetchers/eightnovel.py +139 -0
novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
novel_downloader/core/fetchers/guidaye.py +85 -0
novel_downloader/core/fetchers/hetushu.py +92 -0
novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
novel_downloader/core/fetchers/ixdzs8.py +113 -0
novel_downloader/core/fetchers/jpxs123.py +101 -0
novel_downloader/core/fetchers/lewenn.py +83 -0
novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
novel_downloader/core/fetchers/piaotia.py +105 -0
novel_downloader/core/fetchers/qbtr.py +101 -0
novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +56 -64
novel_downloader/core/fetchers/quanben5.py +92 -0
novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
novel_downloader/core/fetchers/registry.py +5 -16
novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
novel_downloader/core/fetchers/shencou.py +106 -0
novel_downloader/core/fetchers/shuhaige.py +84 -0
novel_downloader/core/fetchers/tongrenquan.py +84 -0
novel_downloader/core/fetchers/ttkan.py +95 -0
novel_downloader/core/fetchers/wanbengo.py +83 -0
novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
novel_downloader/core/fetchers/xiguashuwu.py +177 -0
novel_downloader/core/fetchers/xs63b.py +171 -0
novel_downloader/core/fetchers/xshbook.py +85 -0
novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
novel_downloader/core/fetchers/yibige.py +114 -0
novel_downloader/core/interfaces/__init__.py +1 -9
novel_downloader/core/interfaces/downloader.py +6 -2
novel_downloader/core/interfaces/exporter.py +7 -7
novel_downloader/core/interfaces/fetcher.py +6 -19
novel_downloader/core/interfaces/parser.py +7 -8
novel_downloader/core/interfaces/searcher.py +9 -1
novel_downloader/core/parsers/__init__.py +49 -12
novel_downloader/core/parsers/aaatxt.py +132 -0
novel_downloader/core/parsers/b520.py +116 -0
novel_downloader/core/parsers/base.py +64 -12
novel_downloader/core/parsers/biquyuedu.py +133 -0
novel_downloader/core/parsers/dxmwx.py +162 -0
novel_downloader/core/parsers/eightnovel.py +224 -0
novel_downloader/core/parsers/esjzone.py +64 -69
novel_downloader/core/parsers/guidaye.py +128 -0
novel_downloader/core/parsers/hetushu.py +139 -0
novel_downloader/core/parsers/i25zw.py +137 -0
novel_downloader/core/parsers/ixdzs8.py +186 -0
novel_downloader/core/parsers/jpxs123.py +137 -0
novel_downloader/core/parsers/lewenn.py +142 -0
novel_downloader/core/parsers/linovelib.py +48 -64
novel_downloader/core/parsers/piaotia.py +189 -0
novel_downloader/core/parsers/qbtr.py +136 -0
novel_downloader/core/parsers/qianbi.py +48 -50
novel_downloader/core/parsers/qidian/main_parser.py +756 -48
novel_downloader/core/parsers/qidian/utils/__init__.py +3 -21
novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
novel_downloader/core/parsers/qidian/utils/node_decryptor.py +4 -4
novel_downloader/core/parsers/quanben5.py +103 -0
novel_downloader/core/parsers/registry.py +5 -16
novel_downloader/core/parsers/sfacg.py +38 -45
novel_downloader/core/parsers/shencou.py +215 -0
novel_downloader/core/parsers/shuhaige.py +111 -0
novel_downloader/core/parsers/tongrenquan.py +116 -0
novel_downloader/core/parsers/ttkan.py +132 -0
novel_downloader/core/parsers/wanbengo.py +191 -0
novel_downloader/core/parsers/xiaoshuowu.py +173 -0
novel_downloader/core/parsers/xiguashuwu.py +429 -0
novel_downloader/core/parsers/xs63b.py +161 -0
novel_downloader/core/parsers/xshbook.py +134 -0
novel_downloader/core/parsers/yamibo.py +87 -131
novel_downloader/core/parsers/yibige.py +166 -0
novel_downloader/core/searchers/__init__.py +34 -3
novel_downloader/core/searchers/aaatxt.py +107 -0
novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
novel_downloader/core/searchers/base.py +112 -36
novel_downloader/core/searchers/dxmwx.py +105 -0
novel_downloader/core/searchers/eightnovel.py +84 -0
novel_downloader/core/searchers/esjzone.py +43 -25
novel_downloader/core/searchers/hetushu.py +92 -0
novel_downloader/core/searchers/i25zw.py +93 -0
novel_downloader/core/searchers/ixdzs8.py +107 -0
novel_downloader/core/searchers/jpxs123.py +107 -0
novel_downloader/core/searchers/piaotia.py +100 -0
novel_downloader/core/searchers/qbtr.py +106 -0
novel_downloader/core/searchers/qianbi.py +74 -40
novel_downloader/core/searchers/quanben5.py +144 -0
novel_downloader/core/searchers/registry.py +24 -8
novel_downloader/core/searchers/shuhaige.py +124 -0
novel_downloader/core/searchers/tongrenquan.py +110 -0
novel_downloader/core/searchers/ttkan.py +92 -0
novel_downloader/core/searchers/xiaoshuowu.py +122 -0
novel_downloader/core/searchers/xiguashuwu.py +95 -0
novel_downloader/core/searchers/xs63b.py +104 -0
novel_downloader/locales/en.json +34 -85
novel_downloader/locales/zh.json +35 -86
novel_downloader/models/__init__.py +21 -22
novel_downloader/models/book.py +44 -0
novel_downloader/models/config.py +4 -37
novel_downloader/models/login.py +1 -1
novel_downloader/models/search.py +5 -0
novel_downloader/resources/config/settings.toml +8 -70
novel_downloader/resources/json/xiguashuwu.json +718 -0
novel_downloader/utils/__init__.py +13 -24
novel_downloader/utils/chapter_storage.py +5 -5
novel_downloader/utils/constants.py +4 -31
novel_downloader/utils/cookies.py +38 -35
novel_downloader/utils/crypto_utils/__init__.py +7 -0
novel_downloader/utils/crypto_utils/aes_util.py +90 -0
novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
novel_downloader/utils/crypto_utils/rc4.py +54 -0
novel_downloader/utils/epub/__init__.py +3 -4
novel_downloader/utils/epub/builder.py +6 -6
novel_downloader/utils/epub/constants.py +62 -21
novel_downloader/utils/epub/documents.py +95 -201
novel_downloader/utils/epub/models.py +8 -22
novel_downloader/utils/epub/utils.py +73 -106
novel_downloader/utils/file_utils/__init__.py +2 -23
novel_downloader/utils/file_utils/io.py +53 -188
novel_downloader/utils/file_utils/normalize.py +1 -7
novel_downloader/utils/file_utils/sanitize.py +4 -15
novel_downloader/utils/fontocr/__init__.py +5 -14
novel_downloader/utils/fontocr/core.py +216 -0
novel_downloader/utils/fontocr/loader.py +50 -0
novel_downloader/utils/logger.py +81 -65
novel_downloader/utils/network.py +17 -41
novel_downloader/utils/state.py +4 -90
novel_downloader/utils/text_utils/__init__.py +1 -7
novel_downloader/utils/text_utils/diff_display.py +5 -7
novel_downloader/utils/text_utils/text_cleaner.py +39 -30
novel_downloader/utils/text_utils/truncate_utils.py +3 -14
novel_downloader/utils/time_utils/__init__.py +5 -11
novel_downloader/utils/time_utils/datetime_utils.py +20 -29
novel_downloader/utils/time_utils/sleep_utils.py +55 -49
novel_downloader/web/__init__.py +13 -0
novel_downloader/web/components/__init__.py +11 -0
novel_downloader/web/components/navigation.py +35 -0
novel_downloader/web/main.py +66 -0
novel_downloader/web/pages/__init__.py +17 -0
novel_downloader/web/pages/download.py +78 -0
novel_downloader/web/pages/progress.py +147 -0
novel_downloader/web/pages/search.py +329 -0
novel_downloader/web/services/__init__.py +17 -0
novel_downloader/web/services/client_dialog.py +164 -0
novel_downloader/web/services/cred_broker.py +113 -0
novel_downloader/web/services/cred_models.py +35 -0
novel_downloader/web/services/task_manager.py +264 -0
novel_downloader-2.0.1.dist-info/METADATA +172 -0
novel_downloader-2.0.1.dist-info/RECORD +206 -0
{novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/entry_points.txt +1 -1
novel_downloader/core/downloaders/biquge.py +0 -29
novel_downloader/core/downloaders/esjzone.py +0 -29
novel_downloader/core/downloaders/linovelib.py +0 -29
novel_downloader/core/downloaders/sfacg.py +0 -29
novel_downloader/core/downloaders/yamibo.py +0 -29
novel_downloader/core/exporters/biquge.py +0 -22
novel_downloader/core/exporters/esjzone.py +0 -22
novel_downloader/core/exporters/qianbi.py +0 -22
novel_downloader/core/exporters/sfacg.py +0 -22
novel_downloader/core/exporters/yamibo.py +0 -22
novel_downloader/core/fetchers/base/__init__.py +0 -14
novel_downloader/core/fetchers/base/browser.py +0 -422
novel_downloader/core/fetchers/biquge/__init__.py +0 -14
novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
novel_downloader/core/fetchers/esjzone/browser.py +0 -209
novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
novel_downloader/core/fetchers/linovelib/browser.py +0 -198
novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
novel_downloader/core/fetchers/qidian/__init__.py +0 -14
novel_downloader/core/fetchers/qidian/browser.py +0 -326
novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
novel_downloader/core/fetchers/sfacg/browser.py +0 -194
novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
novel_downloader/core/fetchers/yamibo/browser.py +0 -234
novel_downloader/core/parsers/biquge.py +0 -139
novel_downloader/core/parsers/qidian/book_info_parser.py +0 -90
novel_downloader/core/parsers/qidian/chapter_encrypted.py +0 -528
novel_downloader/core/parsers/qidian/chapter_normal.py +0 -157
novel_downloader/core/parsers/qidian/chapter_router.py +0 -68
novel_downloader/core/parsers/qidian/utils/helpers.py +0 -114
novel_downloader/models/chapter.py +0 -25
novel_downloader/models/types.py +0 -13
novel_downloader/tui/__init__.py +0 -7
novel_downloader/tui/app.py +0 -32
novel_downloader/tui/main.py +0 -17
novel_downloader/tui/screens/__init__.py +0 -14
novel_downloader/tui/screens/home.py +0 -198
novel_downloader/tui/screens/login.py +0 -74
novel_downloader/tui/styles/home_layout.tcss +0 -79
novel_downloader/tui/widgets/richlog_handler.py +0 -24
novel_downloader/utils/cache.py +0 -24
novel_downloader/utils/crypto_utils.py +0 -71
novel_downloader/utils/fontocr/hash_store.py +0 -280
novel_downloader/utils/fontocr/hash_utils.py +0 -103
novel_downloader/utils/fontocr/model_loader.py +0 -69
novel_downloader/utils/fontocr/ocr_v1.py +0 -315
novel_downloader/utils/fontocr/ocr_v2.py +0 -764
novel_downloader/utils/fontocr/ocr_v3.py +0 -744
novel_downloader-1.5.0.dist-info/METADATA +0 -196
novel_downloader-1.5.0.dist-info/RECORD +0 -164
{novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/WHEEL +0 -0
{novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/licenses/LICENSE +0 -0
{novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/top_level.txt +0 -0

novel_downloader/core/parsers/tongrenquan.py ADDED Viewed

@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.tongrenquan
+-----------------------------------------
+"""
+from typing import Any
+from lxml import html
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.core.parsers.registry import register_parser
+from novel_downloader.models import (
+    BookInfoDict,
+    ChapterDict,
+    ChapterInfoDict,
+    VolumeInfoDict,
+)
+@register_parser(
+    site_keys=["tongrenquan"],
+)
+class TongrenquanParser(BaseParser):
+    """
+    Parser for 同人圈 book pages.
+    """
+    BASE_URL = "https://www.tongrenquan.org"
+    def parse_book_info(
+        self,
+        html_list: list[str],
+        **kwargs: Any,
+    ) -> BookInfoDict | None:
+        if not html_list:
+            return None
+        tree = html.fromstring(html_list[0])
+        # Metadata
+        book_name = self._first_str(tree.xpath('//div[@class="infos"]/h1/text()'))
+        author = self._first_str(
+            tree.xpath('//div[@class="date"]/span/text()'),
+            replaces=[("作者：", "")],
+        )
+        cover_url = self.BASE_URL + self._first_str(
+            tree.xpath('//div[@class="pic"]//img/@src')
+        )
+        update_time = self._first_str(
+            tree.xpath('//div[@class="date"]/text()'),
+            replaces=[("日期：", "")],
+        )
+        # Summary (collapse text within the <p> tag)
+        paras = tree.xpath('//div[@class="infos"]/p//text()')
+        summary = "\n".join(p.strip() for p in paras if p.strip())
+        # Chapters extraction
+        chapters: list[ChapterInfoDict] = []
+        for a in tree.xpath('//div[contains(@class,"book_list")]//ul//li/a'):
+            url = a.get("href", "").strip()
+            title = a.text_content().strip()
+            # General pattern: /category/bookId/chapterId.html
+            # '/tongren/7562/462.html' -> '462'
+            chapter_id = url.rstrip(".html").split("/")[-1]
+            chapters.append({"title": title, "url": url, "chapterId": chapter_id})
+        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "update_time": update_time,
+            "tags": ["同人小说"],
+            "summary": summary,
+            "volumes": volumes,
+            "extra": {},
+        }
+    def parse_chapter(
+        self,
+        html_list: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        if not html_list:
+            return None
+        tree = html.fromstring(html_list[0])
+        raw_title = self._first_str(
+            tree.xpath('//div[contains(@class,"read_chapterName")]//h1/text()')
+        )
+        book_name = self._first_str(
+            tree.xpath('//div[contains(@class,"readTop")]//a[last()]/text()')
+        )
+        title = raw_title.replace(book_name, "").strip()
+        # Extract paragraphs of content
+        paras = tree.xpath('//div[contains(@class,"read_chapterDetail")]/p')
+        texts = [p.text_content().strip() for p in paras if p.text_content().strip()]
+        content = "\n".join(texts)
+        if not content:
+            return None
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {"site": "tongrenquan"},
+        }

novel_downloader/core/parsers/ttkan.py ADDED Viewed

@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.ttkan
+-----------------------------------
+"""
+from datetime import datetime
+from typing import Any
+from lxml import html
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.core.parsers.registry import register_parser
+from novel_downloader.models import (
+    BookInfoDict,
+    ChapterDict,
+    ChapterInfoDict,
+    VolumeInfoDict,
+)
+@register_parser(
+    site_keys=["ttkan"],
+)
+class TtkanParser(BaseParser):
+    """
+    Parser for 天天看小說 book pages.
+    """
+    def parse_book_info(
+        self,
+        html_list: list[str],
+        **kwargs: Any,
+    ) -> BookInfoDict | None:
+        if not html_list:
+            return None
+        tree = html.fromstring(html_list[0])
+        # Book metadata
+        book_name = self._first_str(
+            tree.xpath('//div[contains(@class,"novel_info")]//h1/text()')
+        )
+        author = self._first_str(
+            tree.xpath(
+                '//div[contains(@class,"novel_info")]//li[span/text()="作者："]/a/text()'
+            )
+        )
+        cover_url = self._first_str(
+            tree.xpath('//div[contains(@class,"novel_info")]//amp-img/@src')
+        )
+        serial_status = self._first_str(
+            tree.xpath(
+                '//div[contains(@class,"novel_info")]//span[contains(@class,"state_serial")]/text()'
+            )
+        )
+        update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        # Summary
+        summary_nodes = tree.xpath('//div[@class="description"]//p/text()')
+        summary = "".join(summary_nodes).strip()
+        # Single "正文" volume with all chapter links
+        chapters: list[ChapterInfoDict] = []
+        for a in tree.xpath('//div[@class="full_chapters"]/div[1]/a'):
+            url = a.get("href", "").strip()
+            title = a.text_content().strip()
+            # '/novel/pagea/wushenzhuzai-anmoshi_6094.html' -> '6094'
+            chap_id = url.rstrip(".html").split("_")[-1]
+            chapters.append(
+                {
+                    "chapterId": chap_id,
+                    "title": title,
+                    "url": url,
+                }
+            )
+        volumes: list[VolumeInfoDict] = [
+            {
+                "volume_name": "正文",
+                "chapters": chapters,
+            }
+        ]
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "update_time": update_time,
+            "serial_status": serial_status,
+            "summary": summary,
+            "volumes": volumes,
+            "extra": {},
+        }
+    def parse_chapter(
+        self,
+        html_list: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        if not html_list:
+            return None
+        tree = html.fromstring(html_list[0])
+        # Title
+        title_nodes = tree.xpath('//div[@class="title"]/h1/text()')
+        title = title_nodes[0].strip() if title_nodes else ""
+        # Content paragraphs under <div class="content">
+        paras = tree.xpath('//div[@class="content"]/p')
+        lines = []
+        for p in paras:
+            text = p.text_content().strip()
+            if text:
+                lines.append(text)
+        content = "\n".join(lines).strip()
+        if not content:
+            return None
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {"site": "ttkan"},
+        }

novel_downloader/core/parsers/wanbengo.py ADDED Viewed

@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.wanbengo
+--------------------------------------
+"""
+import re
+from datetime import datetime
+from html import unescape
+from typing import Any
+from urllib.parse import urljoin
+from lxml import html
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.core.parsers.registry import register_parser
+from novel_downloader.models import (
+    BookInfoDict,
+    ChapterDict,
+    ChapterInfoDict,
+    VolumeInfoDict,
+)
+@register_parser(
+    site_keys=["wanbengo"],
+)
+class WanbengoParser(BaseParser):
+    """
+    Parser for 完本神站 book pages.
+    """
+    BASE = "https://www.wanbengo.com"
+    # XPaths for the book info page
+    X_BOOK_NAME = "//div[@class='detailTopMid']//h1/text()"
+    X_AUTHOR = "//div[@class='detailTopMid']//div[@class='writer']//a/text()"
+    X_COVER = "//div[@class='detailTopLeft']//img/@src"
+    X_STATUS = "//div[@class='detailTopLeft']//span[contains(@class,'end')]/text()"
+    X_WORDS = "//div[@class='detailTopMid']//table//tr[td/span[contains(text(),'字数')]]/td[last()]/text()"  # noqa: E501
+    X_SUMMARY = "//div[@class='detailTopMid']//table//tr[td/span[contains(text(),'简介')]]/td[last()]//text()"  # noqa: E501
+    X_TAG = "//div[@class='route']/a[2]//text()"
+    X_UPDATE_TXT = "//div[@class='chapterTitle']//span//text()"
+    X_CHAPTERS = "//div[@class='chapter']//ul//li/a"
+    # XPaths for the chapter page
+    X_CHAP_TITLE = "//div[contains(@class,'readerTitle')]//h2/text()"
+    _CHAP_SPLIT_RE = re.compile(r"(?:</p\s*>|<p\b[^>]*>|<br\s*/?>)", re.I)
+    _CHAP_READERCON_RE = re.compile(
+        r'<div[^>]*class=(?:"[^"]*readerCon[^"]*"|\'[^\']*readerCon[^\']*\')[^>]*>(.*?)</div>',
+        re.I | re.S,
+    )
+    _TAGS_RE = re.compile(r"<[^>]+>")
+    _SCRUB_RUNS_RE = re.compile(r"[_?]{2,}")
+    _SCRUB_TAIL_RE = re.compile(r"\s*（未完待续.*?$")
+    # fmt: off
+    ADS = {
+        "完本神站", "本站网址", "报错", "键盘", "客户端", "收藏", "书架",
+        "猜你喜欢", "上一章", "下一章", "章节目录", "LastRead", "贴吧",
+        "倾心打造", "全文无错", "分享本站", "点此章节报错", "温馨提示", "域名",
+        "wanbentxt.com", "wanbengo.com",
+    }
+    # fmt: on
+    _PUNCT_ONLY = re.compile(
+        r"^[\s\W_·—\-･。，、；;：:！!？?\(\)（）【】《》“”\"'…·]+$"
+    )  # noqa: E501
+    def parse_book_info(
+        self,
+        html_list: list[str],
+        **kwargs: Any,
+    ) -> BookInfoDict | None:
+        if not html_list:
+            return None
+        tree = html.fromstring(html_list[0])
+        book_name = self._first_str(tree.xpath(self.X_BOOK_NAME))
+        author = self._first_str(tree.xpath(self.X_AUTHOR))
+        cover_url = self._first_str(tree.xpath(self.X_COVER))
+        serial_status = (
+            self._norm_space(self._first_str(tree.xpath(self.X_STATUS))) or "连载中"
+        )
+        word_count = self._norm_space("".join(tree.xpath(self.X_WORDS)))
+        summary = self._norm_space("".join(tree.xpath(self.X_SUMMARY)))
+        book_type = self._norm_space("".join(tree.xpath(self.X_TAG)))
+        tags = [book_type] if book_type else []
+        update_time = self._extract_update_date(tree.xpath(self.X_UPDATE_TXT))
+        chapters: list[ChapterInfoDict] = []
+        for a in tree.xpath(self.X_CHAPTERS):
+            title = self._norm_space("".join(a.xpath(".//text()")))
+            href = a.get("href") or ""
+            url = urljoin(self.BASE, href)
+            # "/129/103950.html" -> "103950"
+            cid = url.rstrip(".html").split("/")[-1]
+            chapters.append({"title": title, "url": url, "chapterId": cid})
+        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "update_time": update_time,
+            "word_count": word_count,
+            "summary": summary,
+            "tags": tags,
+            "volumes": volumes,
+            "serial_status": serial_status,
+            "extra": {},
+        }
+    def parse_chapter(
+        self,
+        html_list: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        if not html_list:
+            return None
+        inner = self._CHAP_READERCON_RE.search(html_list[0])
+        if not inner:
+            return None
+        tree = html.fromstring(html_list[0])
+        title = self._first_str(tree.xpath(self.X_CHAP_TITLE))
+        parts = self._CHAP_SPLIT_RE.split(inner.group(1))
+        lines: list[str] = []
+        for part in parts:
+            if not part:
+                continue
+            s = self._TAGS_RE.sub("", part)
+            s = unescape(s).replace("\xa0", " ")
+            if self._is_noise_line(s):
+                continue
+            s = self._norm_space(self._scrub_ascii_gibberish(s.strip()))
+            if s:
+                lines.append(s)
+        content = "\n".join(lines)
+        if not content:
+            return None
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {"site": "wanbengo"},
+        }
+    @staticmethod
+    def _extract_update_date(texts: list[str]) -> str:
+        """
+        Find a YYYY-MM-DD anywhere in the provided text nodes.
+        If none found, return today's date.
+        """
+        joined = " ".join(t for t in texts if t)
+        m = re.search(r"\b(\d{4}-\d{2}-\d{2})\b", joined)
+        if m:
+            return m.group(1)
+        return datetime.now().strftime("%Y-%m-%d")
+    def _is_noise_line(self, s: str) -> bool:
+        """Heuristic to drop obvious ad/footer/noise lines."""
+        if not s.strip():
+            return True
+        if self._is_ad_line(s):
+            return True
+        if self._PUNCT_ONLY.match(s):
+            return True
+        return False
+    @classmethod
+    def _scrub_ascii_gibberish(cls, s: str) -> str:
+        """
+        Remove common injected ASCII junk like long runs of '?' or '_'
+        while keeping normal text intact.
+        """
+        s = s.replace("()?()", "").replace("[(．)]", "")
+        s = s.replace("．", ".")
+        s = cls._SCRUB_RUNS_RE.sub("", s)  # drop runs like ???? or ____
+        s = cls._SCRUB_TAIL_RE.sub("", s)
+        return s.strip()

novel_downloader/core/parsers/xiaoshuowu.py ADDED Viewed

@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.xiaoshuowu
+----------------------------------------
+"""
+from typing import Any
+from lxml import html
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.core.parsers.registry import register_parser
+from novel_downloader.models import (
+    BookInfoDict,
+    ChapterDict,
+    ChapterInfoDict,
+    VolumeInfoDict,
+)
+@register_parser(
+    site_keys=["xiaoshuowu", "xiaoshuoge"],
+)
+class XiaoshuowuParser(BaseParser):
+    """
+    Parser for 小说屋 (xiaoshuoge.info).
+    """
+    AD_STR: str = "小说屋 www.xiaoshuoge.info"
+    def parse_book_info(
+        self,
+        html_list: list[str],
+        **kwargs: Any,
+    ) -> BookInfoDict | None:
+        if len(html_list) < 2:
+            return None
+        # Parse trees
+        info_tree = html.fromstring(html_list[0])
+        catalog_tree = html.fromstring(html_list[1])
+        book_name = self._first_str(
+            info_tree.xpath('//meta[@property="og:novel:book_name"]/@content')
+        )
+        author = self._first_str(
+            info_tree.xpath('//meta[@property="og:novel:author"]/@content')
+        )
+        # Category -> tags
+        cat_val = self._first_str(
+            info_tree.xpath('//meta[@property="og:novel:category"]/@content')
+        )
+        tags = [cat_val] if cat_val else []
+        word_count = self._first_str(
+            info_tree.xpath(
+                '//table[@class="hide"]//td[contains(text(),"全文字数")]/text()'
+            ),
+            replaces=[("全文字数：", "")],
+        )
+        update_time = self._first_str(
+            info_tree.xpath(
+                '//table[@class="hide"]//td[contains(text(),"最后更新")]/text()'
+            ),
+            replaces=[("最后更新：", "")],
+        )
+        serial_status = self._first_str(
+            info_tree.xpath(
+                '//table[@class="hide"]//td[contains(text(),"连载状态")]/text()'
+            ),
+            replaces=[("连载状态：", "")],
+        )
+        cover_url = self._first_str(
+            info_tree.xpath('//meta[@property="og:image"]/@content')
+        )
+        # Summary
+        summary_div = info_tree.xpath('//div[@class="tabvalue"][1]//div')
+        summary: str = summary_div[0].text_content().strip() if summary_div else ""
+        # Chapters (single volume)
+        chapters: list[ChapterInfoDict] = []
+        chapter_links = catalog_tree.xpath(
+            '//ul[contains(@class,"chapters")]//li[contains(@class,"chapter")]/a'
+        )
+        for a in chapter_links:
+            url = a.get("href", "").strip()
+            title = a.text_content().strip()
+            # chapterId is the numeric filename before ".html"
+            chapter_id = url.rsplit("/", 1)[-1].split(".")[0]
+            chapters.append({"title": title, "url": url, "chapterId": chapter_id})
+        # Single volume
+        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "update_time": update_time,
+            "word_count": word_count,
+            "serial_status": serial_status,
+            "tags": tags,
+            "summary": summary,
+            "volumes": volumes,
+            "extra": {},
+        }
+    def parse_chapter(
+        self,
+        html_list: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        if not html_list:
+            return None
+        doc = html.fromstring(html_list[0])
+        # main container
+        content_divs = doc.xpath('//div[@id="acontent"]')
+        if not content_divs:
+            return None
+        container = content_divs[0]
+        # Get the <h1> title
+        title_elem = container.find("h1")
+        title = title_elem.text_content().strip() if title_elem is not None else ""
+        paras: list[str] = []
+        started = False
+        for node in container.xpath("./*"):
+            # anchor: first <div id="content_tip">
+            if node.tag == "div" and node.get("id") == "content_tip":
+                raw = node.tail or ""
+                # drop any "(小说屋 ...)" prefix before the real text
+                if ")" in raw:
+                    raw = raw.split(")", 1)[1]
+                first_line = raw.lstrip("\ufeff").strip()
+                if first_line:
+                    paras.append(first_line)
+                started = True
+                continue
+            if not started:
+                continue
+            # stop collecting once we hit any div
+            cls_name = node.get("class") or ""
+            if node.tag == "div" and any(
+                k in cls_name for k in ("tishi", "footlink", "fullbar")
+            ):
+                break
+            # grab each <br/> tail as a paragraph
+            if node.tag == "br":
+                line = (node.tail or "").strip()
+                if not line or self.AD_STR in line:
+                    continue
+                paras.append(line)
+        if not paras:
+            return None
+        content = "\n".join(paras)
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {"site": "xiaoshuowu"},
+        }

novel-downloader 1.5.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

novel-downloader 1.5.0__py3-none-any.whl → 2.0.1__py3-none-any.whl