novel-downloader 1.4.5__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276) hide show
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +2 -4
  3. novel_downloader/cli/clean.py +21 -88
  4. novel_downloader/cli/config.py +27 -104
  5. novel_downloader/cli/download.py +78 -66
  6. novel_downloader/cli/export.py +20 -21
  7. novel_downloader/cli/main.py +3 -1
  8. novel_downloader/cli/search.py +120 -0
  9. novel_downloader/cli/ui.py +156 -0
  10. novel_downloader/config/__init__.py +10 -14
  11. novel_downloader/config/adapter.py +195 -99
  12. novel_downloader/config/{loader.py → file_io.py} +53 -27
  13. novel_downloader/core/__init__.py +14 -13
  14. novel_downloader/core/archived/deqixs/fetcher.py +115 -0
  15. novel_downloader/core/archived/deqixs/parser.py +132 -0
  16. novel_downloader/core/archived/deqixs/searcher.py +89 -0
  17. novel_downloader/core/archived/qidian/searcher.py +79 -0
  18. novel_downloader/core/archived/wanbengo/searcher.py +98 -0
  19. novel_downloader/core/archived/xshbook/searcher.py +93 -0
  20. novel_downloader/core/downloaders/__init__.py +8 -30
  21. novel_downloader/core/downloaders/base.py +182 -30
  22. novel_downloader/core/downloaders/common.py +217 -384
  23. novel_downloader/core/downloaders/qianbi.py +332 -4
  24. novel_downloader/core/downloaders/qidian.py +250 -290
  25. novel_downloader/core/downloaders/registry.py +69 -0
  26. novel_downloader/core/downloaders/signals.py +46 -0
  27. novel_downloader/core/exporters/__init__.py +8 -26
  28. novel_downloader/core/exporters/base.py +107 -31
  29. novel_downloader/core/exporters/common/__init__.py +3 -4
  30. novel_downloader/core/exporters/common/epub.py +92 -171
  31. novel_downloader/core/exporters/common/main_exporter.py +14 -67
  32. novel_downloader/core/exporters/common/txt.py +90 -86
  33. novel_downloader/core/exporters/epub_util.py +184 -1327
  34. novel_downloader/core/exporters/linovelib/__init__.py +3 -2
  35. novel_downloader/core/exporters/linovelib/epub.py +165 -222
  36. novel_downloader/core/exporters/linovelib/main_exporter.py +10 -71
  37. novel_downloader/core/exporters/linovelib/txt.py +76 -66
  38. novel_downloader/core/exporters/qidian.py +15 -11
  39. novel_downloader/core/exporters/registry.py +55 -0
  40. novel_downloader/core/exporters/txt_util.py +67 -0
  41. novel_downloader/core/fetchers/__init__.py +57 -56
  42. novel_downloader/core/fetchers/aaatxt.py +83 -0
  43. novel_downloader/core/fetchers/{biquge/session.py → b520.py} +10 -10
  44. novel_downloader/core/fetchers/{base/session.py → base.py} +63 -47
  45. novel_downloader/core/fetchers/biquyuedu.py +83 -0
  46. novel_downloader/core/fetchers/dxmwx.py +110 -0
  47. novel_downloader/core/fetchers/eightnovel.py +139 -0
  48. novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +23 -11
  49. novel_downloader/core/fetchers/guidaye.py +85 -0
  50. novel_downloader/core/fetchers/hetushu.py +92 -0
  51. novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +22 -26
  52. novel_downloader/core/fetchers/ixdzs8.py +113 -0
  53. novel_downloader/core/fetchers/jpxs123.py +101 -0
  54. novel_downloader/core/fetchers/{biquge/browser.py → lewenn.py} +15 -15
  55. novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +16 -12
  56. novel_downloader/core/fetchers/piaotia.py +105 -0
  57. novel_downloader/core/fetchers/qbtr.py +101 -0
  58. novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +9 -9
  59. novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +55 -40
  60. novel_downloader/core/fetchers/quanben5.py +92 -0
  61. novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
  62. novel_downloader/core/fetchers/registry.py +60 -0
  63. novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +11 -9
  64. novel_downloader/core/fetchers/shencou.py +106 -0
  65. novel_downloader/core/fetchers/{common/browser.py → shuhaige.py} +24 -19
  66. novel_downloader/core/fetchers/tongrenquan.py +84 -0
  67. novel_downloader/core/fetchers/ttkan.py +95 -0
  68. novel_downloader/core/fetchers/{common/session.py → wanbengo.py} +21 -17
  69. novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
  70. novel_downloader/core/fetchers/xiguashuwu.py +177 -0
  71. novel_downloader/core/fetchers/xs63b.py +171 -0
  72. novel_downloader/core/fetchers/xshbook.py +85 -0
  73. novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +23 -11
  74. novel_downloader/core/fetchers/yibige.py +114 -0
  75. novel_downloader/core/interfaces/__init__.py +8 -14
  76. novel_downloader/core/interfaces/downloader.py +6 -2
  77. novel_downloader/core/interfaces/exporter.py +7 -7
  78. novel_downloader/core/interfaces/fetcher.py +4 -17
  79. novel_downloader/core/interfaces/parser.py +5 -6
  80. novel_downloader/core/interfaces/searcher.py +26 -0
  81. novel_downloader/core/parsers/__init__.py +58 -22
  82. novel_downloader/core/parsers/aaatxt.py +132 -0
  83. novel_downloader/core/parsers/b520.py +116 -0
  84. novel_downloader/core/parsers/base.py +63 -12
  85. novel_downloader/core/parsers/biquyuedu.py +133 -0
  86. novel_downloader/core/parsers/dxmwx.py +162 -0
  87. novel_downloader/core/parsers/eightnovel.py +224 -0
  88. novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +67 -67
  89. novel_downloader/core/parsers/guidaye.py +128 -0
  90. novel_downloader/core/parsers/hetushu.py +139 -0
  91. novel_downloader/core/parsers/i25zw.py +137 -0
  92. novel_downloader/core/parsers/ixdzs8.py +186 -0
  93. novel_downloader/core/parsers/jpxs123.py +137 -0
  94. novel_downloader/core/parsers/lewenn.py +142 -0
  95. novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +54 -65
  96. novel_downloader/core/parsers/piaotia.py +189 -0
  97. novel_downloader/core/parsers/qbtr.py +136 -0
  98. novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +54 -51
  99. novel_downloader/core/parsers/qidian/__init__.py +2 -2
  100. novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
  101. novel_downloader/core/parsers/qidian/chapter_encrypted.py +290 -346
  102. novel_downloader/core/parsers/qidian/chapter_normal.py +25 -56
  103. novel_downloader/core/parsers/qidian/main_parser.py +19 -57
  104. novel_downloader/core/parsers/qidian/utils/__init__.py +12 -11
  105. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +6 -7
  106. novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
  107. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
  108. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
  109. novel_downloader/core/parsers/quanben5.py +103 -0
  110. novel_downloader/core/parsers/registry.py +57 -0
  111. novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +46 -48
  112. novel_downloader/core/parsers/shencou.py +215 -0
  113. novel_downloader/core/parsers/shuhaige.py +111 -0
  114. novel_downloader/core/parsers/tongrenquan.py +116 -0
  115. novel_downloader/core/parsers/ttkan.py +132 -0
  116. novel_downloader/core/parsers/wanbengo.py +191 -0
  117. novel_downloader/core/parsers/xiaoshuowu.py +173 -0
  118. novel_downloader/core/parsers/xiguashuwu.py +435 -0
  119. novel_downloader/core/parsers/xs63b.py +161 -0
  120. novel_downloader/core/parsers/xshbook.py +134 -0
  121. novel_downloader/core/parsers/yamibo.py +155 -0
  122. novel_downloader/core/parsers/yibige.py +166 -0
  123. novel_downloader/core/searchers/__init__.py +51 -0
  124. novel_downloader/core/searchers/aaatxt.py +107 -0
  125. novel_downloader/core/searchers/b520.py +84 -0
  126. novel_downloader/core/searchers/base.py +168 -0
  127. novel_downloader/core/searchers/dxmwx.py +105 -0
  128. novel_downloader/core/searchers/eightnovel.py +84 -0
  129. novel_downloader/core/searchers/esjzone.py +102 -0
  130. novel_downloader/core/searchers/hetushu.py +92 -0
  131. novel_downloader/core/searchers/i25zw.py +93 -0
  132. novel_downloader/core/searchers/ixdzs8.py +107 -0
  133. novel_downloader/core/searchers/jpxs123.py +107 -0
  134. novel_downloader/core/searchers/piaotia.py +100 -0
  135. novel_downloader/core/searchers/qbtr.py +106 -0
  136. novel_downloader/core/searchers/qianbi.py +165 -0
  137. novel_downloader/core/searchers/quanben5.py +144 -0
  138. novel_downloader/core/searchers/registry.py +79 -0
  139. novel_downloader/core/searchers/shuhaige.py +124 -0
  140. novel_downloader/core/searchers/tongrenquan.py +110 -0
  141. novel_downloader/core/searchers/ttkan.py +92 -0
  142. novel_downloader/core/searchers/xiaoshuowu.py +122 -0
  143. novel_downloader/core/searchers/xiguashuwu.py +95 -0
  144. novel_downloader/core/searchers/xs63b.py +104 -0
  145. novel_downloader/locales/en.json +36 -79
  146. novel_downloader/locales/zh.json +37 -80
  147. novel_downloader/models/__init__.py +23 -50
  148. novel_downloader/models/book.py +44 -0
  149. novel_downloader/models/config.py +16 -43
  150. novel_downloader/models/login.py +1 -1
  151. novel_downloader/models/search.py +21 -0
  152. novel_downloader/resources/config/settings.toml +39 -74
  153. novel_downloader/resources/css_styles/intro.css +83 -0
  154. novel_downloader/resources/css_styles/main.css +30 -89
  155. novel_downloader/resources/json/xiguashuwu.json +718 -0
  156. novel_downloader/utils/__init__.py +43 -0
  157. novel_downloader/utils/chapter_storage.py +247 -226
  158. novel_downloader/utils/constants.py +5 -50
  159. novel_downloader/utils/cookies.py +6 -18
  160. novel_downloader/utils/crypto_utils/__init__.py +13 -0
  161. novel_downloader/utils/crypto_utils/aes_util.py +90 -0
  162. novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
  163. novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
  164. novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
  165. novel_downloader/utils/epub/__init__.py +34 -0
  166. novel_downloader/utils/epub/builder.py +377 -0
  167. novel_downloader/utils/epub/constants.py +118 -0
  168. novel_downloader/utils/epub/documents.py +297 -0
  169. novel_downloader/utils/epub/models.py +120 -0
  170. novel_downloader/utils/epub/utils.py +179 -0
  171. novel_downloader/utils/file_utils/__init__.py +5 -30
  172. novel_downloader/utils/file_utils/io.py +9 -150
  173. novel_downloader/utils/file_utils/normalize.py +2 -2
  174. novel_downloader/utils/file_utils/sanitize.py +2 -7
  175. novel_downloader/utils/fontocr.py +207 -0
  176. novel_downloader/utils/i18n.py +2 -0
  177. novel_downloader/utils/logger.py +10 -16
  178. novel_downloader/utils/network.py +111 -252
  179. novel_downloader/utils/state.py +5 -90
  180. novel_downloader/utils/text_utils/__init__.py +16 -21
  181. novel_downloader/utils/text_utils/diff_display.py +6 -9
  182. novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
  183. novel_downloader/utils/text_utils/text_cleaner.py +179 -0
  184. novel_downloader/utils/text_utils/truncate_utils.py +62 -0
  185. novel_downloader/utils/time_utils/__init__.py +6 -12
  186. novel_downloader/utils/time_utils/datetime_utils.py +23 -33
  187. novel_downloader/utils/time_utils/sleep_utils.py +5 -10
  188. novel_downloader/web/__init__.py +13 -0
  189. novel_downloader/web/components/__init__.py +11 -0
  190. novel_downloader/web/components/navigation.py +35 -0
  191. novel_downloader/web/main.py +66 -0
  192. novel_downloader/web/pages/__init__.py +17 -0
  193. novel_downloader/web/pages/download.py +78 -0
  194. novel_downloader/web/pages/progress.py +147 -0
  195. novel_downloader/web/pages/search.py +329 -0
  196. novel_downloader/web/services/__init__.py +17 -0
  197. novel_downloader/web/services/client_dialog.py +164 -0
  198. novel_downloader/web/services/cred_broker.py +113 -0
  199. novel_downloader/web/services/cred_models.py +35 -0
  200. novel_downloader/web/services/task_manager.py +264 -0
  201. novel_downloader-2.0.0.dist-info/METADATA +171 -0
  202. novel_downloader-2.0.0.dist-info/RECORD +210 -0
  203. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
  204. novel_downloader/config/site_rules.py +0 -94
  205. novel_downloader/core/downloaders/biquge.py +0 -25
  206. novel_downloader/core/downloaders/esjzone.py +0 -25
  207. novel_downloader/core/downloaders/linovelib.py +0 -25
  208. novel_downloader/core/downloaders/sfacg.py +0 -25
  209. novel_downloader/core/downloaders/yamibo.py +0 -25
  210. novel_downloader/core/exporters/biquge.py +0 -25
  211. novel_downloader/core/exporters/esjzone.py +0 -25
  212. novel_downloader/core/exporters/qianbi.py +0 -25
  213. novel_downloader/core/exporters/sfacg.py +0 -25
  214. novel_downloader/core/exporters/yamibo.py +0 -25
  215. novel_downloader/core/factory/__init__.py +0 -20
  216. novel_downloader/core/factory/downloader.py +0 -73
  217. novel_downloader/core/factory/exporter.py +0 -58
  218. novel_downloader/core/factory/fetcher.py +0 -96
  219. novel_downloader/core/factory/parser.py +0 -86
  220. novel_downloader/core/fetchers/base/__init__.py +0 -14
  221. novel_downloader/core/fetchers/base/browser.py +0 -403
  222. novel_downloader/core/fetchers/biquge/__init__.py +0 -14
  223. novel_downloader/core/fetchers/common/__init__.py +0 -14
  224. novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
  225. novel_downloader/core/fetchers/esjzone/browser.py +0 -204
  226. novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
  227. novel_downloader/core/fetchers/linovelib/browser.py +0 -193
  228. novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
  229. novel_downloader/core/fetchers/qidian/__init__.py +0 -14
  230. novel_downloader/core/fetchers/qidian/browser.py +0 -318
  231. novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
  232. novel_downloader/core/fetchers/sfacg/browser.py +0 -189
  233. novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
  234. novel_downloader/core/fetchers/yamibo/browser.py +0 -229
  235. novel_downloader/core/parsers/biquge/__init__.py +0 -10
  236. novel_downloader/core/parsers/biquge/main_parser.py +0 -134
  237. novel_downloader/core/parsers/common/__init__.py +0 -13
  238. novel_downloader/core/parsers/common/helper.py +0 -323
  239. novel_downloader/core/parsers/common/main_parser.py +0 -106
  240. novel_downloader/core/parsers/esjzone/__init__.py +0 -10
  241. novel_downloader/core/parsers/linovelib/__init__.py +0 -10
  242. novel_downloader/core/parsers/qianbi/__init__.py +0 -10
  243. novel_downloader/core/parsers/sfacg/__init__.py +0 -10
  244. novel_downloader/core/parsers/yamibo/__init__.py +0 -10
  245. novel_downloader/core/parsers/yamibo/main_parser.py +0 -194
  246. novel_downloader/models/browser.py +0 -21
  247. novel_downloader/models/chapter.py +0 -25
  248. novel_downloader/models/site_rules.py +0 -99
  249. novel_downloader/models/tasks.py +0 -33
  250. novel_downloader/models/types.py +0 -15
  251. novel_downloader/resources/css_styles/volume-intro.css +0 -56
  252. novel_downloader/resources/json/replace_word_map.json +0 -4
  253. novel_downloader/resources/text/blacklist.txt +0 -22
  254. novel_downloader/tui/__init__.py +0 -7
  255. novel_downloader/tui/app.py +0 -32
  256. novel_downloader/tui/main.py +0 -17
  257. novel_downloader/tui/screens/__init__.py +0 -14
  258. novel_downloader/tui/screens/home.py +0 -198
  259. novel_downloader/tui/screens/login.py +0 -74
  260. novel_downloader/tui/styles/home_layout.tcss +0 -79
  261. novel_downloader/tui/widgets/richlog_handler.py +0 -24
  262. novel_downloader/utils/cache.py +0 -24
  263. novel_downloader/utils/fontocr/__init__.py +0 -22
  264. novel_downloader/utils/fontocr/model_loader.py +0 -69
  265. novel_downloader/utils/fontocr/ocr_v1.py +0 -303
  266. novel_downloader/utils/fontocr/ocr_v2.py +0 -752
  267. novel_downloader/utils/hash_store.py +0 -279
  268. novel_downloader/utils/hash_utils.py +0 -103
  269. novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
  270. novel_downloader/utils/text_utils/font_mapping.py +0 -28
  271. novel_downloader/utils/text_utils/text_cleaning.py +0 -107
  272. novel_downloader-1.4.5.dist-info/METADATA +0 -196
  273. novel_downloader-1.4.5.dist-info/RECORD +0 -165
  274. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
  275. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
  276. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
#!/usr/bin/env python3
"""
novel_downloader.core.parsers.ttkan
-----------------------------------

Parser for 天天看小說 (ttkan) book-info and chapter pages.
"""

from datetime import datetime
from typing import Any

from lxml import html

from novel_downloader.core.parsers.base import BaseParser
from novel_downloader.core.parsers.registry import register_parser
from novel_downloader.models import (
    BookInfoDict,
    ChapterDict,
    ChapterInfoDict,
    VolumeInfoDict,
)


@register_parser(
    site_keys=["ttkan"],
)
class TtkanParser(BaseParser):
    """
    Parser for 天天看小說 book pages.
    """

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse a book-info page into structured metadata.

        :param html_list: Raw HTML pages; only the first entry is used.
        :return: The parsed ``BookInfoDict``, or ``None`` if ``html_list``
            is empty.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        # Book metadata
        book_name = self._first_str(
            tree.xpath('//div[contains(@class,"novel_info")]//h1/text()')
        )

        author = self._first_str(
            tree.xpath(
                '//div[contains(@class,"novel_info")]//li[span/text()="作者:"]/a/text()'
            )
        )

        cover_url = self._first_str(
            tree.xpath('//div[contains(@class,"novel_info")]//amp-img/@src')
        )

        serial_status = self._first_str(
            tree.xpath(
                '//div[contains(@class,"novel_info")]//span[contains(@class,"state_serial")]/text()'
            )
        )

        # The page exposes no reliable update timestamp, so use "now".
        update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Summary
        summary_nodes = tree.xpath('//div[@class="description"]//p/text()')
        summary = "".join(summary_nodes).strip()

        # Single "正文" volume with all chapter links
        chapters: list[ChapterInfoDict] = []
        for a in tree.xpath('//div[@class="full_chapters"]/div[1]/a'):
            url = a.get("href", "").strip()
            title = a.text_content().strip()
            # '/novel/pagea/wushenzhuzai-anmoshi_6094.html' -> '6094'
            # FIX: use removesuffix, not rstrip(".html") -- rstrip strips a
            # *character set* and would corrupt ids ending in h/t/m/l/'.'.
            chap_id = url.removesuffix(".html").split("_")[-1]
            chapters.append(
                {
                    "chapterId": chap_id,
                    "title": title,
                    "url": url,
                }
            )

        volumes: list[VolumeInfoDict] = [
            {
                "volume_name": "正文",
                "chapters": chapters,
            }
        ]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "serial_status": serial_status,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter page into title + cleaned text content.

        :param html_list: Raw HTML pages; only the first entry is used.
        :param chapter_id: Identifier recorded in the returned dict.
        :return: The parsed ``ChapterDict``, or ``None`` when the input is
            empty or no non-blank paragraph text is found.
        """
        if not html_list:
            return None
        tree = html.fromstring(html_list[0])

        # Title
        title_nodes = tree.xpath('//div[@class="title"]/h1/text()')
        title = title_nodes[0].strip() if title_nodes else ""

        # Content paragraphs under <div class="content">
        paras = tree.xpath('//div[@class="content"]/p')
        lines = []
        for p in paras:
            text = p.text_content().strip()
            if text:
                lines.append(text)

        content = "\n".join(lines).strip()
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "ttkan"},
        }
#!/usr/bin/env python3
"""
novel_downloader.core.parsers.wanbengo
--------------------------------------

Parser for 完本神站 (wanbengo.com) book-info and chapter pages.
"""

import re
from datetime import datetime
from html import unescape
from typing import Any
from urllib.parse import urljoin

from lxml import html

from novel_downloader.core.parsers.base import BaseParser
from novel_downloader.core.parsers.registry import register_parser
from novel_downloader.models import (
    BookInfoDict,
    ChapterDict,
    ChapterInfoDict,
    VolumeInfoDict,
)


@register_parser(
    site_keys=["wanbengo"],
)
class WanbengoParser(BaseParser):
    """
    Parser for 完本神站 book pages.
    """

    # Base URL used to absolutize relative chapter links.
    BASE = "https://www.wanbengo.com"

    # XPaths for the book info page
    X_BOOK_NAME = "//div[@class='detailTopMid']//h1/text()"
    X_AUTHOR = "//div[@class='detailTopMid']//div[@class='writer']//a/text()"
    X_COVER = "//div[@class='detailTopLeft']//img/@src"
    X_STATUS = "//div[@class='detailTopLeft']//span[contains(@class,'end')]/text()"
    X_WORDS = "//div[@class='detailTopMid']//table//tr[td/span[contains(text(),'字数')]]/td[last()]/text()"  # noqa: E501
    X_SUMMARY = "//div[@class='detailTopMid']//table//tr[td/span[contains(text(),'简介')]]/td[last()]//text()"  # noqa: E501
    X_TAG = "//div[@class='route']/a[2]//text()"
    X_UPDATE_TXT = "//div[@class='chapterTitle']//span//text()"
    X_CHAPTERS = "//div[@class='chapter']//ul//li/a"

    # XPaths / regexes for the chapter page
    X_CHAP_TITLE = "//div[contains(@class,'readerTitle')]//h2/text()"
    # Split chapter HTML into paragraph candidates on <p>/</p>/<br> tags.
    _CHAP_SPLIT_RE = re.compile(r"(?:</p\s*>|<p\b[^>]*>|<br\s*/?>)", re.I)
    # Capture the inner HTML of the <div class="...readerCon..."> container.
    _CHAP_READERCON_RE = re.compile(
        r'<div[^>]*class=(?:"[^"]*readerCon[^"]*"|\'[^\']*readerCon[^\']*\')[^>]*>(.*?)</div>',
        re.I | re.S,
    )
    _TAGS_RE = re.compile(r"<[^>]+>")
    # Runs of injected '?' / '_' filler characters.
    _SCRUB_RUNS_RE = re.compile(r"[_?]{2,}")
    # Trailing "未完待续..." boilerplate at the end of a line.
    # FIX: the previous pattern r"\s*(未完待续.*?$" had an unbalanced '('
    # and raised re.error when the class body was executed at import time.
    _SCRUB_TAIL_RE = re.compile(r"\s*(未完待续.*?)$")

    # fmt: off
    # Substrings that mark a line as site boilerplate / advertising.
    ADS = {
        "完本神站", "本站网址", "报错", "键盘", "客户端", "收藏", "书架",
        "猜你喜欢", "上一章", "下一章", "章节目录", "LastRead", "贴吧",
        "倾心打造", "全文无错", "分享本站", "点此章节报错", "温馨提示", "域名",
        "wanbentxt.com", "wanbengo.com",
    }
    # fmt: on
    # Matches a line consisting solely of whitespace/punctuation.
    _PUNCT_ONLY = re.compile(
        r"^[\s\W_·—\-・。,、;;::!!??\(\)()【】《》“”\"'…·]+$"
    )  # noqa: E501

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse a book-info page into structured metadata.

        :param html_list: Raw HTML pages; only the first entry is used.
        :return: The parsed ``BookInfoDict``, or ``None`` if ``html_list``
            is empty.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        book_name = self._first_str(tree.xpath(self.X_BOOK_NAME))
        author = self._first_str(tree.xpath(self.X_AUTHOR))
        cover_url = self._first_str(tree.xpath(self.X_COVER))
        # Default to "连载中" (ongoing) when no status badge is present.
        serial_status = (
            self._norm_space(self._first_str(tree.xpath(self.X_STATUS))) or "连载中"
        )
        word_count = self._norm_space("".join(tree.xpath(self.X_WORDS)))
        summary = self._norm_space("".join(tree.xpath(self.X_SUMMARY)))

        # The breadcrumb category doubles as the single tag.
        book_type = self._norm_space("".join(tree.xpath(self.X_TAG)))
        tags = [book_type] if book_type else []

        update_time = self._extract_update_date(tree.xpath(self.X_UPDATE_TXT))

        chapters: list[ChapterInfoDict] = []
        for a in tree.xpath(self.X_CHAPTERS):
            title = self._norm_space("".join(a.xpath(".//text()")))
            href = a.get("href") or ""
            url = urljoin(self.BASE, href)
            # "/129/103950.html" -> "103950"
            # FIX: use removesuffix, not rstrip(".html") -- rstrip strips a
            # *character set* and would corrupt ids ending in h/t/m/l/'.'.
            cid = url.removesuffix(".html").split("/")[-1]
            chapters.append({"title": title, "url": url, "chapterId": cid})

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "word_count": word_count,
            "summary": summary,
            "tags": tags,
            "volumes": volumes,
            "serial_status": serial_status,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter page into title + cleaned text content.

        The body is extracted with a regex (not the DOM) so that raw <br>/<p>
        boundaries inside the readerCon container can be split reliably.

        :param html_list: Raw HTML pages; only the first entry is used.
        :param chapter_id: Identifier recorded in the returned dict.
        :return: The parsed ``ChapterDict``, or ``None`` when the container
            is missing or no usable text survives cleaning.
        """
        if not html_list:
            return None

        inner = self._CHAP_READERCON_RE.search(html_list[0])
        if not inner:
            return None

        tree = html.fromstring(html_list[0])
        title = self._first_str(tree.xpath(self.X_CHAP_TITLE))

        parts = self._CHAP_SPLIT_RE.split(inner.group(1))
        lines: list[str] = []
        for part in parts:
            if not part:
                continue
            # Strip residual tags, decode entities, normalize NBSP.
            s = self._TAGS_RE.sub("", part)
            s = unescape(s).replace("\xa0", " ")
            if self._is_noise_line(s):
                continue
            s = self._norm_space(self._scrub_ascii_gibberish(s.strip()))
            if s:
                lines.append(s)

        content = "\n".join(lines)
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "wanbengo"},
        }

    @staticmethod
    def _extract_update_date(texts: list[str]) -> str:
        """
        Find a YYYY-MM-DD anywhere in the provided text nodes.

        If none found, return today's date.
        """
        joined = " ".join(t for t in texts if t)
        m = re.search(r"\b(\d{4}-\d{2}-\d{2})\b", joined)
        if m:
            return m.group(1)
        return datetime.now().strftime("%Y-%m-%d")

    def _is_noise_line(self, s: str) -> bool:
        """Heuristic to drop obvious ad/footer/noise lines."""
        if not s.strip():
            return True
        if self._is_ad_line(s):
            return True
        if self._PUNCT_ONLY.match(s):
            return True
        return False

    @classmethod
    def _scrub_ascii_gibberish(cls, s: str) -> str:
        """
        Remove common injected ASCII junk like long runs of '?' or '_'
        while keeping normal text intact.
        """
        s = s.replace("()?()", "").replace("[(.)]", "")
        # NOTE(review): this replace is a no-op as written; presumably it was
        # meant to map a fullwidth period to '.' -- TODO confirm upstream.
        s = s.replace(".", ".")
        s = cls._SCRUB_RUNS_RE.sub("", s)  # drop runs like ???? or ____
        s = cls._SCRUB_TAIL_RE.sub("", s)
        return s.strip()
#!/usr/bin/env python3
"""
novel_downloader.core.parsers.xiaoshuowu
----------------------------------------

Parser for 小说屋 (xiaoshuoge.info) book-info and chapter pages.
"""

from typing import Any

from lxml import html

from novel_downloader.core.parsers.base import BaseParser
from novel_downloader.core.parsers.registry import register_parser
from novel_downloader.models import (
    BookInfoDict,
    ChapterDict,
    ChapterInfoDict,
    VolumeInfoDict,
)


@register_parser(
    site_keys=["xiaoshuowu", "xiaoshuoge"],
)
class XiaoshuowuParser(BaseParser):
    """
    Parser for 小说屋 (xiaoshuoge.info).
    """

    # Watermark string injected into chapter bodies; lines containing it
    # are dropped during chapter parsing.
    AD_STR: str = "小说屋 www.xiaoshuoge.info"

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Build book metadata from the info page and the catalog page.

        :param html_list: [info page HTML, catalog page HTML].
        :return: The parsed ``BookInfoDict``, or ``None`` when fewer than
            two pages are supplied.
        """
        if len(html_list) < 2:
            return None

        # Separate trees: metadata lives on page 0, the chapter list on page 1.
        info_tree = html.fromstring(html_list[0])
        catalog_tree = html.fromstring(html_list[1])

        book_name = self._first_str(
            info_tree.xpath('//meta[@property="og:novel:book_name"]/@content')
        )
        author = self._first_str(
            info_tree.xpath('//meta[@property="og:novel:author"]/@content')
        )

        # Category meta tag becomes the single tag entry (if present).
        category = self._first_str(
            info_tree.xpath('//meta[@property="og:novel:category"]/@content')
        )
        tags = [category] if category else []

        word_count = self._first_str(
            info_tree.xpath(
                '//table[@class="hide"]//td[contains(text(),"全文字数")]/text()'
            ),
            replaces=[("全文字数:", "")],
        )
        update_time = self._first_str(
            info_tree.xpath(
                '//table[@class="hide"]//td[contains(text(),"最后更新")]/text()'
            ),
            replaces=[("最后更新:", "")],
        )
        serial_status = self._first_str(
            info_tree.xpath(
                '//table[@class="hide"]//td[contains(text(),"连载状态")]/text()'
            ),
            replaces=[("连载状态:", "")],
        )

        cover_url = self._first_str(
            info_tree.xpath('//meta[@property="og:image"]/@content')
        )

        # Summary: first tabvalue block's inner div, when available.
        summary_nodes = info_tree.xpath('//div[@class="tabvalue"][1]//div')
        summary: str = (
            summary_nodes[0].text_content().strip() if summary_nodes else ""
        )

        # Chapter list: one flat volume built from the catalog page.
        chapters: list[ChapterInfoDict] = []
        for link in catalog_tree.xpath(
            '//ul[contains(@class,"chapters")]//li[contains(@class,"chapter")]/a'
        ):
            chapter_url = link.get("href", "").strip()
            # chapterId is the numeric filename before ".html"
            chapters.append(
                {
                    "title": link.text_content().strip(),
                    "url": chapter_url,
                    "chapterId": chapter_url.rsplit("/", 1)[-1].split(".")[0],
                }
            )

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "word_count": word_count,
            "serial_status": serial_status,
            "tags": tags,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Extract a chapter title and body from the reader page.

        Paragraphs are collected from tail-text after the ``content_tip``
        anchor and after each ``<br/>``, stopping at the first footer div.

        :param html_list: Raw HTML pages; only the first entry is used.
        :param chapter_id: Identifier recorded in the returned dict.
        :return: The parsed ``ChapterDict``, or ``None`` when the container
            or all paragraph text is missing.
        """
        if not html_list:
            return None

        doc = html.fromstring(html_list[0])
        # Main container; bail out when absent.
        matches = doc.xpath('//div[@id="acontent"]')
        if not matches:
            return None
        container = matches[0]

        # Chapter title from the container's <h1>, if any.
        heading = container.find("h1")
        title = "" if heading is None else heading.text_content().strip()

        paragraphs: list[str] = []
        collecting = False
        for child in container.xpath("./*"):
            # The <div id="content_tip"> marks where real text begins;
            # its tail holds the first paragraph.
            if child.tag == "div" and child.get("id") == "content_tip":
                tail_text = child.tail or ""
                # drop any "(小说屋 ...)" prefix before the real text
                if ")" in tail_text:
                    tail_text = tail_text.split(")", 1)[1]
                opening = tail_text.lstrip("\ufeff").strip()
                if opening:
                    paragraphs.append(opening)
                collecting = True
                continue

            if not collecting:
                continue

            # Footer divs terminate the chapter body.
            css = child.get("class") or ""
            if child.tag == "div" and any(
                marker in css for marker in ("tishi", "footlink", "fullbar")
            ):
                break

            # Each <br/> tail is one paragraph, minus watermark lines.
            if child.tag == "br":
                fragment = (child.tail or "").strip()
                if fragment and self.AD_STR not in fragment:
                    paragraphs.append(fragment)

        if not paragraphs:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": "\n".join(paragraphs),
            "extra": {"site": "xiaoshuowu"},
        }