PyPI - web-novel-scraper - Versions diffs - 2.0.2__py3-none-any.whl → 2.1.0__py3-none-any.whl - Mend

web-novel-scraper 2.0.2__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

web_novel_scraper/__main__.py +123 -68
web_novel_scraper/config_manager.py +12 -12
web_novel_scraper/decode.py +225 -80
web_novel_scraper/decode_guide/decode_guide.json +29 -0
web_novel_scraper/file_manager.py +292 -110
web_novel_scraper/models.py +76 -0
web_novel_scraper/novel_scraper.py +895 -424
web_novel_scraper/request_manager.py +50 -17
web_novel_scraper/utils.py +22 -1
web_novel_scraper/version.py +1 -1
{web_novel_scraper-2.0.2.dist-info → web_novel_scraper-2.1.0.dist-info}/METADATA +1 -1
web_novel_scraper-2.1.0.dist-info/RECORD +20 -0
web_novel_scraper-2.0.2.dist-info/RECORD +0 -19
{web_novel_scraper-2.0.2.dist-info → web_novel_scraper-2.1.0.dist-info}/WHEEL +0 -0
{web_novel_scraper-2.0.2.dist-info → web_novel_scraper-2.1.0.dist-info}/entry_points.txt +0 -0

web_novel_scraper/file_manager.py CHANGED Viewed

@@ -6,7 +6,7 @@ from typing import Optional, Dict
 import unicodedata
 from .logger_manager import create_logger
-from .utils import _normalize_dirname, FileOps, now_iso, FileManagerError
+from .utils import _normalize_dirname, FileOps, now_iso, FileManagerError, ValidationError
 NOVEL_JSON_FILENAME = 'main.json'
 NOVEL_COVER_FILENAME = 'cover.jpg'
@@ -15,6 +15,21 @@ logger = create_logger('FILE MANAGER')
 class FileManager:
+    """
+    File manager for handling novel-related file operations.
+    Manages all file operations related to novels including chapters, table of contents,
+    cover images, and metadata.
+    Attributes:
+        novel_base_dir (Path): Base directory for the novel
+        novel_data_dir (Path): Directory for novel data
+        novel_chapters_dir (Path): Directory for chapters
+        novel_toc_dir (Path): Directory for table of contents
+        novel_json_file (Path): Main JSON file
+        novel_cover_file (Path): Cover image file
+    """
     novel_base_dir: Path
     novel_data_dir: Path
     novel_chapters_dir: Path
@@ -25,141 +40,309 @@ class FileManager:
     def __init__(self,
                  title: str,
-                 base_novels_dir: str,
-                 novel_base_dir: str = None,
+                 base_novels_dir: Path,
+                 novel_base_dir: Path = None,
                  read_only: bool = False):
-        logger.debug(f'Initializing FileManager for novel: {title}')
-        self.novel_base_dir = self._get_novel_base_dir(title, base_novels_dir, novel_base_dir)
-        logger.debug(f'Novel base dir: {self.novel_base_dir}')
-        self.novel_data_dir = self.novel_base_dir / 'data'
-        self.novel_chapters_dir = self.novel_data_dir / 'chapters'
-        self.novel_toc_dir = self.novel_data_dir / "toc"
-        self.novel_json_file = self.novel_data_dir / NOVEL_JSON_FILENAME
-        self.novel_cover_file = self.novel_data_dir / NOVEL_COVER_FILENAME
-        if not read_only:
-            FileOps.ensure_dir(self.novel_base_dir)
-            if novel_base_dir is None:
-                self._store_novel_base_dir(title, self.novel_base_dir, base_novels_dir)
-            FileOps.ensure_dir(self.novel_data_dir)
-            FileOps.ensure_dir(self.novel_chapters_dir)
-            FileOps.ensure_dir(self.novel_toc_dir)
+        """
+        Initialize the file manager.
+        Args:
+            title: Novel title
+            base_novels_dir: Base directory for all novels
+            novel_base_dir: Specific novel directory (optional)
+            read_only: If True, doesn't create directories
+        Raises:
+            FileManagerError: If there are errors creating required directories
+        """
+        try:
+            logger.debug(f'Initializing FileManager for novel: {title}')
+            self.novel_base_dir = self._get_novel_base_dir(title, base_novels_dir, novel_base_dir)
+            self.novel_data_dir = self.novel_base_dir / 'data'
+            self.novel_chapters_dir = self.novel_data_dir / 'chapters'
+            self.novel_toc_dir = self.novel_data_dir / "toc"
+            self.novel_json_file = self.novel_data_dir / NOVEL_JSON_FILENAME
+            self.novel_cover_file = self.novel_data_dir / NOVEL_COVER_FILENAME
+            if not read_only:
+                FileOps.ensure_dir(self.novel_base_dir)
+                if novel_base_dir is None:
+                    self._store_novel_base_dir(title, self.novel_base_dir, base_novels_dir)
+                FileOps.ensure_dir(self.novel_data_dir)
+                FileOps.ensure_dir(self.novel_chapters_dir)
+                FileOps.ensure_dir(self.novel_toc_dir)
+        except Exception as e:
+            raise FileManagerError(f"Error initializing FileManager: {str(e)}") from e
     def save_chapter_html(self, chapter_filename: str, content: str) -> None:
-        full_path = self.novel_chapters_dir / chapter_filename
-        logger.debug(f'Saving chapter to {full_path}')
-        content = unicodedata.normalize('NFKC', content)
-        FileOps.save_text(full_path, content)
+        """
+        Save chapter HTML content to file.
+        Args:
+            chapter_filename: Name of the chapter file
+            content: HTML content of the chapter
+        Raises:
+            FileManagerError: If there are errors when saving the file
+        """
+        try:
+            full_path = self.novel_chapters_dir / chapter_filename
+            logger.debug(f'Saving chapter to {full_path}')
+            content = unicodedata.normalize('NFKC', content)
+            FileOps.save_text(full_path, content)
+        except Exception as e:
+            raise FileManagerError(f"Error saving chapter {chapter_filename}: {str(e)}") from e
     def chapter_file_exists(self, chapter_filename: str) -> bool:
         full_path = self.novel_chapters_dir / chapter_filename
         return full_path.exists()
     def load_chapter_html(self, chapter_filename: str) -> Optional[str]:
-        full_path = self.novel_chapters_dir / chapter_filename
-        logger.debug(f'Loading chapter from {full_path}')
-        chapter_content = FileOps.read_text(full_path)
-        if not chapter_content:
-            logger.debug(f'Chapter content not found: {chapter_filename}')
-        return chapter_content
+        """
+        Load chapter HTML content from a file.
+        Args:
+            chapter_filename: Name of the chapter file
+        Returns:
+            str | None: Chapter content or None if the file doesn't exist
+        Raises:
+            FileManagerError: If there are errors reading the file
+        """
+        try:
+            full_path = self.novel_chapters_dir / chapter_filename
+            logger.debug(f'Loading chapter from {full_path}')
+            chapter_content = FileOps.read_text(full_path)
+            if not chapter_content:
+                logger.debug(f'Chapter content not found: {chapter_filename}')
+            return chapter_content
+        except Exception as e:
+            raise FileManagerError(f"Error loading chapter {chapter_filename}: {str(e)}") from e
     def delete_chapter_html(self, chapter_filename: str) -> None:
-        full_path = self.novel_chapters_dir / chapter_filename
-        logger.debug(f'Attempting to delete chapter: {chapter_filename}')
-        FileOps.delete(full_path)
+        """
+        Delete a chapter's HTML file.
+        Args:
+            chapter_filename: Name of the chapter file to delete
+        Raises:
+            FileManagerError: If there are errors deleting the file
+        """
+        try:
+            full_path = self.novel_chapters_dir / chapter_filename
+            logger.debug(f'Attempting to delete chapter: {chapter_filename}')
+            FileOps.delete(full_path)
+        except Exception as e:
+            raise FileManagerError(f"Error deleting chapter {chapter_filename}: {str(e)}") from e
     def save_novel_json(self, novel_data: dict) -> None:
-        logger.debug(f'Saving novel data to {self.novel_json_file}')
-        FileOps.save_json(self.novel_json_file, novel_data)
+        """
+        Save novel data in JSON format.
+        Args:
+            novel_data: Dictionary containing novel data
+        Raises:
+            FileManagerError: If there are errors when saving the JSON file
+        """
+        try:
+            logger.debug(f'Saving novel data to {self.novel_json_file}')
+            FileOps.save_json(self.novel_json_file, novel_data)
+        except Exception as e:
+            raise FileManagerError(f"Error saving novel JSON: {str(e)}") from e
     def load_novel_json(self) -> Optional[str]:
-        logger.debug(f'Loading novel data from {self.novel_json_file}')
-        novel_json = FileOps.read_text(self.novel_json_file)
-        if novel_json is None:
-            logger.debug('Could not read novel JSON file')
-        return novel_json
+        """
+        Load novel data from the JSON file.
+        Returns:
+            str | None: Novel JSON content or None if the file doesn't exist
+        Raises:
+            FileManagerError: If there are errors reading the JSON file
+        """
+        try:
+            logger.debug(f'Loading novel data from {self.novel_json_file}')
+            novel_json = FileOps.read_text(self.novel_json_file)
+            if novel_json is None:
+                logger.debug('Could not read novel JSON file')
+            return novel_json
+        except Exception as e:
+            raise FileManagerError(f"Error loading novel JSON: {str(e)}") from e
     def save_novel_cover(self, source_cover_path: str) -> None:
-        source_cover_path = Path(source_cover_path)
-        logger.debug(f'Attempting to save cover from {source_cover_path}')
-        if not source_cover_path.exists():
-            logger.critical(f'No cover found on {source_cover_path}')
-            raise ValueError(f'No cover found on {source_cover_path}')
-        FileOps.copy(source_cover_path, self.novel_cover_file)
+        """
+        Save the novel's cover image from a source path.
+        Args:
+            source_cover_path: Path to source cover image
+        Raises:
+            ValidationError: If the source cover file doesn't exist
+            FileManagerError: If there are errors copying the file
+        """
+        try:
+            source_cover_path = Path(source_cover_path)
+            logger.debug(f'Attempting to save cover from {source_cover_path}')
+            if not source_cover_path.exists():
+                logger.critical(f'No cover found on {source_cover_path}')
+                raise ValidationError(f'No cover found on {source_cover_path}')
+            FileOps.copy(source_cover_path, self.novel_cover_file)
+        except ValidationError:
+            raise
+        except Exception as e:
+            raise FileManagerError(f"Error saving novel cover: {str(e)}") from e
     def load_novel_cover(self) -> Optional[bytes]:
-        if self.novel_cover_file is None:
-            logger.debug(f'No cover found')
-            return None
-        logger.debug(f'Loading cover from {self.novel_cover_file}')
-        cover = FileOps.read_binary(self.novel_cover_file)
-        if cover is None:
-            logger.debug(f'Could not read cover from {self.novel_cover_file}')
-        return cover
+        """
+        Load novel cover image.
+        Returns:
+            bytes | None: Cover image binary data or None if the file doesn't exist
+        Raises:
+            FileManagerError: If there are errors reading the file
+        """
+        try:
+            if self.novel_cover_file is None:
+                logger.debug('No cover found')
+                return None
+            logger.debug(f'Loading cover from {self.novel_cover_file}')
+            cover = FileOps.read_binary(self.novel_cover_file)
+            if cover is None:
+                logger.debug(f'Could not read cover from {self.novel_cover_file}')
+            return cover
+        except Exception as e:
+            raise FileManagerError(f"Error loading novel cover: {str(e)}") from e
     ## TOC API
     def add_toc(self, html: str) -> int:
-        """Add a new TOC fragment, return its index."""
-        idx = self._next_toc_idx()
-        toc_path = self.novel_toc_dir / f"toc_{idx}.html"
-        FileOps.save_text(toc_path, html)
+        """
+        Add a new table of contents fragment.
-        toc_index = self._load_toc_index()
-        toc_index["entries"].append({"file": toc_path.name, "updated": now_iso()})
-        self._store_toc_index(toc_index)
+        Args:
+            html: HTML content of the TOC fragment
-        logger.debug(f"Added TOC #{idx} → {toc_path}")
-        return idx
+        Returns:
+            int: Index of the added TOC fragment
+        Raises:
+            FileManagerError: If there are errors when saving the TOC fragment
+        """
+        try:
+            idx = self._next_toc_idx()
+            toc_path = self.novel_toc_dir / f"toc_{idx}.html"
+            FileOps.save_text(toc_path, html)
+            toc_index = self._load_toc_index()
+            toc_index["entries"].append({"file": toc_path.name, "updated": now_iso()})
+            self._store_toc_index(toc_index)
+            logger.debug(f"Added TOC #{idx} → {toc_path}")
+            return idx
+        except Exception as e:
+            raise FileManagerError(f"Error adding TOC fragment: {str(e)}") from e
     def update_toc(self, idx: int, html: str) -> None:
-        toc_path = self.novel_toc_dir / f"toc_{idx}.html"
-        if not toc_path.exists():
-            raise FileManagerError(f"TOC #{idx} not found")
+        """
+        Update an existing table of contents fragment.
+        Args:
+            idx: Index of the TOC fragment to update
+            html: New HTML content
-        FileOps.save_text(toc_path, html)
+        Raises:
+            FileManagerError: If TOC fragment doesn't exist or there are errors updating it
+        """
+        try:
+            toc_path = self.novel_toc_dir / f"toc_{idx}.html"
+            if not toc_path.exists():
+                raise FileManagerError(f"TOC #{idx} not found")
-        toc_index = self._load_toc_index()
-        for entry in toc_index["entries"]:
-            if entry["file"] == toc_path.name:
-                entry["updated"] = now_iso()
-                break
-        self._store_toc_index(toc_index)
-        logger.debug(f"Updated TOC #{idx}")
+            FileOps.save_text(toc_path, html)
+            toc_index = self._load_toc_index()
+            for entry in toc_index["entries"]:
+                if entry["file"] == toc_path.name:
+                    entry["updated"] = now_iso()
+                    break
+            self._store_toc_index(toc_index)
+            logger.debug(f"Updated TOC #{idx}")
+        except Exception as e:
+            raise FileManagerError(f"Error updating TOC fragment: {str(e)}") from e
     def delete_toc(self, idx: Optional[int] = None) -> None:
-        """Delete a single TOC by index or all if *idx* is None."""
-        toc_index = self._load_toc_index()
+        """
+        Delete the table of contents fragment(s).
-        def _delete(path: Path) -> None:
-            FileOps.delete(path)
-            logger.debug(f"Deleted {path}")
+        Args:
+            idx: Index of a specific TOC fragment to delete. If None, deletes all TOC fragments.
-        if idx is None:  # delete all
-            for entry in toc_index["entries"]:
-                _delete(self.novel_toc_dir / entry["file"])
-            toc_index["entries"] = []
-        else:
-            toc_path = self.novel_toc_dir / f"toc_{idx}.html"
-            _delete(toc_path)
-            toc_index["entries"] = [
-                e for e in toc_index["entries"] if e["file"] != toc_path.name
-            ]
-        self._store_toc_index(toc_index)
+        Raises:
+            FileManagerError: If there are errors deleting TOC files or updating the index
+        """
+        try:
+            toc_index = self._load_toc_index()
+            def _delete(path: Path) -> None:
+                """Helper function to delete a file and log the action."""
+                try:
+                    FileOps.delete(path)
+                    logger.debug(f"Deleted {path}")
+                except Exception as e:
+                    raise FileManagerError(f"Failed to delete TOC file {path}: {str(e)}")
+            if idx is None:
+                logger.debug("Deleting all TOC fragments")
+                for entry in toc_index["entries"]:
+                    _delete(self.novel_toc_dir / entry["file"])
+                toc_index["entries"] = []
+            else:
+                logger.debug(f"Deleting TOC fragment #{idx}")
+                toc_path = self.novel_toc_dir / f"toc_{idx}.html"
+                _delete(toc_path)
+                toc_index["entries"] = [
+                    e for e in toc_index["entries"] if e["file"] != toc_path.name
+                ]
+            self._store_toc_index(toc_index)
+            logger.info(f"Successfully deleted TOC {'fragments' if idx is None else f'fragment #{idx}'}")
+        except Exception as e:
+            raise FileManagerError(
+                f"Error deleting TOC {'fragments' if idx is None else f'fragment #{idx}'}: {str(e)}") from e
     def get_toc(self, idx: int) -> Optional[str]:
         """Return TOC HTML content or None."""
         return FileOps.read_text(self.novel_toc_dir / f"toc_{idx}.html")
     def get_all_toc(self) -> list[str]:
-        """Return all TOC fragments in order."""
-        toc_index = self._load_toc_index()
-        contents: list[str] = []
-        for entry in toc_index["entries"]:
-            html = FileOps.read_text(self.novel_toc_dir / entry["file"])
-            if html is not None:
-                contents.append(html)
-        return contents
+        """
+        Get all table of contents fragments in order.
+        Returns:
+            list[str]: List of TOC HTML contents
+        Raises:
+            FileManagerError: If there are errors reading TOC files
+        """
+        try:
+            toc_index = self._load_toc_index()
+            contents: list[str] = []
+            for entry in toc_index["entries"]:
+                html = FileOps.read_text(self.novel_toc_dir / entry["file"])
+                if html is not None:
+                    contents.append(html)
+            return contents
+        except Exception as e:
+            raise FileManagerError(f"Error retrieving TOC fragments: {str(e)}") from e
     def save_book(self, book: epub.EpubBook, filename: str) -> bool:
         book_path = self.novel_base_dir / filename
@@ -177,7 +360,7 @@ class FileManager:
             return False
         except Exception as e:
             logger.critical(f'Unexpected error saving book to {book_path}: {e}')
-            return False
+            raise
     def _load_toc_index(self) -> dict:
         """Return the toc.json structure (creates a blank one if missing)."""
@@ -201,28 +384,27 @@ class FileManager:
     @staticmethod
     def _get_novel_base_dir(
             title: str,
-            base_novels_dir: str,
-            novel_base_dir: str | None = None
+            base_novels_dir: Path,
+            novel_base_dir: Path | None = None
     ) -> Path:
         """
         Resolve the base directory for *title* without creating any directories.
         Priority:
-        1. Explicit *novel_base_dir* argument.
+        1. Explicit *base_novels_dir* argument.
         2. Stored value in <base_novels_dir>/meta.json.
-        3. New path derived from normalized title, recorded back to meta.json.
+        3. New path derived from a normalized title, recorded back to meta.json.
         """
-        base_dir_path = Path(base_novels_dir)
-        if not base_dir_path.exists():
-            logger.info(f'{base_dir_path} does not exist. Creating new base directory.')
-            FileOps.ensure_dir(base_dir_path)
+        if not base_novels_dir.exists():
+            logger.info(f'{base_novels_dir} does not exist. Creating new base directory.')
+            FileOps.ensure_dir(base_novels_dir)
         # — 1. If the caller supplied a path, return it
         if novel_base_dir:
             return Path(novel_base_dir)
         # — 2. Try to read meta.json
-        meta_path = base_dir_path / "meta.json"
+        meta_path = base_novels_dir / "meta.json"
         if meta_path.exists():
             try:
                 meta: Dict[str, Dict[str, str]] = FileOps.read_json(meta_path)
@@ -234,18 +416,18 @@ class FileManager:
         # — 3. Fallback, generate a new directory name
         clean_title = _normalize_dirname(title)
-        return base_dir_path / clean_title
+        return base_novels_dir / clean_title
     @staticmethod
     def _store_novel_base_dir(
             title: str,
             resolved_path: Path,
-            base_novels_dir: str,
+            base_novels_dir: Path,
     ) -> None:
         """
         Persist <title, resolved_path> in <base_novels_dir>/meta.json.
         """
-        meta_path = Path(base_novels_dir) / "meta.json"
+        meta_path = base_novels_dir / "meta.json"
         try:
             # Load existing metadata (ignore errors, start fresh on corruption)
             meta: Dict[str, Dict[str, str]] = {}

web_novel_scraper/models.py ADDED Viewed

@@ -0,0 +1,76 @@
+from __future__ import annotations
+from dataclasses import dataclass, field, asdict
+from dataclasses_json import dataclass_json, config
+from typing import Optional, Tuple
+from urllib.parse import urlparse
+import pprint
+from .utils import _always, ValidationError
+def _pretty(obj, *, skip: set[str] | None = None) -> str:
+    """Pretty-print dataclass dict, omits keys in *skip*."""
+    d = asdict(obj)
+    if skip:
+        for key in skip:
+            d.pop(key, None)
+    return pprint.pformat(d, sort_dicts=False, compact=True)
+@dataclass_json
+@dataclass(slots=True, frozen=True)
+class Metadata:
+    author: Optional[str] = None
+    start_date: Optional[str] = None
+    end_date: Optional[str] = None
+    language: str = "en"
+    description: Optional[str] = None
+    tags: Tuple[str, ...] = field(default_factory=tuple)
+    def __str__(self) -> str:
+        return "Metadata:\n" + _pretty(self)
+@dataclass_json
+@dataclass(slots=True, frozen=True)
+class ScraperBehavior:
+    # Some novels already have the title in the content.
+    save_title_to_content: bool = False
+    # Some novels have the toc link without the host
+    auto_add_host: bool = False
+    # Some hosts return 403 when scrapping, this will force the use of FlareSolver
+    # to save time
+    force_flaresolver: bool = False
+    # When you clean the HTML files, you can use hard clean by default
+    hard_clean: bool = False
+    def __str__(self) -> str:
+        return "ScraperBehavior:\n" + _pretty(self)
+@dataclass_json()
+@dataclass
+class Chapter:
+    chapter_url: str
+    chapter_html: Optional[str] = field(
+        default=None,
+        repr=False,
+        compare=False,
+        metadata=config(exclude=_always)
+    )
+    chapter_content: Optional[str] = field(
+        default=None,
+        repr=False,
+        compare=False,
+        metadata=config(exclude=_always)
+    )
+    chapter_html_filename: Optional[str] = None
+    chapter_title: Optional[str] = field(default=None, compare=False)
+    def __post_init__(self):
+        if not urlparse(self.chapter_url).scheme:
+            raise ValidationError(f"Invalid URL: {self.chapter_url}")
+    def __str__(self) -> str:
+        return "Chapter:\n" + _pretty(self, skip={"chapter_html"})

web-novel-scraper 2.0.2__py3-none-any.whl → 2.1.0__py3-none-any.whl

web-novel-scraper 2.0.2__py3-none-any.whl → 2.1.0__py3-none-any.whl