PyPI - web-novel-scraper - Versions diffs - 2.0.2__py3-none-any.whl → 2.1.0__py3-none-any.whl - Mend

web-novel-scraper 2.0.2__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (15)

web_novel_scraper/__main__.py +123 -68
web_novel_scraper/config_manager.py +12 -12
web_novel_scraper/decode.py +225 -80
web_novel_scraper/decode_guide/decode_guide.json +29 -0
web_novel_scraper/file_manager.py +292 -110
web_novel_scraper/models.py +76 -0
web_novel_scraper/novel_scraper.py +895 -424
web_novel_scraper/request_manager.py +50 -17
web_novel_scraper/utils.py +22 -1
web_novel_scraper/version.py +1 -1
{web_novel_scraper-2.0.2.dist-info → web_novel_scraper-2.1.0.dist-info}/METADATA +1 -1
web_novel_scraper-2.1.0.dist-info/RECORD +20 -0
web_novel_scraper-2.0.2.dist-info/RECORD +0 -19
{web_novel_scraper-2.0.2.dist-info → web_novel_scraper-2.1.0.dist-info}/WHEEL +0 -0
{web_novel_scraper-2.0.2.dist-info → web_novel_scraper-2.1.0.dist-info}/entry_points.txt +0 -0

web_novel_scraper/decode.py CHANGED Viewed

@@ -1,9 +1,11 @@
 import json
 from typing import Optional
+from pathlib import Path
 from . import logger_manager
 from .custom_processor.custom_processor import ProcessorRegistry
-from .utils import FileOps
+from .utils import FileOps, DecodeError, ValidationError
 from bs4 import BeautifulSoup
@@ -18,23 +20,48 @@ DEFAULT_REQUEST_CONFIG = {
     "request_time_between_retries": 3
 }
+class HTMLParseError(DecodeError):
+    """Raised when HTML parsing fails"""
+class DecodeGuideError(DecodeError):
+    """Raised when there are issues with decode guide configuration"""
+class ContentExtractionError(DecodeError):
+    """Raised when content extraction fails"""
 class Decoder:
     host: str
-    decode_guide_file: str
+    decode_guide_file: Path
     decode_guide: json
     request_config: dict
-    def __init__(self, host: str, decode_guide_file: str):
+    def __init__(self, host: str, decode_guide_file: Path):
         self.decode_guide_file = decode_guide_file
         self.set_host(host)
     def set_host(self, host: str) -> None:
         self.host = host
-        self._set_decode_guide()
+        try:
+            self._set_decode_guide()
+        except ValidationError:
+            raise
         host_request_config = self.get_request_config()
         self.request_config = DEFAULT_REQUEST_CONFIG | host_request_config
     def get_request_config(self) -> dict:
+        """
+        Retrieves the request configuration for the current host.
+        Returns:
+            dict: Request configuration parameters for the current host.
+                Returns DEFAULT_REQUEST_CONFIG if no custom configuration exists.
+        """
         request_config = self.decode_guide.get('request_config')
         if request_config:
             logger.debug(f'Host "{self.host}" has a custom request configuration on the Decode Guide file.')
@@ -43,110 +70,192 @@ class Decoder:
         return DEFAULT_REQUEST_CONFIG
     def is_index_inverted(self) -> bool:
+        """
+        Checks if the index order should be inverted for the current host.
+        Returns:
+            bool: True if the index should be processed in reverse order, False otherwise.
+        """
+        logger.debug('Checking if index should be inverted...')
         return self.decode_guide.get('index', {}).get('inverted', False)
     def save_title_to_content(self) -> bool:
-        return self.decode_guide.get('save_title_to_content', False)
+        """
+        Checks if the title should be included in the content for the current host.
+        Returns:
+            bool: True if the title should be saved with the content, False otherwise.
+        """
+        logger.debug('Checking if title should be saved to content...')
+        try:
+            return self.decode_guide.get('save_title_to_content', False)
+        except DecodeError:
+            raise
     def add_host_to_chapter(self) -> bool:
+        """
+        Checks if the host information should be added to chapter url.
+        Returns:
+            bool: True if host information should be included in chapter url, False otherwise.
+        """
+        logger.debug('Checking if host should be added to chapter url...')
         return self.decode_guide.get('add_host_to_chapter', False)
     def get_chapter_urls(self, html: str) -> list[str]:
-        logger.debug('Obtaining chapter URLs...')
-        chapter_urls = self.decode_html(html, 'index')
+        """
+        Extracts chapter URLs from the table of contents HTML.
+        Args:
+            html (str): The HTML content of the table of contents
+        Returns:
+            list[str]: List of chapter URLs found in the HTML
-        if chapter_urls is None:
-            logger.critical(f"Failed to obtain chapter URLs for {self.host}")
-            raise ValueError(f"Failed to obtain chapter URLs for {self.host}")
+        Raises:
+            ContentExtractionError: If chapter URLs cannot be extracted.
+            HTMLParseError: If HTML parsing fails.
+        """
+        try:
+            logger.debug('Obtaining chapter URLs...')
+            chapter_urls = self.decode_html(html, 'index')
+            if chapter_urls is None:
+                msg = f"Failed to obtain chapter URLs for {self.host}"
+                logger.error(msg)
+                raise ContentExtractionError(msg)
-        if isinstance(chapter_urls, str):
-            logger.warning('When obtaining chapter urls, obtained a String but expected a List')
-            logger.warning('Check decode config')
-            chapter_urls = [chapter_urls]
+            if isinstance(chapter_urls, str):
+                logger.warning('Expected List of URLs but got String, converting to single-item list')
+                chapter_urls = [chapter_urls]
-        return chapter_urls
+            return chapter_urls
+        except DecodeError:
+            raise
+        except Exception as e:
+            msg = f"Error extracting chapter URLs: {e}"
+            logger.error(msg)
+            raise ContentExtractionError(msg) from e
     def get_toc_next_page_url(self, html: str) -> Optional[str]:
+        """
+        Extracts the URL for the next page of the table of contents.
+        Args:
+            html (str): The HTML content of the current TOC page
+        Returns:
+            Optional[str]: URL of the next page if it exists, None otherwise
+        Raises:
+            HTMLParseError: If HTML parsing fails
+            ContentExtractionError: If URL extraction fails
+        """
         logger.debug('Obtaining toc next page URL...')
-        toc_next_page_url = self.decode_html(html, 'next_page')
-        if toc_next_page_url is None:
-            logger.debug('No next page URL found, assuming last page...')
-            return None
-        return toc_next_page_url
+        try:
+            toc_next_page_url = self.decode_html(html, 'next_page')
+            if toc_next_page_url is None:
+                logger.debug('No next page URL found, assuming last page...')
+                return None
+            return toc_next_page_url
+        except DecodeError:
+            raise
     def get_chapter_title(self, html: str) -> Optional[str]:
-        logger.debug('Obtaining chapter title...')
-        chapter_title = self.decode_html(html, 'title')
-        if chapter_title is None:
-            logger.debug(f'No chapter_title found.')
-        return chapter_title
+        """
+        Extracts the chapter title from HTML content.
-    def get_chapter_content(self, html: str, save_title_to_content: bool, chapter_title: str) -> str:
-        logger.debug('Obtaining chapter content...')
-        full_chapter_content = ""
-        chapter_content = self.decode_html(html, 'content')
-        if chapter_content is None:
-            logger.critical('No content found on chapter')
-            raise ValueError('No content found on chapter')
-        if save_title_to_content:
-            logger.debug('Saving chapter title to content...')
-            full_chapter_content += f'<h4>{chapter_title}</h4>'
-        if isinstance(chapter_content, list):
-            logger.debug(f'{len(chapter_content)} paragraphs found in chapter')
-            logger.debug('Converting list of paragraphs to a single string')
-            for paragraph in chapter_content:
-                full_chapter_content += str(paragraph)
-        else:
-            logger.debug('Chapter content is not a list, no conversion made')
-            full_chapter_content += str(chapter_content)
-        return full_chapter_content
+        Args:
+            html (str): The HTML content of the chapter
-    def decode_html(self, html: str, content_type: str) -> str | list[str] | None:
-        logger.debug(f'Decoding HTML...')
-        logger.debug(f'Content type: {content_type}')
-        logger.debug(f'Decode guide: {self.decode_guide_file}')
-        logger.debug(f'Host: {self.host}')
-        if not content_type in self.decode_guide:
-            logger.critical(f'{content_type} key does not exists on decode guide {self.decode_guide_file}'
-                            f'for host {self.host}')
-            raise ValueError(f'{content_type} key does not exists on decode guide {self.decode_guide_file}'
-                            f'for host {self.host}')
+        Returns:
+            Optional[str]: The extracted title, or None if not found
-        if ProcessorRegistry.has_processor(self.host, content_type):
-            logger.debug(f'Host {self.host} will use a custom processor')
-            processor = ProcessorRegistry.get_processor(self.host, content_type)
-            return processor.process(html)
+        Raises:
+            HTMLParseError: If HTML parsing fails
+        """
-        logger.debug('Starting HTML parsing...')
         try:
-            soup = BeautifulSoup(html, 'html.parser')
+            logger.debug('Obtaining chapter title...')
+            chapter_title = self.decode_html(html, 'title')
+            if chapter_title is None:
+                logger.debug('No chapter title found')
+                return None
+            return str(chapter_title).strip()
+        except DecodeError as e:
+            logger.warning(f"Error when trying to extract chapter title: {e}")
+            return None
         except Exception as e:
-            logger.critical(f'Error parsing HTML with BeautifulSoup: {e}')
-            raise ValueError(f'Error parsing HTML with BeautifulSoup: {e}')
+            msg = f"Error extracting chapter title: {e}"
+            logger.error(msg)
+            raise HTMLParseError(msg) from e
-        decoder = self.decode_guide[content_type]
-        elements = self._find_elements(soup, decoder)
-        if not elements:
-            logger.debug(f'{content_type} not found on html using {self.decode_guide_file} '
-                           f'for host {self.host}')
+    def get_chapter_content(self, html: str, save_title_to_content: bool, chapter_title: str) -> str:
+        """
+         Extracts and processes chapter content from HTML.
-        # Investigate this conditional
-        if content_type == 'title' and isinstance(elements, list):
-            logger.debug('Joining titles...')
-            return ' '.join(elements)
-        return elements
+         Args:
+             html (str): The HTML content of the chapter
+             save_title_to_content (bool): Whether to include the title in the content
+             chapter_title (str): The chapter title to include if save_title_to_content is True
+         Returns:
+             str: The processed chapter content with HTML formatting
+         Raises:
+             ContentExtractionError: If content cannot be extracted,
+             HTMLParseError: If HTML parsing fails
+         """
+        try:
+            logger.debug('Obtaining chapter content...')
+            full_chapter_content = ""
+            chapter_content = self.decode_html(html, 'content')
+            if chapter_content is None:
+                msg = 'No content found in chapter'
+                logger.error(msg)
+                raise ContentExtractionError(msg)
+            if save_title_to_content:
+                logger.debug('Adding chapter title to content...')
+                full_chapter_content += f'<h4>{chapter_title}</h4>'
+            if isinstance(chapter_content, list):
+                logger.debug(f'Processing {len(chapter_content)} content paragraphs')
+                full_chapter_content += ''.join(str(p) for p in chapter_content)
+            else:
+                logger.debug('Processing single content block')
+                full_chapter_content += str(chapter_content)
+            return full_chapter_content
+        except DecodeError:
+            raise
+        except Exception as e:
+            msg = f"Error extracting chapter content: {e}"
+            logger.error(msg)
+            raise ContentExtractionError(msg) from e
     def has_pagination(self) -> bool:
-        return self.decode_guide['has_pagination']
+        """
+        Checks if the current host's content uses pagination.
+        Returns:
+            bool: True if the host uses pagination, False otherwise.
+        """
+        logger.debug('Checking if index has pagination...')
+        return self.decode_guide.get('has_pagination', False)
     def clean_html(self, html: str, hard_clean: bool = False):
         tags_for_soft_clean = ['script', 'style', 'link',
                                'form', 'meta', 'hr', 'noscript', 'button']
-        tags_for_hard_clean = ['header', 'footer', 'nav', 'aside', 'iframe', 'object', 'embed', 'svg', 'canvas', 'map', 'area',
-                               'audio', 'video', 'track', 'source', 'applet', 'frame', 'frameset', 'noframes', 'noembed', 'blink', 'marquee']
+        tags_for_hard_clean = ['header', 'footer', 'nav', 'aside', 'iframe', 'object', 'embed', 'svg', 'canvas', 'map',
+                               'area',
+                               'audio', 'video', 'track', 'source', 'applet', 'frame', 'frameset', 'noframes',
+                               'noembed', 'blink', 'marquee']
         tags_for_custom_clean = []
         if 'clean' in self.decode_guide:
@@ -162,12 +271,48 @@ class Decoder:
         return "\n".join([line.strip() for line in str(soup).splitlines() if line.strip()])
+    def decode_html(self, html: str, content_type: str) -> str | list[str] | None:
+        logger.debug(f'Decoding HTML...')
+        logger.debug(f'Content type: {content_type}')
+        logger.debug(f'Decode guide: {self.decode_guide_file}')
+        logger.debug(f'Host: {self.host}')
+        if content_type not in self.decode_guide:
+            msg = f'No decode rules found for {content_type} in guide {self.decode_guide_file}'
+            logger.critical(msg)
+            raise DecodeGuideError(msg)
+        if ProcessorRegistry.has_processor(self.host, content_type):
+            logger.debug(f'Using custom processor for {self.host}')
+            return ProcessorRegistry.get_processor(self.host, content_type).process(html)
+        logger.debug('Parsing HTML...')
+        try:
+            soup = BeautifulSoup(html, 'html.parser')
+        except Exception as e:
+            logger.error(f'Error parsing HTML with BeautifulSoup: {e}')
+            raise HTMLParseError(f'Error parsing HTML with BeautifulSoup: {e}')
+        decoder = self.decode_guide.get(content_type)
+        if decoder is None:
+            logger.error(f'No decode rules found for {content_type} in guide {self.decode_guide_file}')
+            raise DecodeGuideError(f'No decode rules found for {content_type} in guide {self.decode_guide_file}')
+        elements = self._find_elements(soup, decoder)
+        if not elements:
+            logger.debug(f'No {content_type} found in HTML')
+            return None
+        # Investigate this conditional
+        if content_type == 'title' and isinstance(elements, list):
+            logger.debug('Joining multiple title elements')
+            return ' '.join(elements)
+        return elements
     def _set_decode_guide(self) -> None:
         decode_guide = FileOps.read_json(self.decode_guide_file)
         self.decode_guide = self._get_element_by_key(decode_guide, 'host', self.host)
         if self.decode_guide is None:
-            logger.critical(f'No decode guide found for host {self.host}')
-            raise ValueError(f'No decode guide found for host {self.host}')
+            logger.error(f'No decode guide found for host {self.host}')
+            raise ValidationError(f'No decode guide found for host {self.host}')
     @staticmethod
     def _find_elements(soup: BeautifulSoup, decoder: dict):

web_novel_scraper/decode_guide/decode_guide.json CHANGED Viewed

@@ -380,5 +380,34 @@
                 "key": "href"
             }
         }
+    },
+    {
+        "host": "foxaholic.com",
+        "has_pagination": false,
+        "request_config": {
+            "force_flaresolver": "true",
+            "request_timeout": 30
+        },
+        "save_title_to_content": true,
+        "title": {
+            "element": "li",
+            "class": "active",
+            "extract": {
+                "type": "text"
+            }
+        },
+        "content": {
+            "selector": "div.text-left p:not([class])",
+            "array": true
+        },
+        "index": {
+            "selector": "li.free-chap a",
+            "inverted": true,
+            "array": true,
+            "extract": {
+                "type": "attr",
+                "key": "href"
+            }
+        }
     }
 ]

web-novel-scraper 2.0.2__py3-none-any.whl → 2.1.0__py3-none-any.whl

web-novel-scraper 2.0.2__py3-none-any.whl → 2.1.0__py3-none-any.whl