PyPI - web-novel-scraper - Versions diffs - 2.0.3__py3-none-any.whl → 2.1.0__py3-none-any.whl - Mend

web-novel-scraper 2.0.3__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

web_novel_scraper/__main__.py +123 -68
web_novel_scraper/config_manager.py +12 -12
web_novel_scraper/decode.py +225 -80
web_novel_scraper/decode_guide/decode_guide.json +1 -0
web_novel_scraper/file_manager.py +292 -110
web_novel_scraper/models.py +76 -0
web_novel_scraper/novel_scraper.py +895 -424
web_novel_scraper/request_manager.py +50 -17
web_novel_scraper/utils.py +22 -1
web_novel_scraper/version.py +1 -1
{web_novel_scraper-2.0.3.dist-info → web_novel_scraper-2.1.0.dist-info}/METADATA +1 -1
web_novel_scraper-2.1.0.dist-info/RECORD +20 -0
web_novel_scraper-2.0.3.dist-info/RECORD +0 -19
{web_novel_scraper-2.0.3.dist-info → web_novel_scraper-2.1.0.dist-info}/WHEEL +0 -0
{web_novel_scraper-2.0.3.dist-info → web_novel_scraper-2.1.0.dist-info}/entry_points.txt +0 -0

web_novel_scraper/request_manager.py CHANGED Viewed

@@ -1,18 +1,22 @@
 import requests
 import os
-from . import logger_manager
-from dotenv import load_dotenv
 import json
 import time
 from typing import Optional
+from dotenv import load_dotenv
+from urllib.parse import urlparse
+from .logger_manager import create_logger
+from .utils import ValidationError, NetworkError
 load_dotenv()
 FLARESOLVER_URL = os.getenv('SCRAPER_FLARESOLVER_URL', 'http://localhost:8191/v1')
 FLARE_HEADERS = {'Content-Type': 'application/json'}
 FORCE_FLARESOLVER = os.getenv('FORCE_FLARESOLVER', '0') == '1'
-logger = logger_manager.create_logger('GET HTML CONTENT')
+logger = create_logger('GET HTML CONTENT')
 def _get_request(url: str,
@@ -38,7 +42,6 @@ def _get_request(url: str,
             logger.debug(f'HTTP error ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.InvalidSchema as e:
             logger.debug(f'Invalid URL schema for "{url}": {e}')
-            break  # Don't retry on invalid schema
         except requests.exceptions.RequestException as e:
             logger.debug(f'Request failed ({attempt + 1}/{retries}): {e}')
@@ -106,6 +109,36 @@ def get_html_content(url: str,
                      time_between_retries: int = 3,
                      flaresolver_url: str = FLARESOLVER_URL,
                      force_flaresolver: bool = FORCE_FLARESOLVER) -> Optional[str]:
+    """
+    Retrieves HTML content from a URL with support for anti-bot protection bypass.
+    Implements a two-step strategy:
+    1. Attempts a standard HTTP request first (unless force_flaresolver is True)
+    2. Falls back to FlareSolver if the standard request fails
+    Args:
+        url (str): The URL to fetch content from
+        retries (int, optional): Number of retry attempts for failed requests. Defaults to 3.
+        timeout (int, optional): Timeout in seconds for each request. Defaults to 20.
+        time_between_retries (int, optional): Delay in seconds between retries. Defaults to 3.
+        flaresolver_url (str, optional): URL of the FlareSolver service.
+            Defaults to FLARESOLVER_URL env variable.
+        force_flaresolver (bool, optional): If True, skips standard HTTP request and uses
+            FlareSolver directly. Defaults to FORCE_FLARESOLVER env variable.
+    Returns:
+        Optional[str]: The HTML content if successful, None otherwise
+    Raises:
+        ValidationError: If the provided URL is invalid
+        NetworkError: If all attempts to fetch content fail
+    """
+    parsed_url = urlparse(url)
+    if not all([parsed_url.scheme, parsed_url.netloc]):
+        raise ValidationError(f"Invalid URL format: {url}")
     logger.debug(
         f'Requesting HTML Content for "{url}" with '
         f'retries: "{retries}", '
@@ -132,16 +165,16 @@ def get_html_content(url: str,
                                 time_between_retries=time_between_retries,
                                 flaresolver_url=flaresolver_url)
     if not response or not response.ok:
-        logger.warning(f'Failed all attempts to get HTML content from "{url}')
-        return None
-    response_json = response.json()
-    if 'solution' not in response_json:
-        logger.warning(f'No solution found in FlareSolver response for "{url}"')
-        return None
-    if 'response' not in response_json['solution']:
-        logger.warning(f'No response found in FlareSolver solution for "{url}"')
-        return None
-    logger.debug(f'Successfully retrieved HTML content from "{url}" using FlareSolver')
-    return response_json['solution']['response']
+        logger.debug(f'Failed all attempts to get HTML content from "{url}')
+        raise NetworkError(f'Failed all attempts to get HTML content from "{url}"')
+    try:
+        response_json = response.json()
+        response_content = response_json.get('solution', {}).get('response')
+        if not response_content:
+            raise NetworkError(f'No solution response for "{url}"')
+        return response_content
+    except json.JSONDecodeError as e:
+        logger.error(f'Failed to decode FlareSolver response: {e}')
+        raise NetworkError(f'Invalid FlareSolver response for "{url}"')

web_novel_scraper/utils.py CHANGED Viewed

@@ -9,9 +9,30 @@ from urllib.parse import urlparse
 import re
 import unicodedata
-class FileManagerError(Exception):
+def _always(_: object) -> bool:
+    """Predicate used by dataclasses_json to skip a field."""
+    return True
+## EXCEPTIONS
+class ScraperError(Exception):
+    """Default Exception for Scraper Exceptions"""
+class NetworkError(ScraperError):
+    """Exception raised for any exception for request operations"""
+class DecodeError(ScraperError):
+    """Exception raised for any exception for decoding operations"""
+class FileManagerError(ScraperError):
     """Exception raised for any exception for file operations"""
+class ValidationError(ScraperError):
+    """Exception raised for any exception for invalid values"""
+## FILE OPERATIONS HELPER
 class FileOps:
     """Static helper for disc operations."""

web_novel_scraper/version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "2.0.3"
1	+ __version__ = "2.1.0"

{web_novel_scraper-2.0.3.dist-info → web_novel_scraper-2.1.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: web-novel-scraper
-Version: 2.0.3
+Version: 2.1.0
 Summary: Python tool that allows you to scrape web novels from various sources and save them to more readable formats like EPUB.
 Project-URL: Homepage, https://github.com/ImagineBrkr/web-novel-scraper
 Project-URL: Documentation, https://web-novel-scraper.readthedocs.io

web_novel_scraper-2.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,20 @@
+web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+web_novel_scraper/__main__.py,sha256=QI8oncZiSuyXu2D377uFq5BpAy4U6uyedEs-PfkY2Ek,18312
+web_novel_scraper/config_manager.py,sha256=KVnCIVs1nsEI3W6YolTV9FejjwJXtUYhDn2dmHr9SC4,3193
+web_novel_scraper/decode.py,sha256=iLjXlKUh8UgTkgsQupVAhW-eUDxYrcOTx_EEgjV5s_A,15186
+web_novel_scraper/file_manager.py,sha256=6OZAuaUykUmS0fLzcSVyQSqcPanjEVpjrQ-0pa2vKls,16767
+web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
+web_novel_scraper/models.py,sha256=7lS1Mb5h0qunNOslwCl6vQiuZGsMNw7h1BYK03yHeFM,2227
+web_novel_scraper/novel_scraper.py,sha256=hWZvi1c3Nnq4qSfLsfZPsjQj-UTlWaKwTlqyPtFe36I,50506
+web_novel_scraper/request_manager.py,sha256=yfXaPCeTu7FGiWnR3rImkaCWETfT3d5zxwOIylEP-_M,7954
+web_novel_scraper/utils.py,sha256=DO1bTgyGNqjHrBlMYdS0ljwegTSC_kYsIbxi-0KeuY8,6953
+web_novel_scraper/version.py,sha256=Xybt2skBZamGMNlLuOX1IG-h4uIxqUDGAO8MIGWrJac,22
+web_novel_scraper/custom_processor/__init__.py,sha256=iy4tjivMjshSzc52--aa-jK53qu9VwdK-6p4vuQc6oc,103
+web_novel_scraper/custom_processor/custom_processor.py,sha256=h1MPl6JU_C2Mc7SqK70LsNQHpDzSL6QyraMIQ87HcMM,870
+web_novel_scraper/custom_processor/sites/genesis.py,sha256=xV0eybI0ieHR5gn4yWXI74l99Eayhqs16PIYs-BrPjE,1843
+web_novel_scraper/custom_processor/sites/royalroad.py,sha256=_2PsFC_w3RJCUkAPoRn-7R2jlzl3XsG4WYtRaQkp6lg,787
+web_novel_scraper/decode_guide/decode_guide.json,sha256=ecIBugJ8ddzFH0rJHkonkrE6ogSXf98jVypQ-GnVDnE,10418
+web_novel_scraper-2.1.0.dist-info/METADATA,sha256=2EeY0XmULbnE6AFUgsuFPT0NYZSnwk6nVzb45rh_tow,8423
+web_novel_scraper-2.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+web_novel_scraper-2.1.0.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
+web_novel_scraper-2.1.0.dist-info/RECORD,,

web_novel_scraper-2.0.3.dist-info/RECORD DELETED Viewed

@@ -1,19 +0,0 @@
-web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-web_novel_scraper/__main__.py,sha256=2SnDA-UKx30dO2YiQoNDF8LKC2ySYYoqkUCV6dNnGYM,18181
-web_novel_scraper/config_manager.py,sha256=duwKc6jyLj8NmST5F98qGgpW_o6D6GAenKWsYQ80gcU,3121
-web_novel_scraper/decode.py,sha256=fyHr17TsrtGPYYUMRKB8R5qwdt5yvY4vOx9sB9HCmb4,10809
-web_novel_scraper/file_manager.py,sha256=02sbDB7AYPI2dmVVcBVSsyjqDmfRy3LmQqffua-blio,10640
-web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
-web_novel_scraper/novel_scraper.py,sha256=yos0rQGe6UeHVmfmylkB5FM7gE6NCjUCl-80QZTf5-o,30525
-web_novel_scraper/request_manager.py,sha256=BVWMtUO3HRs44phU2ODkPUjy7tJiIBX_R0rxGGLaJzw,6617
-web_novel_scraper/utils.py,sha256=1V8UwYhpp_27zqPnBDbl7fohu2Z7Sy_4Fq5J2_JAEvU,6405
-web_novel_scraper/version.py,sha256=_GEKEa6BYjBV34SZkSlAR87aCM5Y9G0aSI0LXL52iJg,22
-web_novel_scraper/custom_processor/__init__.py,sha256=iy4tjivMjshSzc52--aa-jK53qu9VwdK-6p4vuQc6oc,103
-web_novel_scraper/custom_processor/custom_processor.py,sha256=h1MPl6JU_C2Mc7SqK70LsNQHpDzSL6QyraMIQ87HcMM,870
-web_novel_scraper/custom_processor/sites/genesis.py,sha256=xV0eybI0ieHR5gn4yWXI74l99Eayhqs16PIYs-BrPjE,1843
-web_novel_scraper/custom_processor/sites/royalroad.py,sha256=_2PsFC_w3RJCUkAPoRn-7R2jlzl3XsG4WYtRaQkp6lg,787
-web_novel_scraper/decode_guide/decode_guide.json,sha256=-xSfkCoCJtcYX7SM2ukO2PRKm_i9vew_szctVNHY1xk,10379
-web_novel_scraper-2.0.3.dist-info/METADATA,sha256=32euwoTPwXoXLcBYfH0iIDnHwVjUXJjVSEDZDsMX03s,8423
-web_novel_scraper-2.0.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-web_novel_scraper-2.0.3.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
-web_novel_scraper-2.0.3.dist-info/RECORD,,

{web_novel_scraper-2.0.3.dist-info → web_novel_scraper-2.1.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{web_novel_scraper-2.0.3.dist-info → web_novel_scraper-2.1.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

web-novel-scraper 2.0.3__py3-none-any.whl → 2.1.0__py3-none-any.whl

web-novel-scraper 2.0.3__py3-none-any.whl → 2.1.0__py3-none-any.whl