web-novel-scraper 2.0.3__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,18 +1,22 @@
1
1
  import requests
2
2
  import os
3
- from . import logger_manager
4
- from dotenv import load_dotenv
5
3
  import json
6
4
  import time
7
5
  from typing import Optional
8
6
 
7
+ from dotenv import load_dotenv
8
+ from urllib.parse import urlparse
9
+
10
+ from .logger_manager import create_logger
11
+ from .utils import ValidationError, NetworkError
12
+
9
13
  load_dotenv()
10
14
 
11
15
  FLARESOLVER_URL = os.getenv('SCRAPER_FLARESOLVER_URL', 'http://localhost:8191/v1')
12
16
  FLARE_HEADERS = {'Content-Type': 'application/json'}
13
17
  FORCE_FLARESOLVER = os.getenv('FORCE_FLARESOLVER', '0') == '1'
14
18
 
15
- logger = logger_manager.create_logger('GET HTML CONTENT')
19
+ logger = create_logger('GET HTML CONTENT')
16
20
 
17
21
 
18
22
  def _get_request(url: str,
@@ -38,7 +42,6 @@ def _get_request(url: str,
38
42
  logger.debug(f'HTTP error ({attempt + 1}/{retries}): {e}')
39
43
  except requests.exceptions.InvalidSchema as e:
40
44
  logger.debug(f'Invalid URL schema for "{url}": {e}')
41
- break # Don't retry on invalid schema
42
45
  except requests.exceptions.RequestException as e:
43
46
  logger.debug(f'Request failed ({attempt + 1}/{retries}): {e}')
44
47
 
@@ -106,6 +109,36 @@ def get_html_content(url: str,
106
109
  time_between_retries: int = 3,
107
110
  flaresolver_url: str = FLARESOLVER_URL,
108
111
  force_flaresolver: bool = FORCE_FLARESOLVER) -> Optional[str]:
112
+ """
113
+ Retrieves HTML content from a URL with support for anti-bot protection bypass.
114
+
115
+ Implements a two-step strategy:
116
+ 1. Attempts a standard HTTP request first (unless force_flaresolver is True)
117
+ 2. Falls back to FlareSolver if the standard request fails
118
+
119
+ Args:
120
+ url (str): The URL to fetch content from
121
+ retries (int, optional): Number of retry attempts for failed requests. Defaults to 3.
122
+ timeout (int, optional): Timeout in seconds for each request. Defaults to 20.
123
+ time_between_retries (int, optional): Delay in seconds between retries. Defaults to 3.
124
+ flaresolver_url (str, optional): URL of the FlareSolver service.
125
+ Defaults to FLARESOLVER_URL env variable.
126
+ force_flaresolver (bool, optional): If True, skips standard HTTP request and uses
127
+ FlareSolver directly. Defaults to FORCE_FLARESOLVER env variable.
128
+
129
+ Returns:
130
+ Optional[str]: The HTML content if successful, None otherwise
131
+
132
+ Raises:
133
+ ValidationError: If the provided URL is invalid
134
+ NetworkError: If all attempts to fetch content fail
135
+ """
136
+
137
+ parsed_url = urlparse(url)
138
+ if not all([parsed_url.scheme, parsed_url.netloc]):
139
+ raise ValidationError(f"Invalid URL format: {url}")
140
+
141
+
109
142
  logger.debug(
110
143
  f'Requesting HTML Content for "{url}" with '
111
144
  f'retries: "{retries}", '
@@ -132,16 +165,16 @@ def get_html_content(url: str,
132
165
  time_between_retries=time_between_retries,
133
166
  flaresolver_url=flaresolver_url)
134
167
  if not response or not response.ok:
135
- logger.warning(f'Failed all attempts to get HTML content from "{url}')
136
- return None
137
-
138
- response_json = response.json()
139
- if 'solution' not in response_json:
140
- logger.warning(f'No solution found in FlareSolver response for "{url}"')
141
- return None
142
- if 'response' not in response_json['solution']:
143
- logger.warning(f'No response found in FlareSolver solution for "{url}"')
144
- return None
145
-
146
- logger.debug(f'Successfully retrieved HTML content from "{url}" using FlareSolver')
147
- return response_json['solution']['response']
168
+ logger.debug(f'Failed all attempts to get HTML content from "{url}')
169
+ raise NetworkError(f'Failed all attempts to get HTML content from "{url}"')
170
+
171
+ try:
172
+ response_json = response.json()
173
+ response_content = response_json.get('solution', {}).get('response')
174
+ if not response_content:
175
+ raise NetworkError(f'No solution response for "{url}"')
176
+
177
+ return response_content
178
+ except json.JSONDecodeError as e:
179
+ logger.error(f'Failed to decode FlareSolver response: {e}')
180
+ raise NetworkError(f'Invalid FlareSolver response for "{url}"')
@@ -9,9 +9,30 @@ from urllib.parse import urlparse
9
9
  import re
10
10
  import unicodedata
11
11
 
12
- class FileManagerError(Exception):
12
+ def _always(_: object) -> bool:
13
+ """Predicate used by dataclasses_json to skip a field."""
14
+ return True
15
+
16
+ ## EXCEPTIONS
17
+
18
+ class ScraperError(Exception):
19
+ """Default Exception for Scraper Exceptions"""
20
+
21
+ class NetworkError(ScraperError):
22
+ """Exception raised for any exception for request operations"""
23
+
24
+ class DecodeError(ScraperError):
25
+ """Exception raised for any exception for decoding operations"""
26
+
27
+ class FileManagerError(ScraperError):
13
28
  """Exception raised for any exception for file operations"""
14
29
 
30
+ class ValidationError(ScraperError):
31
+ """Exception raised for any exception for invalid values"""
32
+
33
+
34
+ ## FILE OPERATIONS HELPER
35
+
15
36
  class FileOps:
16
37
  """Static helper for disc operations."""
17
38
 
@@ -1 +1 @@
1
- __version__ = "2.0.3"
1
+ __version__ = "2.1.0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: web-novel-scraper
3
- Version: 2.0.3
3
+ Version: 2.1.0
4
4
  Summary: Python tool that allows you to scrape web novels from various sources and save them to more readable formats like EPUB.
5
5
  Project-URL: Homepage, https://github.com/ImagineBrkr/web-novel-scraper
6
6
  Project-URL: Documentation, https://web-novel-scraper.readthedocs.io
@@ -0,0 +1,20 @@
1
+ web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ web_novel_scraper/__main__.py,sha256=QI8oncZiSuyXu2D377uFq5BpAy4U6uyedEs-PfkY2Ek,18312
3
+ web_novel_scraper/config_manager.py,sha256=KVnCIVs1nsEI3W6YolTV9FejjwJXtUYhDn2dmHr9SC4,3193
4
+ web_novel_scraper/decode.py,sha256=iLjXlKUh8UgTkgsQupVAhW-eUDxYrcOTx_EEgjV5s_A,15186
5
+ web_novel_scraper/file_manager.py,sha256=6OZAuaUykUmS0fLzcSVyQSqcPanjEVpjrQ-0pa2vKls,16767
6
+ web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
7
+ web_novel_scraper/models.py,sha256=7lS1Mb5h0qunNOslwCl6vQiuZGsMNw7h1BYK03yHeFM,2227
8
+ web_novel_scraper/novel_scraper.py,sha256=hWZvi1c3Nnq4qSfLsfZPsjQj-UTlWaKwTlqyPtFe36I,50506
9
+ web_novel_scraper/request_manager.py,sha256=yfXaPCeTu7FGiWnR3rImkaCWETfT3d5zxwOIylEP-_M,7954
10
+ web_novel_scraper/utils.py,sha256=DO1bTgyGNqjHrBlMYdS0ljwegTSC_kYsIbxi-0KeuY8,6953
11
+ web_novel_scraper/version.py,sha256=Xybt2skBZamGMNlLuOX1IG-h4uIxqUDGAO8MIGWrJac,22
12
+ web_novel_scraper/custom_processor/__init__.py,sha256=iy4tjivMjshSzc52--aa-jK53qu9VwdK-6p4vuQc6oc,103
13
+ web_novel_scraper/custom_processor/custom_processor.py,sha256=h1MPl6JU_C2Mc7SqK70LsNQHpDzSL6QyraMIQ87HcMM,870
14
+ web_novel_scraper/custom_processor/sites/genesis.py,sha256=xV0eybI0ieHR5gn4yWXI74l99Eayhqs16PIYs-BrPjE,1843
15
+ web_novel_scraper/custom_processor/sites/royalroad.py,sha256=_2PsFC_w3RJCUkAPoRn-7R2jlzl3XsG4WYtRaQkp6lg,787
16
+ web_novel_scraper/decode_guide/decode_guide.json,sha256=ecIBugJ8ddzFH0rJHkonkrE6ogSXf98jVypQ-GnVDnE,10418
17
+ web_novel_scraper-2.1.0.dist-info/METADATA,sha256=2EeY0XmULbnE6AFUgsuFPT0NYZSnwk6nVzb45rh_tow,8423
18
+ web_novel_scraper-2.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
19
+ web_novel_scraper-2.1.0.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
20
+ web_novel_scraper-2.1.0.dist-info/RECORD,,
@@ -1,19 +0,0 @@
1
- web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- web_novel_scraper/__main__.py,sha256=2SnDA-UKx30dO2YiQoNDF8LKC2ySYYoqkUCV6dNnGYM,18181
3
- web_novel_scraper/config_manager.py,sha256=duwKc6jyLj8NmST5F98qGgpW_o6D6GAenKWsYQ80gcU,3121
4
- web_novel_scraper/decode.py,sha256=fyHr17TsrtGPYYUMRKB8R5qwdt5yvY4vOx9sB9HCmb4,10809
5
- web_novel_scraper/file_manager.py,sha256=02sbDB7AYPI2dmVVcBVSsyjqDmfRy3LmQqffua-blio,10640
6
- web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
7
- web_novel_scraper/novel_scraper.py,sha256=yos0rQGe6UeHVmfmylkB5FM7gE6NCjUCl-80QZTf5-o,30525
8
- web_novel_scraper/request_manager.py,sha256=BVWMtUO3HRs44phU2ODkPUjy7tJiIBX_R0rxGGLaJzw,6617
9
- web_novel_scraper/utils.py,sha256=1V8UwYhpp_27zqPnBDbl7fohu2Z7Sy_4Fq5J2_JAEvU,6405
10
- web_novel_scraper/version.py,sha256=_GEKEa6BYjBV34SZkSlAR87aCM5Y9G0aSI0LXL52iJg,22
11
- web_novel_scraper/custom_processor/__init__.py,sha256=iy4tjivMjshSzc52--aa-jK53qu9VwdK-6p4vuQc6oc,103
12
- web_novel_scraper/custom_processor/custom_processor.py,sha256=h1MPl6JU_C2Mc7SqK70LsNQHpDzSL6QyraMIQ87HcMM,870
13
- web_novel_scraper/custom_processor/sites/genesis.py,sha256=xV0eybI0ieHR5gn4yWXI74l99Eayhqs16PIYs-BrPjE,1843
14
- web_novel_scraper/custom_processor/sites/royalroad.py,sha256=_2PsFC_w3RJCUkAPoRn-7R2jlzl3XsG4WYtRaQkp6lg,787
15
- web_novel_scraper/decode_guide/decode_guide.json,sha256=-xSfkCoCJtcYX7SM2ukO2PRKm_i9vew_szctVNHY1xk,10379
16
- web_novel_scraper-2.0.3.dist-info/METADATA,sha256=32euwoTPwXoXLcBYfH0iIDnHwVjUXJjVSEDZDsMX03s,8423
17
- web_novel_scraper-2.0.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
18
- web_novel_scraper-2.0.3.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
19
- web_novel_scraper-2.0.3.dist-info/RECORD,,