web-novel-scraper 2.0.3__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- web_novel_scraper/__main__.py +123 -68
- web_novel_scraper/config_manager.py +12 -12
- web_novel_scraper/decode.py +225 -80
- web_novel_scraper/decode_guide/decode_guide.json +1 -0
- web_novel_scraper/file_manager.py +292 -110
- web_novel_scraper/models.py +76 -0
- web_novel_scraper/novel_scraper.py +895 -424
- web_novel_scraper/request_manager.py +50 -17
- web_novel_scraper/utils.py +22 -1
- web_novel_scraper/version.py +1 -1
- {web_novel_scraper-2.0.3.dist-info → web_novel_scraper-2.1.0.dist-info}/METADATA +1 -1
- web_novel_scraper-2.1.0.dist-info/RECORD +20 -0
- web_novel_scraper-2.0.3.dist-info/RECORD +0 -19
- {web_novel_scraper-2.0.3.dist-info → web_novel_scraper-2.1.0.dist-info}/WHEEL +0 -0
- {web_novel_scraper-2.0.3.dist-info → web_novel_scraper-2.1.0.dist-info}/entry_points.txt +0 -0
@@ -1,18 +1,22 @@
|
|
1
1
|
import requests
|
2
2
|
import os
|
3
|
-
from . import logger_manager
|
4
|
-
from dotenv import load_dotenv
|
5
3
|
import json
|
6
4
|
import time
|
7
5
|
from typing import Optional
|
8
6
|
|
7
|
+
from dotenv import load_dotenv
|
8
|
+
from urllib.parse import urlparse
|
9
|
+
|
10
|
+
from .logger_manager import create_logger
|
11
|
+
from .utils import ValidationError, NetworkError
|
12
|
+
|
9
13
|
load_dotenv()
|
10
14
|
|
11
15
|
FLARESOLVER_URL = os.getenv('SCRAPER_FLARESOLVER_URL', 'http://localhost:8191/v1')
|
12
16
|
FLARE_HEADERS = {'Content-Type': 'application/json'}
|
13
17
|
FORCE_FLARESOLVER = os.getenv('FORCE_FLARESOLVER', '0') == '1'
|
14
18
|
|
15
|
-
logger =
|
19
|
+
logger = create_logger('GET HTML CONTENT')
|
16
20
|
|
17
21
|
|
18
22
|
def _get_request(url: str,
|
@@ -38,7 +42,6 @@ def _get_request(url: str,
|
|
38
42
|
logger.debug(f'HTTP error ({attempt + 1}/{retries}): {e}')
|
39
43
|
except requests.exceptions.InvalidSchema as e:
|
40
44
|
logger.debug(f'Invalid URL schema for "{url}": {e}')
|
41
|
-
break # Don't retry on invalid schema
|
42
45
|
except requests.exceptions.RequestException as e:
|
43
46
|
logger.debug(f'Request failed ({attempt + 1}/{retries}): {e}')
|
44
47
|
|
@@ -106,6 +109,36 @@ def get_html_content(url: str,
|
|
106
109
|
time_between_retries: int = 3,
|
107
110
|
flaresolver_url: str = FLARESOLVER_URL,
|
108
111
|
force_flaresolver: bool = FORCE_FLARESOLVER) -> Optional[str]:
|
112
|
+
"""
|
113
|
+
Retrieves HTML content from a URL with support for anti-bot protection bypass.
|
114
|
+
|
115
|
+
Implements a two-step strategy:
|
116
|
+
1. Attempts a standard HTTP request first (unless force_flaresolver is True)
|
117
|
+
2. Falls back to FlareSolver if the standard request fails
|
118
|
+
|
119
|
+
Args:
|
120
|
+
url (str): The URL to fetch content from
|
121
|
+
retries (int, optional): Number of retry attempts for failed requests. Defaults to 3.
|
122
|
+
timeout (int, optional): Timeout in seconds for each request. Defaults to 20.
|
123
|
+
time_between_retries (int, optional): Delay in seconds between retries. Defaults to 3.
|
124
|
+
flaresolver_url (str, optional): URL of the FlareSolver service.
|
125
|
+
Defaults to FLARESOLVER_URL env variable.
|
126
|
+
force_flaresolver (bool, optional): If True, skips standard HTTP request and uses
|
127
|
+
FlareSolver directly. Defaults to FORCE_FLARESOLVER env variable.
|
128
|
+
|
129
|
+
Returns:
|
130
|
+
Optional[str]: The HTML content if successful, None otherwise
|
131
|
+
|
132
|
+
Raises:
|
133
|
+
ValidationError: If the provided URL is invalid
|
134
|
+
NetworkError: If all attempts to fetch content fail
|
135
|
+
"""
|
136
|
+
|
137
|
+
parsed_url = urlparse(url)
|
138
|
+
if not all([parsed_url.scheme, parsed_url.netloc]):
|
139
|
+
raise ValidationError(f"Invalid URL format: {url}")
|
140
|
+
|
141
|
+
|
109
142
|
logger.debug(
|
110
143
|
f'Requesting HTML Content for "{url}" with '
|
111
144
|
f'retries: "{retries}", '
|
@@ -132,16 +165,16 @@ def get_html_content(url: str,
|
|
132
165
|
time_between_retries=time_between_retries,
|
133
166
|
flaresolver_url=flaresolver_url)
|
134
167
|
if not response or not response.ok:
|
135
|
-
logger.
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
return
|
145
|
-
|
146
|
-
|
147
|
-
|
168
|
+
logger.debug(f'Failed all attempts to get HTML content from "{url}')
|
169
|
+
raise NetworkError(f'Failed all attempts to get HTML content from "{url}"')
|
170
|
+
|
171
|
+
try:
|
172
|
+
response_json = response.json()
|
173
|
+
response_content = response_json.get('solution', {}).get('response')
|
174
|
+
if not response_content:
|
175
|
+
raise NetworkError(f'No solution response for "{url}"')
|
176
|
+
|
177
|
+
return response_content
|
178
|
+
except json.JSONDecodeError as e:
|
179
|
+
logger.error(f'Failed to decode FlareSolver response: {e}')
|
180
|
+
raise NetworkError(f'Invalid FlareSolver response for "{url}"')
|
web_novel_scraper/utils.py
CHANGED
@@ -9,9 +9,30 @@ from urllib.parse import urlparse
|
|
9
9
|
import re
|
10
10
|
import unicodedata
|
11
11
|
|
12
|
-
|
12
|
+
def _always(_: object) -> bool:
|
13
|
+
"""Predicate used by dataclasses_json to skip a field."""
|
14
|
+
return True
|
15
|
+
|
16
|
+
## EXCEPTIONS
|
17
|
+
|
18
|
+
class ScraperError(Exception):
|
19
|
+
"""Default Exception for Scraper Exceptions"""
|
20
|
+
|
21
|
+
class NetworkError(ScraperError):
|
22
|
+
"""Exception raised for any exception for request operations"""
|
23
|
+
|
24
|
+
class DecodeError(ScraperError):
|
25
|
+
"""Exception raised for any exception for decoding operations"""
|
26
|
+
|
27
|
+
class FileManagerError(ScraperError):
|
13
28
|
"""Exception raised for any exception for file operations"""
|
14
29
|
|
30
|
+
class ValidationError(ScraperError):
|
31
|
+
"""Exception raised for any exception for invalid values"""
|
32
|
+
|
33
|
+
|
34
|
+
## FILE OPERATIONS HELPER
|
35
|
+
|
15
36
|
class FileOps:
|
16
37
|
"""Static helper for disc operations."""
|
17
38
|
|
web_novel_scraper/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "2.0.3"
|
1
|
+
__version__ = "2.1.0"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: web-novel-scraper
|
3
|
-
Version: 2.0.3
|
3
|
+
Version: 2.1.0
|
4
4
|
Summary: Python tool that allows you to scrape web novels from various sources and save them to more readable formats like EPUB.
|
5
5
|
Project-URL: Homepage, https://github.com/ImagineBrkr/web-novel-scraper
|
6
6
|
Project-URL: Documentation, https://web-novel-scraper.readthedocs.io
|
@@ -0,0 +1,20 @@
|
|
1
|
+
web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
+
web_novel_scraper/__main__.py,sha256=QI8oncZiSuyXu2D377uFq5BpAy4U6uyedEs-PfkY2Ek,18312
|
3
|
+
web_novel_scraper/config_manager.py,sha256=KVnCIVs1nsEI3W6YolTV9FejjwJXtUYhDn2dmHr9SC4,3193
|
4
|
+
web_novel_scraper/decode.py,sha256=iLjXlKUh8UgTkgsQupVAhW-eUDxYrcOTx_EEgjV5s_A,15186
|
5
|
+
web_novel_scraper/file_manager.py,sha256=6OZAuaUykUmS0fLzcSVyQSqcPanjEVpjrQ-0pa2vKls,16767
|
6
|
+
web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
|
7
|
+
web_novel_scraper/models.py,sha256=7lS1Mb5h0qunNOslwCl6vQiuZGsMNw7h1BYK03yHeFM,2227
|
8
|
+
web_novel_scraper/novel_scraper.py,sha256=hWZvi1c3Nnq4qSfLsfZPsjQj-UTlWaKwTlqyPtFe36I,50506
|
9
|
+
web_novel_scraper/request_manager.py,sha256=yfXaPCeTu7FGiWnR3rImkaCWETfT3d5zxwOIylEP-_M,7954
|
10
|
+
web_novel_scraper/utils.py,sha256=DO1bTgyGNqjHrBlMYdS0ljwegTSC_kYsIbxi-0KeuY8,6953
|
11
|
+
web_novel_scraper/version.py,sha256=Xybt2skBZamGMNlLuOX1IG-h4uIxqUDGAO8MIGWrJac,22
|
12
|
+
web_novel_scraper/custom_processor/__init__.py,sha256=iy4tjivMjshSzc52--aa-jK53qu9VwdK-6p4vuQc6oc,103
|
13
|
+
web_novel_scraper/custom_processor/custom_processor.py,sha256=h1MPl6JU_C2Mc7SqK70LsNQHpDzSL6QyraMIQ87HcMM,870
|
14
|
+
web_novel_scraper/custom_processor/sites/genesis.py,sha256=xV0eybI0ieHR5gn4yWXI74l99Eayhqs16PIYs-BrPjE,1843
|
15
|
+
web_novel_scraper/custom_processor/sites/royalroad.py,sha256=_2PsFC_w3RJCUkAPoRn-7R2jlzl3XsG4WYtRaQkp6lg,787
|
16
|
+
web_novel_scraper/decode_guide/decode_guide.json,sha256=ecIBugJ8ddzFH0rJHkonkrE6ogSXf98jVypQ-GnVDnE,10418
|
17
|
+
web_novel_scraper-2.1.0.dist-info/METADATA,sha256=2EeY0XmULbnE6AFUgsuFPT0NYZSnwk6nVzb45rh_tow,8423
|
18
|
+
web_novel_scraper-2.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
19
|
+
web_novel_scraper-2.1.0.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
|
20
|
+
web_novel_scraper-2.1.0.dist-info/RECORD,,
|
@@ -1,19 +0,0 @@
|
|
1
|
-
web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
web_novel_scraper/__main__.py,sha256=2SnDA-UKx30dO2YiQoNDF8LKC2ySYYoqkUCV6dNnGYM,18181
|
3
|
-
web_novel_scraper/config_manager.py,sha256=duwKc6jyLj8NmST5F98qGgpW_o6D6GAenKWsYQ80gcU,3121
|
4
|
-
web_novel_scraper/decode.py,sha256=fyHr17TsrtGPYYUMRKB8R5qwdt5yvY4vOx9sB9HCmb4,10809
|
5
|
-
web_novel_scraper/file_manager.py,sha256=02sbDB7AYPI2dmVVcBVSsyjqDmfRy3LmQqffua-blio,10640
|
6
|
-
web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
|
7
|
-
web_novel_scraper/novel_scraper.py,sha256=yos0rQGe6UeHVmfmylkB5FM7gE6NCjUCl-80QZTf5-o,30525
|
8
|
-
web_novel_scraper/request_manager.py,sha256=BVWMtUO3HRs44phU2ODkPUjy7tJiIBX_R0rxGGLaJzw,6617
|
9
|
-
web_novel_scraper/utils.py,sha256=1V8UwYhpp_27zqPnBDbl7fohu2Z7Sy_4Fq5J2_JAEvU,6405
|
10
|
-
web_novel_scraper/version.py,sha256=_GEKEa6BYjBV34SZkSlAR87aCM5Y9G0aSI0LXL52iJg,22
|
11
|
-
web_novel_scraper/custom_processor/__init__.py,sha256=iy4tjivMjshSzc52--aa-jK53qu9VwdK-6p4vuQc6oc,103
|
12
|
-
web_novel_scraper/custom_processor/custom_processor.py,sha256=h1MPl6JU_C2Mc7SqK70LsNQHpDzSL6QyraMIQ87HcMM,870
|
13
|
-
web_novel_scraper/custom_processor/sites/genesis.py,sha256=xV0eybI0ieHR5gn4yWXI74l99Eayhqs16PIYs-BrPjE,1843
|
14
|
-
web_novel_scraper/custom_processor/sites/royalroad.py,sha256=_2PsFC_w3RJCUkAPoRn-7R2jlzl3XsG4WYtRaQkp6lg,787
|
15
|
-
web_novel_scraper/decode_guide/decode_guide.json,sha256=-xSfkCoCJtcYX7SM2ukO2PRKm_i9vew_szctVNHY1xk,10379
|
16
|
-
web_novel_scraper-2.0.3.dist-info/METADATA,sha256=32euwoTPwXoXLcBYfH0iIDnHwVjUXJjVSEDZDsMX03s,8423
|
17
|
-
web_novel_scraper-2.0.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
18
|
-
web_novel_scraper-2.0.3.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
|
19
|
-
web_novel_scraper-2.0.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|