web-novel-scraper 1.1.0__tar.gz → 1.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/PKG-INFO +1 -1
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/web_novel_scraper/decode.py +28 -3
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/web_novel_scraper/decode_guide/decode_guide.json +38 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/web_novel_scraper/novel_scraper.py +27 -6
- web_novel_scraper-1.1.1/web_novel_scraper/request_manager.py +147 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/web_novel_scraper/utils.py +7 -0
- web_novel_scraper-1.1.1/web_novel_scraper/version.py +1 -0
- web_novel_scraper-1.1.0/web_novel_scraper/request_manager.py +0 -134
- web_novel_scraper-1.1.0/web_novel_scraper/version.py +0 -1
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/.github/scripts/update_version.py +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/.github/workflows/build.yaml +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/.github/workflows/publish.yaml +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/.github/workflows/test.yaml +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/.gitignore +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/.readthedocs.yaml +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/README.md +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/docs/Makefile +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/docs/make.bat +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/docs/requirements.txt +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/docs/source/README.rst +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/docs/source/_static/README.md +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/docs/source/commands/chapters_commands.rst +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/docs/source/commands/creation_commands.rst +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/docs/source/commands/index.rst +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/docs/source/commands/output_commands.rst +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/docs/source/commands/toc_commands.rst +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/docs/source/commands/utils_commands.rst +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/docs/source/concepts.rst +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/docs/source/conf.py +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/docs/source/config_options.rst +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/docs/source/index.rst +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/docs/source/tutorial.rst +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/pyproject.toml +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/requirements.txt +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/web_novel_scraper/__init__.py +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/web_novel_scraper/__main__.py +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/web_novel_scraper/custom_processor/__init__.py +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/web_novel_scraper/custom_processor/custom_processor.py +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/web_novel_scraper/custom_processor/sites/genesis.py +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/web_novel_scraper/custom_processor/sites/royalroad.py +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/web_novel_scraper/file_manager.py +0 -0
- {web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/web_novel_scraper/logger_manager.py +0 -0
{web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: web-novel-scraper
-Version: 1.1.0
+Version: 1.1.1
 Summary: Python tool that allows you to scrape web novels from various sources and save them to more readable formats like EPUB.
 Project-URL: Homepage, https://github.com/ImagineBrkr/web-novel-scraper
 Project-URL: Documentation, https://web-novel-scraper.readthedocs.io
{web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/web_novel_scraper/decode.py
RENAMED
@@ -16,6 +16,13 @@ DECODE_GUIDE_FILE = os.getenv('DECODE_GUIDE_FILE', f'{CURRENT_DIR}/decode_guide/
 
 XOR_SEPARATOR = "XOR"
 
+DEFAULT_REQUEST_CONFIG = {
+    "force_flaresolver": False,
+    "request_retries": 3,
+    "request_timeout": 20,
+    "request_time_between_retries": 3
+}
+
 try:
     with open(DECODE_GUIDE_FILE, 'r', encoding='UTF-8') as f:
         DECODE_GUIDE = json.load(f)
@@ -36,11 +43,30 @@ except Exception as e:
 class Decoder:
     host: str
     decode_guide: json
+    request_config: dict
 
     def __init__(self, host: str):
         self.host = host
         self.decode_guide = self._get_element_by_key(
             DECODE_GUIDE, 'host', host)
+        host_request_config = self.get_request_config()
+        self.request_config = DEFAULT_REQUEST_CONFIG | host_request_config
+
+    def get_request_config(self) -> dict:
+        request_config = self.decode_guide.get('request_config')
+        if request_config:
+            logger.debug(f'Host "{self.host}" has a custom request configuration on the Decode Guide file.')
+            return request_config
+
+        return DEFAULT_REQUEST_CONFIG
+
+    def is_index_inverted(self, host:str = None) -> bool:
+        if host:
+            decode_guide = self._get_element_by_key(DECODE_GUIDE, 'host', host)
+        else:
+            decode_guide = self.decode_guide
+
+        return decode_guide.get('index', {}).get('inverted', False)
 
     def get_chapter_urls(self, html: str) -> list[str]:
         logger.debug('Obtaining chapter URLs...')
@@ -130,7 +156,7 @@ class Decoder:
             return ' '.join(elements)
         return elements
 
-    def has_pagination(self, host: str = None):
+    def has_pagination(self, host: str = None) -> bool:
         if host:
             decode_guide = self._get_element_by_key(DECODE_GUIDE, 'host', host)
             return decode_guide['has_pagination']
@@ -245,9 +271,8 @@ class Decoder:
         return elements[0]
 
     @staticmethod
-    def _get_element_by_key(json_data, key, value):
+    def _get_element_by_key(json_data, key: str, value: str):
        for item in json_data:
            if item[key] == value:
                return item
-        logger.warning('Host not found, using default decoder.')
        return json_data[0]
{web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/web_novel_scraper/decode_guide/decode_guide.json
RENAMED
@@ -160,6 +160,9 @@
     },
     {
         "host": "novelbin.com",
+        "request_config": {
+            "force_flaresolver": "true"
+        },
         "has_pagination": false,
         "title": {
             "element": "h2 a.chr-title",
@@ -295,5 +298,40 @@
                 "key": "href"
             }
         }
+    },
+    {
+        "host": "scribblehub.com",
+        "request_config": {
+            "force_flaresolver": "true",
+            "request_timeout": 60
+        },
+        "has_pagination": true,
+        "title": {
+            "selector": "div.chapter-title",
+            "extract": {
+                "type": "text"
+            }
+        },
+        "content": {
+            "selector": "div.chp_raw p",
+            "array": true
+        },
+        "index": {
+            "selector": "div.toc ol li a",
+            "array": true,
+            "inverted": true,
+            "extract": {
+                "type": "attr",
+                "key": "href"
+            }
+        },
+        "next_page": {
+            "selector": "div ul.simple-pagination li a.next",
+            "array": false,
+            "extract": {
+                "type": "attr",
+                "key": "href"
+            }
+        }
     }
 ]
{web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/web_novel_scraper/novel_scraper.py
RENAMED
@@ -10,7 +10,7 @@ from .decode import Decoder
 from .file_manager import FileManager
 from . import utils
 
-from . import
+from .request_manager import get_html_content
 
 logger = logger_manager.create_logger('NOVEL SCRAPPING')
 
@@ -277,8 +277,16 @@ class Novel:
         if chapters_url_from_toc_content is None:
             logger.error('Chapters url not found on toc_content')
             return False
-
-
+        # First we save a list of lists in case we need to invert the order
+        self.chapters_url_list.append(chapters_url_from_toc_content)
+
+        invert = self.decoder.is_index_inverted()
+        self.chapters_url_list = [
+            chapter
+            for chapters_url in (self.chapters_url_list[::-1] if invert else self.chapters_url_list)
+            for chapter in chapters_url
+        ]
+
         if self.scraper_behavior.auto_add_host:
             self.chapters_url_list = [
                 f'https://{self.host}{chapter_url}' for chapter_url in self.chapters_url_list]
@@ -464,6 +472,16 @@ class Novel:
             toc = self.decoder.clean_html(toc, hard_clean=hard_clean)
             self.file_manager.update_toc(toc, i)
 
+    def _request_html_content(self, url: str) -> Optional[str]:
+        request_config = self.decoder.request_config
+        force_flaresolver = request_config.get('force_flaresolver') or self.scraper_behavior.force_flaresolver
+        html_content = get_html_content(url,
+                                        retries=request_config.get('request_retries'),
+                                        timeout=request_config.get('request_timeout'),
+                                        time_between_retries=request_config.get('request_time_between_retries'),
+                                        force_flaresolver=force_flaresolver)
+        return html_content
+
     def _get_chapter(self,
                      chapter: Chapter,
                      reload: bool = False) -> Chapter | None:
@@ -481,8 +499,7 @@ class Novel:
             return chapter
 
         # Fetch fresh content
-        chapter.chapter_html =
-            force_flaresolver=self.scraper_behavior.force_flaresolver)
+        chapter.chapter_html = self._request_html_content(chapter.chapter_url)
         if not chapter.chapter_html:
             logger.error(f'No content found on link {chapter.chapter_url}')
             return chapter
@@ -501,7 +518,11 @@ class Novel:
         if content:
             return content
 
-
+        if utils.check_incomplete_url(url):
+            url = self.toc_main_url + url
+
+        # Fetch fresh content
+        content = self._request_html_content(url)
         if not content:
             logger.warning(f'No content found on link {url}')
             sys.exit(1)
web_novel_scraper-1.1.1/web_novel_scraper/request_manager.py
ADDED
@@ -0,0 +1,147 @@
+import requests
+import os
+from . import logger_manager
+from dotenv import load_dotenv
+import json
+import time
+from typing import Optional
+
+load_dotenv()
+
+FLARESOLVER_URL = os.getenv('SCRAPER_FLARESOLVER_URL', 'http://localhost:8191/v1')
+FLARE_HEADERS = {'Content-Type': 'application/json'}
+FORCE_FLARESOLVER = os.getenv('FORCE_FLARESOLVER', '0') == '1'
+
+logger = logger_manager.create_logger('GET HTML CONTENT')
+
+
+def _get_request(url: str,
+                 timeout: int,
+                 retries: int,
+                 time_between_retries: int) -> Optional[requests.Response]:
+    logger.debug(
+        f'Starting get_request for "{url}" with timeout={timeout}, '
+        f'retries={retries}, '
+        f'time_between_retries={time_between_retries}')
+    for attempt in range(retries):
+        logger.debug(f'Attempt {attempt + 1} for "{url}"')
+        try:
+            response = requests.get(url, timeout=timeout)
+            response.raise_for_status()
+            logger.debug(f'Successful response for "{url}" on attempt {attempt + 1}')
+            return response
+        except requests.exceptions.ConnectionError as e:
+            logger.debug(f'Connection error ({attempt + 1}/{retries}): {e}')
+        except requests.exceptions.Timeout as e:
+            logger.debug(f'Request timed out ({attempt + 1}/{retries}): {e}')
+        except requests.exceptions.HTTPError as e:
+            logger.debug(f'HTTP error ({attempt + 1}/{retries}): {e}')
+        except requests.exceptions.InvalidSchema as e:
+            logger.debug(f'Invalid URL schema for "{url}": {e}')
+            break  # Don't retry on invalid schema
+        except requests.exceptions.RequestException as e:
+            logger.debug(f'Request failed ({attempt + 1}/{retries}): {e}')
+
+        if attempt < retries - 1:
+            logger.debug(f'Waiting {time_between_retries} seconds before retrying')
+            time.sleep(time_between_retries)  # Wait before retrying
+    logger.warning(f'Failed to get a successful response for "{url}" after {retries} attempts using common HTTP Request')
+    return None
+
+
+def _get_request_flaresolver(url: str,
+                             timeout: int,
+                             retries: int,
+                             time_between_retries: int,
+                             flaresolver_url: str) -> Optional[requests.Response]:
+    logger.debug(
+        f'Starting get_request_flaresolver for "{url}" with timeout={timeout}, '
+        f'retries={retries}, '
+        f'time_between_retries={time_between_retries}')
+    for attempt in range(retries):
+        logger.debug(f'Attempt {attempt + 1} for "{url}" using FlareSolver')
+        try:
+            response = requests.post(
+                flaresolver_url,
+                headers=FLARE_HEADERS,
+                json={
+                    'cmd': 'request.get',
+                    'url': url,
+                    'maxTimeout': timeout * 1000
+                },
+                timeout=timeout
+            )
+            response.raise_for_status()
+            logger.debug(f'Successful response for "{url}" on attempt {attempt + 1} using FlareSolver')
+            return response
+
+        except requests.exceptions.ConnectionError as e:
+            logger.warning(f'Connection error with flaresolver (URL: "{flaresolver_url}"): {e}')
+            logger.warning(f'If the url is incorrect, set the env variable "FLARESOLVER_URL" to the correct value')
+            logger.warning('If FlareSolver is not installed in your machine, consider installing it.')
+            break  # Don't retry on Connection Error
+        except requests.exceptions.Timeout as e:
+            logger.debug(f'Request timed out ({attempt + 1}/{retries}): {e}')
+        except requests.exceptions.InvalidSchema as e:
+            logger.debug(f'Invalid FlareSolver URL schema "{flaresolver_url}": {e}')
+            break  # Don't retry on invalid schema
+        except requests.exceptions.HTTPError as e:
+            logger.debug(f'HTTP error ({attempt + 1}/{retries}): {e}')
+        except requests.exceptions.RequestException as e:
+            logger.debug(f'Request failed ({attempt + 1}/{retries}): {e}')
+        except json.JSONDecodeError as e:
+            logger.debug(f'Invalid JSON response ({attempt + 1}/{retries}): {e}')
+
+        if attempt < retries - 1:
+            logger.debug(f'Waiting {time_between_retries} seconds before retrying')
+            time.sleep(time_between_retries)  # Wait before retrying
+
+    logger.warning(f'Failed to get a successful response for "{url}" using FlareSolver after {retries} attempts')
+    return None
+
+
+def get_html_content(url: str,
+                     retries: int = 3,
+                     timeout: int = 20,
+                     time_between_retries: int = 3,
+                     flaresolver_url: str = FLARESOLVER_URL,
+                     force_flaresolver: bool = FORCE_FLARESOLVER) -> Optional[str]:
+    logger.debug(
+        f'Requesting HTML Content for "{url}" with '
+        f'retries: "{retries}", '
+        f'timeout: "{timeout}", '
+        f'time between retries: "{time_between_retries}"')
+    if force_flaresolver:
+        logger.debug('Will directly try with FlareSolver')
+
+    # First try with common HTTP request
+    if not force_flaresolver:
+        response = _get_request(url,
+                                timeout=timeout,
+                                retries=retries,
+                                time_between_retries=time_between_retries)
+        if response and response.ok:
+            logger.debug(f'Successfully retrieved HTML content from "{url}" using common HTTP request')
+            return response.text
+
+    # Try with Flaresolver
+    logger.debug(f'Trying with Flaresolver for "{url}"')
+    response = _get_request_flaresolver(url,
+                                        timeout=timeout,
+                                        retries=retries,
+                                        time_between_retries=time_between_retries,
+                                        flaresolver_url=flaresolver_url)
+    if not response or not response.ok:
+        logger.warning(f'Failed all attempts to get HTML content from "{url}')
+        return None
+
+    response_json = response.json()
+    if 'solution' not in response_json:
+        logger.warning(f'No solution found in FlareSolver response for "{url}"')
+        return None
+    if 'response' not in response_json['solution']:
+        logger.warning(f'No response found in FlareSolver solution for "{url}"')
+        return None
+
+    logger.debug(f'Successfully retrieved HTML content from "{url}" using FlareSolver')
+    return response_json['solution']['response']
{web_novel_scraper-1.1.0 → web_novel_scraper-1.1.1}/web_novel_scraper/utils.py
RENAMED
@@ -64,3 +64,10 @@ def check_exclusive_params(param1: any, param2: any) -> bool:
 
 def create_volume_id(n: int):
     return f'v{n:02}'
+
+def check_incomplete_url(url: str) -> bool:
+    if url.startswith('?') or url.startswith('#'):
+        return True
+
+    parsed = urlparse(url)
+    return not parsed.scheme or not parsed.netloc
web_novel_scraper-1.1.1/web_novel_scraper/version.py
ADDED
@@ -0,0 +1 @@
+__version__ = "1.1.1"
web_novel_scraper-1.1.0/web_novel_scraper/request_manager.py
REMOVED
@@ -1,134 +0,0 @@
-import requests
-import os
-from . import logger_manager
-from dotenv import load_dotenv
-import json
-import time
-
-load_dotenv()
-
-FLARESOLVER_URL = os.getenv('SCRAPER_FLARESOLVER_URL', 'http://localhost:8191/v1')
-FLARE_HEADERS = {'Content-Type': 'application/json'}
-FORCE_FLARESOLVER = os.getenv('FORCE_FLARESOLVER', '0') == '1'
-
-logger = logger_manager.create_logger('GET HTML CONTENT')
-
-def get_request(url: str,
-                timeout: int = 20,
-                retries: int = 3,
-                time_between_retries: int = 1) -> requests.Response | None:
-    logger.debug(f'Starting get_request for {url} with timeout={timeout}, retries={retries}, time_between_retries={time_between_retries}')
-    for attempt in range(retries):
-        logger.debug(f'Attempt {attempt + 1} for {url}')
-        try:
-            response = requests.get(url, timeout=timeout)
-            response.raise_for_status()
-            logger.debug(f'Successful response for {url} on attempt {attempt + 1}')
-            return response
-        except requests.exceptions.ConnectionError as e:
-            logger.error(f'Connection error ({attempt + 1}/{retries}): {e}')
-        except requests.exceptions.Timeout as e:
-            logger.error(f'Request timed out ({attempt + 1}/{retries}): {e}')
-        except requests.exceptions.HTTPError as e:
-            logger.error(f'HTTP error ({attempt + 1}/{retries}): {e}')
-        except requests.exceptions.InvalidSchema as e:
-            logger.error(f'Invalid URL schema for "{url}": {e}')
-            break  # Don't retry on invalid schema
-        except requests.exceptions.RequestException as e:
-            logger.error(f'Request failed ({attempt + 1}/{retries}): {e}')
-
-        if attempt < retries - 1:
-            logger.debug(f'Waiting {time_between_retries} seconds before retrying')
-            time.sleep(time_between_retries)  # Wait before retrying
-    logger.debug(f'Failed to get a successful response for {url} after {retries} attempts')
-    return None
-
-
-def get_request_flaresolver(url: str,
-                            timeout: int = 20,
-                            flaresolver_url: str = FLARESOLVER_URL,
-                            retries: int = 3,
-                            time_between_retries: int = 1) -> requests.Response | None:
-    logger.debug(f'Starting get_request_flaresolver for {url} with timeout={timeout}, retries={retries}, time_between_retries={time_between_retries}')
-    for attempt in range(retries):
-        logger.debug(f'Attempt {attempt + 1} for {url} using FlareSolver')
-        try:
-            response = requests.post(
-                flaresolver_url,
-                headers=FLARE_HEADERS,
-                json={
-                    'cmd': 'request.get',
-                    'url': url,
-                    'maxTimeout': timeout * 1000
-                },
-                timeout=timeout
-            )
-            response.raise_for_status()
-            logger.debug(f'Successful response for {url} on attempt {attempt + 1} using FlareSolver')
-            return response
-
-        except requests.exceptions.ConnectionError as e:
-            logger.error(f'Connection error ({attempt + 1}/{retries}), check FlareSolver host: {flaresolver_url}: {e}')
-        except requests.exceptions.Timeout as e:
-            logger.error(f'Request timed out ({attempt + 1}/{retries}): {e}')
-        except requests.exceptions.InvalidSchema as e:
-            logger.error(f'Invalid FlareSolver URL schema "{flaresolver_url}": {e}')
-            break  # Don't retry on invalid schema
-        except requests.exceptions.HTTPError as e:
-            logger.error(f'HTTP error ({attempt + 1}/{retries}): {e}')
-        except requests.exceptions.RequestException as e:
-            logger.error(f'Request failed ({attempt + 1}/{retries}): {e}')
-        except json.JSONDecodeError as e:
-            logger.error(f'Invalid JSON response ({attempt + 1}/{retries}): {e}')
-
-        if attempt < retries - 1:
-            logger.debug(f'Waiting {time_between_retries} seconds before retrying')
-            time.sleep(time_between_retries)  # Wait before retrying
-    logger.debug(f'Failed to get a successful response for {url} using FlareSolver after {retries} attempts')
-    return None
-
-
-def get_html_content(url: str,
-                     retries: int = 5,
-                     flaresolver: bool = True,
-                     flaresolver_url: str = FLARESOLVER_URL,
-                     time_between_retries: int = 1,
-                     force_flaresolver: bool = FORCE_FLARESOLVER) -> str | None:
-    logger.debug(f'Starting get_html_content for {url} with retries={retries}, flaresolver={flaresolver}, flaresolver_url={flaresolver_url}, time_between_retries={time_between_retries}, force_flaresolver={force_flaresolver}')
-    # First try with common HTTP request
-    if not force_flaresolver:
-        response = get_request(
-            url, timeout=20, retries=retries, time_between_retries=time_between_retries)
-        if not response:
-            logger.warning(f'Failed to get response from {url} using common HTTP request')
-        elif not response.ok:
-            logger.warning(f'Response with errors from {url} using common HTTP request')
-        else:
-            logger.debug(f'Successfully retrieved HTML content from {url} using common HTTP request')
-            return response.text
-
-    # If flaresolver is disabled, return None
-    if not flaresolver:
-        logger.debug(f'Flaresolver is disabled, returning None for {url}')
-        return None
-
-    # Try with Flaresolver
-    logger.debug(f'Trying with Flaresolver for {url}')
-    response = get_request_flaresolver(
-        url, timeout=20, flaresolver_url=flaresolver_url, time_between_retries=time_between_retries)
-    if not response:
-        logger.critical(f'Failed to get response from {url} using FlareSolver')
-        return None
-    if not response.ok:
-        logger.critical(f'Response with errors from {url} using FlareSolver')
-        return None
-
-    response_json = response.json()
-    if 'solution' not in response_json:
-        logger.critical(f'No solution found in FlareSolver response for {url}')
-        return None
-    if 'response' not in response_json['solution']:
-        logger.critical(f'No response found in FlareSolver solution for {url}')
-        return None
-    logger.debug(f'Successfully retrieved HTML content from {url} using FlareSolver')
-    return response_json['solution']['response']
web_novel_scraper-1.1.0/web_novel_scraper/version.py
REMOVED
@@ -1 +0,0 @@
-__version__ = "1.1.0"