web-novel-scraper 1.0.4__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- web_novel_scraper/decode.py +29 -5
- web_novel_scraper/decode_guide/decode_guide.json +40 -2
- web_novel_scraper/file_manager.py +7 -10
- web_novel_scraper/novel_scraper.py +52 -33
- web_novel_scraper/request_manager.py +70 -57
- web_novel_scraper/utils.py +7 -0
- web_novel_scraper/version.py +1 -1
- {web_novel_scraper-1.0.4.dist-info → web_novel_scraper-1.1.1.dist-info}/METADATA +1 -1
- web_novel_scraper-1.1.1.dist-info/RECORD +18 -0
- web_novel_scraper-1.0.4.dist-info/RECORD +0 -18
- {web_novel_scraper-1.0.4.dist-info → web_novel_scraper-1.1.1.dist-info}/WHEEL +0 -0
- {web_novel_scraper-1.0.4.dist-info → web_novel_scraper-1.1.1.dist-info}/entry_points.txt +0 -0
web_novel_scraper/decode.py
CHANGED
@@ -12,11 +12,17 @@ logger = logger_manager.create_logger('DECODE HTML')
 
 CURRENT_DIR = Path(__file__).resolve().parent
 
-DECODE_GUIDE_FILE = os.getenv('DECODE_GUIDE_FILE', f'{
-    CURRENT_DIR}/decode_guide/decode_guide.json')
+DECODE_GUIDE_FILE = os.getenv('DECODE_GUIDE_FILE', f'{CURRENT_DIR}/decode_guide/decode_guide.json')
 
 XOR_SEPARATOR = "XOR"
 
+DEFAULT_REQUEST_CONFIG = {
+    "force_flaresolver": False,
+    "request_retries": 3,
+    "request_timeout": 20,
+    "request_time_between_retries": 3
+}
+
 try:
     with open(DECODE_GUIDE_FILE, 'r', encoding='UTF-8') as f:
         DECODE_GUIDE = json.load(f)
@@ -37,11 +43,30 @@ except Exception as e:
 class Decoder:
     host: str
     decode_guide: json
+    request_config: dict
 
     def __init__(self, host: str):
         self.host = host
         self.decode_guide = self._get_element_by_key(
             DECODE_GUIDE, 'host', host)
+        host_request_config = self.get_request_config()
+        self.request_config = DEFAULT_REQUEST_CONFIG | host_request_config
+
+    def get_request_config(self) -> dict:
+        request_config = self.decode_guide.get('request_config')
+        if request_config:
+            logger.debug(f'Host "{self.host}" has a custom request configuration on the Decode Guide file.')
+            return request_config
+
+        return DEFAULT_REQUEST_CONFIG
+
+    def is_index_inverted(self, host:str = None) -> bool:
+        if host:
+            decode_guide = self._get_element_by_key(DECODE_GUIDE, 'host', host)
+        else:
+            decode_guide = self.decode_guide
+
+        return decode_guide.get('index', {}).get('inverted', False)
 
     def get_chapter_urls(self, html: str) -> list[str]:
         logger.debug('Obtaining chapter URLs...')
@@ -131,7 +156,7 @@ class Decoder:
             return ' '.join(elements)
         return elements
 
-    def has_pagination(self, host: str = None):
+    def has_pagination(self, host: str = None) -> bool:
         if host:
             decode_guide = self._get_element_by_key(DECODE_GUIDE, 'host', host)
             return decode_guide['has_pagination']
@@ -246,9 +271,8 @@ class Decoder:
         return elements[0]
 
     @staticmethod
-    def _get_element_by_key(json_data, key, value):
+    def _get_element_by_key(json_data, key: str, value: str):
        for item in json_data:
            if item[key] == value:
                return item
-        logger.warning('Host not found, using default decoder.')
        return json_data[0]
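The new DEFAULT_REQUEST_CONFIG is combined with the per-host request_config from the decode guide via the dict union operator, so host values override the defaults key by key. A minimal standalone sketch of that merge (example code, not part of the diff; the host dict mirrors the scribblehub.com entry added to the decode guide below):

DEFAULT_REQUEST_CONFIG = {
    "force_flaresolver": False,
    "request_retries": 3,
    "request_timeout": 20,
    "request_time_between_retries": 3,
}
host_request_config = {"force_flaresolver": "true", "request_timeout": 60}

merged = DEFAULT_REQUEST_CONFIG | host_request_config  # right-hand side wins on conflicts
print(merged["request_timeout"])   # 60, taken from the host entry
print(merged["request_retries"])   # 3, kept from the defaults

Note that the guide stores "force_flaresolver" as the string "true" rather than a boolean; the string is simply truthy when the scraper later checks it.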
web_novel_scraper/decode_guide/decode_guide.json
CHANGED
@@ -130,10 +130,10 @@
       }
     },
     "content": {
-      "element":
+      "element": null,
       "id": null,
       "class": null,
-      "selector":
+      "selector": "div#chr-content p",
       "attributes": null,
       "array": true
     },
@@ -160,6 +160,9 @@
   },
   {
     "host": "novelbin.com",
+    "request_config": {
+      "force_flaresolver": "true"
+    },
     "has_pagination": false,
     "title": {
       "element": "h2 a.chr-title",
@@ -295,5 +298,40 @@
         "key": "href"
       }
     }
+  },
+  {
+    "host": "scribblehub.com",
+    "request_config": {
+      "force_flaresolver": "true",
+      "request_timeout": 60
+    },
+    "has_pagination": true,
+    "title": {
+      "selector": "div.chapter-title",
+      "extract": {
+        "type": "text"
+      }
+    },
+    "content": {
+      "selector": "div.chp_raw p",
+      "array": true
+    },
+    "index": {
+      "selector": "div.toc ol li a",
+      "array": true,
+      "inverted": true,
+      "extract": {
+        "type": "attr",
+        "key": "href"
+      }
+    },
+    "next_page": {
+      "selector": "div ul.simple-pagination li a.next",
+      "array": false,
+      "extract": {
+        "type": "attr",
+        "key": "href"
+      }
+    }
   }
 ]
web_novel_scraper/file_manager.py
CHANGED
@@ -45,10 +45,10 @@ class FileManager:
                  novel_config_dir: str = None,
                  read_only: bool = False):
         logger.debug(f'Initializing FileManager for novel: {novel_title}, read_only: {read_only}')
-        novel_base_dir = novel_base_dir if novel_base_dir else
-
-        novel_config_dir = novel_config_dir if novel_config_dir else
-
+        novel_base_dir = novel_base_dir if novel_base_dir else \
+            f'{SCRAPER_BASE_DATA_DIR}/{novel_title}'
+        novel_config_dir = novel_config_dir if novel_config_dir else \
+            f'{SCRAPER_BASE_CONFIG_DIR}/{novel_title}'
 
         logger.debug(f'Using base dir: {novel_base_dir}, config dir: {novel_config_dir}')
 
@@ -243,8 +243,7 @@ def _save_content_to_file(filepath: Path, content: str | dict, is_json: bool = F
     except (OSError, IOError) as e:
         logger.error(f'Error saving file "{filepath}": {e}')
     except Exception as e:
-        logger.error(f'Unexpected error saving file "{
-            filepath}": {e}', exc_info=True)
+        logger.error(f'Unexpected error saving file "{filepath}": {e}', exc_info=True)
 
 
 def _read_content_from_file(filepath: Path, bytes: bool = False) -> str:
@@ -263,8 +262,7 @@ def _read_content_from_file(filepath: Path, bytes: bool = False) -> str:
         logger.error(f'Error reading file "{filepath}": {e}')
     except Exception as e:
         # Log for unexpected errors
-        logger.error(f'Unexpected error reading file "{
-            filepath}": {e}', exc_info=True)
+        logger.error(f'Unexpected error reading file "{filepath}": {e}', exc_info=True)
 
 
 def _delete_file(filepath: Path) -> None:
@@ -280,8 +278,7 @@ def _delete_file(filepath: Path) -> None:
         logger.error(f'Error deleting file "{filepath}": {e}')
     except Exception as e:
         # Log any unexpected errors
-        logger.error(f'Unexpected error deleting file "{
-            filepath}": {e}', exc_info=True)
+        logger.error(f'Unexpected error deleting file "{filepath}": {e}', exc_info=True)
 
 
 def _copy_file(source: Path, destination: Path) -> bool:
web_novel_scraper/novel_scraper.py
CHANGED
@@ -10,7 +10,7 @@ from .decode import Decoder
 from .file_manager import FileManager
 from . import utils
 
-from . import
+from .request_manager import get_html_content
 
 logger = logger_manager.create_logger('NOVEL SCRAPPING')
 
@@ -39,9 +39,11 @@ class Metadata:
         """
         Dynamic string representation of the configuration.
         """
-        attributes = [f"{field.name}=
-
-
+        attributes = [(f"{field.name}="
+                       f"{getattr(self, field.name)}") for field in fields(self)]
+        attributes_str = '\n'.join(attributes)
+        return (f"Metadata: \n"
+                f"{attributes_str}")
 
 
 @dataclass_json
@@ -70,9 +72,11 @@ class ScraperBehavior:
         """
         Dynamic string representation of the configuration.
         """
-        attributes = [f"{field.name}=
-
-
+        attributes = [(f"{field.name}="
+                       f"{getattr(self, field.name)}") for field in fields(self)]
+        attributes_str = '\n'.join(attributes)
+        return (f"Scraper Behavior: \n"
+                f"{attributes_str}")
 
 
 @dataclass_json(undefined=Undefined.EXCLUDE)
@@ -169,7 +173,9 @@ class Novel:
             f"TOC Info: {toc_info}",
             f"Host: {self.host}"
         ]
-
+        attributes_str = '\n'.join(attributes)
+        return (f"Novel Info: \n"
+                f"{attributes_str}")
 
     # NOVEL PARAMETERS MANAGEMENT
 
@@ -186,8 +192,7 @@ class Novel:
             self.metadata.tags.append(tag)
             self.save_novel()
             return True
-        logger.warning(f'Tag "{tag}" already exists on novel {
-            self.metadata.novel_title}')
+        logger.warning(f'Tag "{tag}" already exists on novel {self.metadata.novel_title}')
         return False
 
     def remove_tag(self, tag: str) -> bool:
@@ -195,8 +200,7 @@ class Novel:
             self.metadata.tags.remove(tag)
             self.save_novel()
             return True
-        logger.warning(f'Tag "{tag}" doesn\'t exist on novel {
-            self.metadata.novel_title}')
+        logger.warning(f'Tag "{tag}" doesn\'t exist on novel {self.metadata.novel_title}')
         return False
 
     def set_cover_image(self, cover_image_path: str) -> bool:
@@ -273,8 +277,16 @@ class Novel:
         if chapters_url_from_toc_content is None:
             logger.error('Chapters url not found on toc_content')
             return False
-
-
+        # First we save a list of lists in case we need to invert the order
+        self.chapters_url_list.append(chapters_url_from_toc_content)
+
+        invert = self.decoder.is_index_inverted()
+        self.chapters_url_list = [
+            chapter
+            for chapters_url in (self.chapters_url_list[::-1] if invert else self.chapters_url_list)
+            for chapter in chapters_url
+        ]
+
         if self.scraper_behavior.auto_add_host:
             self.chapters_url_list = [
                 f'https://{self.host}{chapter_url}' for chapter_url in self.chapters_url_list]
@@ -298,11 +310,9 @@ class Novel:
         chapter_list = "Chapters List:\n"
         for i, chapter in enumerate(self.chapters):
             chapter_list += f"Chapter {i + 1}:\n"
-            chapter_list += f" Title: {
-                chapter.chapter_title if chapter.chapter_title else 'Title not yet scrapped'}\n"
+            chapter_list += f" Title: {chapter.chapter_title if chapter.chapter_title else 'Title not yet scrapped'}\n"
             chapter_list += f" URL: {chapter.chapter_url}\n"
-            chapter_list += f" Filename: {
-                chapter.chapter_html_filename if chapter.chapter_html_filename else 'File not yet requested'}\n"
+            chapter_list += f" Filename: {chapter.chapter_html_filename if chapter.chapter_html_filename else 'File not yet requested'}\n"
         return chapter_list
 
     def scrap_chapter(self, chapter_url: str = None, chapter_idx: int = None, update_html: bool = False) -> Chapter:
@@ -379,8 +389,7 @@ class Novel:
             chapter = self._get_chapter(
                 chapter=chapter, reload=update_html)
             if not chapter.chapter_html_filename:
-                logger.critical(f'Error requesting chapter {
-                    i} with url {chapter.chapter_url}')
+                logger.critical(f'Error requesting chapter {i} with url {chapter.chapter_url}')
                 return False
 
             self._add_or_update_chapter_data(chapter=chapter, link_idx=i,
@@ -402,16 +411,15 @@ class Novel:
         self.sync_toc()
 
         if start_chapter > len(self.chapters):
-            logger.info(f'The start chapter is bigger than the number of chapters saved ({
-                len(self.chapters)})')
+            logger.info(f'The start chapter is bigger than the number of chapters saved ({len(self.chapters)})')
             return
 
         if not end_chapter:
             end_chapter = len(self.chapters)
         elif end_chapter > len(self.chapters):
             end_chapter = len(self.chapters)
-            logger.info(f'The end chapter is bigger than the number of chapters,
-                end_chapter}.')
+            logger.info(f'The end chapter is bigger than the number of chapters, '
+                        f'automatically setting it to {end_chapter}.')
 
         idx = 1
         start = start_chapter
@@ -421,8 +429,8 @@ class Novel:
                                          end_chapter=end,
                                          collection_idx=idx)
             if not result:
-                logger.critical(f'Error with saving novel to epub, with start chapter:
-                    start_chapter} and end chapter: {end_chapter}')
+                logger.critical(f'Error with saving novel to epub, with start chapter: '
+                                f'{start_chapter} and end chapter: {end_chapter}')
                 return False
             start = start + chapters_by_book
             idx = idx + 1
@@ -464,6 +472,16 @@ class Novel:
             toc = self.decoder.clean_html(toc, hard_clean=hard_clean)
             self.file_manager.update_toc(toc, i)
 
+    def _request_html_content(self, url: str) -> Optional[str]:
+        request_config = self.decoder.request_config
+        force_flaresolver = request_config.get('force_flaresolver') or self.scraper_behavior.force_flaresolver
+        html_content = get_html_content(url,
+                                        retries=request_config.get('request_retries'),
+                                        timeout=request_config.get('request_timeout'),
+                                        time_between_retries=request_config.get('request_time_between_retries'),
+                                        force_flaresolver=force_flaresolver)
+        return html_content
+
     def _get_chapter(self,
                      chapter: Chapter,
                      reload: bool = False) -> Chapter | None:
@@ -481,8 +499,7 @@ class Novel:
             return chapter
 
         # Fetch fresh content
-        chapter.chapter_html =
-            force_flaresolver=self.scraper_behavior.force_flaresolver)
+        chapter.chapter_html = self._request_html_content(chapter.chapter_url)
         if not chapter.chapter_html:
             logger.error(f'No content found on link {chapter.chapter_url}')
             return chapter
@@ -501,7 +518,11 @@ class Novel:
         if content:
             return content
 
-
+        if utils.check_incomplete_url(url):
+            url = self.toc_main_url + url
+
+        # Fetch fresh content
+        content = self._request_html_content(url)
         if not content:
             logger.warning(f'No content found on link {url}')
             sys.exit(1)
@@ -679,8 +700,7 @@ class Novel:
         idx_start = start_chapter - 1
         idx_end = end_chapter
         # We create the epub book
-        book_title = f'{self.metadata.novel_title} Chapters {
-            start_chapter} - {end_chapter}'
+        book_title = f'{self.metadata.novel_title} Chapters {start_chapter} - {end_chapter}'
         calibre_collection = None
         # If collection_idx is set, we create a calibre collection
         if collection_idx:
@@ -692,8 +712,7 @@ class Novel:
             book = self._add_chapter_to_epub_book(chapter=chapter,
                                                   book=book)
             if book is None:
-                logger.critical(f'Error saving epub {book_title}, could not decode chapter {
-                    chapter} using host {self.host}')
+                logger.critical(f'Error saving epub {book_title}, could not decode chapter {chapter} using host {self.host}')
                 return False
 
         book.add_item(epub.EpubNcx())
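With is_index_inverted wired in, the TOC-parsing code now keeps one list of chapter URLs per TOC page and flattens them, reversing the page order first when the decode guide marks the index as inverted. A small standalone sketch of that comprehension (toy data, not package code):

pages = [["ch3", "ch4"], ["ch1", "ch2"]]   # one URL list per TOC page, newest page first
inverted = True

flat = [
    chapter
    for page in (pages[::-1] if inverted else pages)
    for chapter in page
]
print(flat)   # ['ch1', 'ch2', 'ch3', 'ch4']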
web_novel_scraper/request_manager.py
CHANGED
@@ -4,6 +4,7 @@ from . import logger_manager
 from dotenv import load_dotenv
 import json
 import time
+from typing import Optional
 
 load_dotenv()
 
@@ -13,45 +14,52 @@ FORCE_FLARESOLVER = os.getenv('FORCE_FLARESOLVER', '0') == '1'
 
 logger = logger_manager.create_logger('GET HTML CONTENT')
 
-
-
-
-
-
+
+def _get_request(url: str,
+                 timeout: int,
+                 retries: int,
+                 time_between_retries: int) -> Optional[requests.Response]:
+    logger.debug(
+        f'Starting get_request for "{url}" with timeout={timeout}, '
+        f'retries={retries}, '
+        f'time_between_retries={time_between_retries}')
     for attempt in range(retries):
-        logger.debug(f'Attempt {attempt + 1} for {url}')
+        logger.debug(f'Attempt {attempt + 1} for "{url}"')
         try:
             response = requests.get(url, timeout=timeout)
             response.raise_for_status()
-            logger.debug(f'Successful response for {url} on attempt {attempt + 1}')
+            logger.debug(f'Successful response for "{url}" on attempt {attempt + 1}')
             return response
         except requests.exceptions.ConnectionError as e:
-            logger.
+            logger.debug(f'Connection error ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.Timeout as e:
-            logger.
+            logger.debug(f'Request timed out ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.HTTPError as e:
-            logger.
+            logger.debug(f'HTTP error ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.InvalidSchema as e:
-            logger.
+            logger.debug(f'Invalid URL schema for "{url}": {e}')
             break  # Don't retry on invalid schema
         except requests.exceptions.RequestException as e:
-            logger.
+            logger.debug(f'Request failed ({attempt + 1}/{retries}): {e}')
 
         if attempt < retries - 1:
             logger.debug(f'Waiting {time_between_retries} seconds before retrying')
             time.sleep(time_between_retries)  # Wait before retrying
-    logger.
+    logger.warning(f'Failed to get a successful response for "{url}" after {retries} attempts using common HTTP Request')
     return None
 
 
-def
-
-
-
-
-    logger.debug(
+def _get_request_flaresolver(url: str,
+                             timeout: int,
+                             retries: int,
+                             time_between_retries: int,
+                             flaresolver_url: str) -> Optional[requests.Response]:
+    logger.debug(
+        f'Starting get_request_flaresolver for "{url}" with timeout={timeout}, '
+        f'retries={retries}, '
+        f'time_between_retries={time_between_retries}')
     for attempt in range(retries):
-        logger.debug(f'Attempt {attempt + 1} for {url} using FlareSolver')
+        logger.debug(f'Attempt {attempt + 1} for "{url}" using FlareSolver')
         try:
             response = requests.post(
                 flaresolver_url,
@@ -64,71 +72,76 @@ def get_request_flaresolver(url: str,
                 timeout=timeout
             )
             response.raise_for_status()
-            logger.debug(f'Successful response for {url} on attempt {attempt + 1} using FlareSolver')
+            logger.debug(f'Successful response for "{url}" on attempt {attempt + 1} using FlareSolver')
             return response
 
         except requests.exceptions.ConnectionError as e:
-            logger.
+            logger.warning(f'Connection error with flaresolver (URL: "{flaresolver_url}"): {e}')
+            logger.warning(f'If the url is incorrect, set the env variable "FLARESOLVER_URL" to the correct value')
+            logger.warning('If FlareSolver is not installed in your machine, consider installing it.')
+            break  # Don't retry on Connection Error
         except requests.exceptions.Timeout as e:
-            logger.
+            logger.debug(f'Request timed out ({attempt + 1}/{retries}): {e}')
        except requests.exceptions.InvalidSchema as e:
-            logger.
+            logger.debug(f'Invalid FlareSolver URL schema "{flaresolver_url}": {e}')
             break  # Don't retry on invalid schema
         except requests.exceptions.HTTPError as e:
-            logger.
+            logger.debug(f'HTTP error ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.RequestException as e:
-            logger.
+            logger.debug(f'Request failed ({attempt + 1}/{retries}): {e}')
         except json.JSONDecodeError as e:
-            logger.
+            logger.debug(f'Invalid JSON response ({attempt + 1}/{retries}): {e}')
 
         if attempt < retries - 1:
             logger.debug(f'Waiting {time_between_retries} seconds before retrying')
             time.sleep(time_between_retries)  # Wait before retrying
-
+
+    logger.warning(f'Failed to get a successful response for "{url}" using FlareSolver after {retries} attempts')
     return None
 
 
 def get_html_content(url: str,
-                     retries: int =
-
+                     retries: int = 3,
+                     timeout: int = 20,
+                     time_between_retries: int = 3,
                      flaresolver_url: str = FLARESOLVER_URL,
-
-
-
+                     force_flaresolver: bool = FORCE_FLARESOLVER) -> Optional[str]:
+    logger.debug(
+        f'Requesting HTML Content for "{url}" with '
+        f'retries: "{retries}", '
+        f'timeout: "{timeout}", '
+        f'time between retries: "{time_between_retries}"')
+    if force_flaresolver:
+        logger.debug('Will directly try with FlareSolver')
+
     # First try with common HTTP request
     if not force_flaresolver:
-        response =
-
-
-
-
-        logger.
-        else:
-            logger.debug(f'Successfully retrieved HTML content from {url} using common HTTP request')
+        response = _get_request(url,
+                                timeout=timeout,
+                                retries=retries,
+                                time_between_retries=time_between_retries)
+        if response and response.ok:
+            logger.debug(f'Successfully retrieved HTML content from "{url}" using common HTTP request')
             return response.text
 
-    # If flaresolver is disabled, return None
-    if not flaresolver:
-        logger.debug(f'Flaresolver is disabled, returning None for {url}')
-        return None
-
     # Try with Flaresolver
-    logger.debug(f'Trying with Flaresolver for {url}')
-    response =
-
-
-
-
-    if not response.ok:
-        logger.
+    logger.debug(f'Trying with Flaresolver for "{url}"')
+    response = _get_request_flaresolver(url,
+                                        timeout=timeout,
+                                        retries=retries,
+                                        time_between_retries=time_between_retries,
+                                        flaresolver_url=flaresolver_url)
+    if not response or not response.ok:
+        logger.warning(f'Failed all attempts to get HTML content from "{url}')
         return None
 
     response_json = response.json()
     if 'solution' not in response_json:
-        logger.
+        logger.warning(f'No solution found in FlareSolver response for "{url}"')
         return None
     if 'response' not in response_json['solution']:
-        logger.
+        logger.warning(f'No response found in FlareSolver solution for "{url}"')
        return None
-
+
+    logger.debug(f'Successfully retrieved HTML content from "{url}" using FlareSolver')
    return response_json['solution']['response']
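After this refactor, get_html_content takes its retry settings as parameters and only falls back to FlareSolverr when the plain request fails or when force_flaresolver is set. A hedged usage sketch (the chapter URL is hypothetical; parameters match the new signature shown above):

from web_novel_scraper.request_manager import get_html_content

html = get_html_content(
    "https://novelbin.com/b/some-novel/chapter-1",   # hypothetical URL
    retries=3,
    timeout=20,
    time_between_retries=3,
    force_flaresolver=False,   # True skips the plain HTTP attempt entirely
)
if html is None:
    print("Both the plain request and the FlareSolver fallback failed")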
web_novel_scraper/utils.py
CHANGED
@@ -64,3 +64,10 @@ def check_exclusive_params(param1: any, param2: any) -> bool:
 
 def create_volume_id(n: int):
     return f'v{n:02}'
+
+def check_incomplete_url(url: str) -> bool:
+    if url.startswith('?') or url.startswith('#'):
+        return True
+
+    parsed = urlparse(url)
+    return not parsed.scheme or not parsed.netloc
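The new check_incomplete_url helper treats query-only, fragment-only, and scheme-less URLs as incomplete so the caller can prepend the TOC base URL. A quick sketch of its expected behaviour (standalone example; it assumes urlparse comes from urllib.parse, whose import is not shown in this hunk):

from urllib.parse import urlparse

def check_incomplete_url(url: str) -> bool:
    if url.startswith('?') or url.startswith('#'):
        return True
    parsed = urlparse(url)
    return not parsed.scheme or not parsed.netloc

print(check_incomplete_url('?page=2'))                        # True
print(check_incomplete_url('/chapter-2'))                     # True (no scheme or host)
print(check_incomplete_url('https://scribblehub.com/ch/2/'))  # False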
web_novel_scraper/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.
+__version__ = "1.1.1"
{web_novel_scraper-1.0.4.dist-info → web_novel_scraper-1.1.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: web-novel-scraper
-Version: 1.
+Version: 1.1.1
 Summary: Python tool that allows you to scrape web novels from various sources and save them to more readable formats like EPUB.
 Project-URL: Homepage, https://github.com/ImagineBrkr/web-novel-scraper
 Project-URL: Documentation, https://web-novel-scraper.readthedocs.io
web_novel_scraper-1.1.1.dist-info/RECORD
ADDED
@@ -0,0 +1,18 @@
+web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+web_novel_scraper/__main__.py,sha256=OQQVX5CttmAkUwdrnjBSjKPaoh_boUI2ysHi3rLGOSs,17769
+web_novel_scraper/decode.py,sha256=U-78PhJ4SU2hiUmfAWeWGEBJ3YSoCW3Lupw9cUqQuI0,11013
+web_novel_scraper/file_manager.py,sha256=qAqgqtaRb7QyVtyEOW2cMhPYWdKM6nJ69weUCYKwVtM,11862
+web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
+web_novel_scraper/novel_scraper.py,sha256=DsYnY15s8cZZ2w8pRvmD3_NJw54xarhcnEQdvnTD8XI,29421
+web_novel_scraper/request_manager.py,sha256=WU8LG6D_fqmDapX6wpVwpQQSItcNU8Qb9dMAlLCYI8U,6621
+web_novel_scraper/utils.py,sha256=dPp7D2ji9mC2nFydqxsJ_9vkAntA_3VTt8ZmG-F1f78,2270
+web_novel_scraper/version.py,sha256=q8_5C0f-8mHWNb6mMw02zlYPnEGXBqvOmP3z0CEwZKM,22
+web_novel_scraper/custom_processor/__init__.py,sha256=iy4tjivMjshSzc52--aa-jK53qu9VwdK-6p4vuQc6oc,103
+web_novel_scraper/custom_processor/custom_processor.py,sha256=h1MPl6JU_C2Mc7SqK70LsNQHpDzSL6QyraMIQ87HcMM,870
+web_novel_scraper/custom_processor/sites/genesis.py,sha256=xV0eybI0ieHR5gn4yWXI74l99Eayhqs16PIYs-BrPjE,1843
+web_novel_scraper/custom_processor/sites/royalroad.py,sha256=_2PsFC_w3RJCUkAPoRn-7R2jlzl3XsG4WYtRaQkp6lg,787
+web_novel_scraper/decode_guide/decode_guide.json,sha256=gNVencLtK0HmZPlubTm1wA7eatWADCxJ_LCOYWHWuA0,8556
+web_novel_scraper-1.1.1.dist-info/METADATA,sha256=ow5piBhzzo4mZ0secvHrqc4KCCt4VInpDa09Qo9l4AE,8423
+web_novel_scraper-1.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+web_novel_scraper-1.1.1.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
+web_novel_scraper-1.1.1.dist-info/RECORD,,
@@ -1,18 +0,0 @@
|
|
1
|
-
web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
web_novel_scraper/__main__.py,sha256=OQQVX5CttmAkUwdrnjBSjKPaoh_boUI2ysHi3rLGOSs,17769
|
3
|
-
web_novel_scraper/decode.py,sha256=0RMHx1buR01KhuXiVQwdSpCGN960Xh-iPw1eYHxLeDg,10181
|
4
|
-
web_novel_scraper/file_manager.py,sha256=Q3DH-c8fWz9sziMps7A3p_sQoDMEpqBket07Agh-__Q,11898
|
5
|
-
web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
|
6
|
-
web_novel_scraper/novel_scraper.py,sha256=Notk0O94HZrO-MVKDGCBL0VopApFchn13FO2_N3ZfRM,28418
|
7
|
-
web_novel_scraper/request_manager.py,sha256=VtGpRi5b_Dp3h8viCdt7yMCb9M21Lk7oLC_Q_0EkXH8,6448
|
8
|
-
web_novel_scraper/utils.py,sha256=vq5ROuPv04k3MhbksTe0ni_yP0i_a7T_33mkBB1DUbQ,2076
|
9
|
-
web_novel_scraper/version.py,sha256=acuR_XSJzp4OrQ5T8-Ac5gYe48mUwObuwjRmisFmZ7k,22
|
10
|
-
web_novel_scraper/custom_processor/__init__.py,sha256=iy4tjivMjshSzc52--aa-jK53qu9VwdK-6p4vuQc6oc,103
|
11
|
-
web_novel_scraper/custom_processor/custom_processor.py,sha256=h1MPl6JU_C2Mc7SqK70LsNQHpDzSL6QyraMIQ87HcMM,870
|
12
|
-
web_novel_scraper/custom_processor/sites/genesis.py,sha256=xV0eybI0ieHR5gn4yWXI74l99Eayhqs16PIYs-BrPjE,1843
|
13
|
-
web_novel_scraper/custom_processor/sites/royalroad.py,sha256=_2PsFC_w3RJCUkAPoRn-7R2jlzl3XsG4WYtRaQkp6lg,787
|
14
|
-
web_novel_scraper/decode_guide/decode_guide.json,sha256=IBBzbSSVO-yQ5PCY7o8ralnaonMwBpEZW1v1TStiVqc,7582
|
15
|
-
web_novel_scraper-1.0.4.dist-info/METADATA,sha256=IhvDqK_Gz1POjzbH2cQVUYql1dhZJvdHnM9R--le0uc,8423
|
16
|
-
web_novel_scraper-1.0.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
17
|
-
web_novel_scraper-1.0.4.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
|
18
|
-
web_novel_scraper-1.0.4.dist-info/RECORD,,
|
{web_novel_scraper-1.0.4.dist-info → web_novel_scraper-1.1.1.dist-info}/WHEEL
File without changes
{web_novel_scraper-1.0.4.dist-info → web_novel_scraper-1.1.1.dist-info}/entry_points.txt
File without changes