web-novel-scraper 1.1.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- web_novel_scraper/__main__.py +116 -94
- web_novel_scraper/config_manager.py +84 -0
- web_novel_scraper/decode.py +49 -38
- web_novel_scraper/decode_guide/decode_guide.json +85 -0
- web_novel_scraper/file_manager.py +226 -257
- web_novel_scraper/novel_scraper.py +90 -46
- web_novel_scraper/request_manager.py +70 -57
- web_novel_scraper/utils.py +139 -2
- web_novel_scraper/version.py +1 -1
- {web_novel_scraper-1.1.0.dist-info → web_novel_scraper-2.0.0.dist-info}/METADATA +1 -1
- web_novel_scraper-2.0.0.dist-info/RECORD +19 -0
- web_novel_scraper-1.1.0.dist-info/RECORD +0 -18
- {web_novel_scraper-1.1.0.dist-info → web_novel_scraper-2.0.0.dist-info}/WHEEL +0 -0
- {web_novel_scraper-1.1.0.dist-info → web_novel_scraper-2.0.0.dist-info}/entry_points.txt +0 -0
web_novel_scraper/__main__.py
CHANGED
@@ -10,7 +10,8 @@ from .decode import Decoder
 from .file_manager import FileManager
 from . import utils
 
-from . import request_manager
+from .request_manager import get_html_content
+from .config_manager import ScraperConfig
 
 logger = logger_manager.create_logger('NOVEL SCRAPPING')
 
@@ -18,7 +19,6 @@ logger = logger_manager.create_logger('NOVEL SCRAPPING')
 @dataclass_json
 @dataclass
 class Metadata:
-    novel_title: str
     author: Optional[str] = None
     start_date: Optional[str] = None
     end_date: Optional[str] = None
@@ -105,10 +105,11 @@ class Chapter:
         return self.chapter_title < another.chapter_title
 
 
-@dataclass_json
+@dataclass_json(undefined=Undefined.EXCLUDE)
 @dataclass
 class Novel:
-    metadata: Metadata
+    metadata: Metadata = None
+    title: str = None
     scraper_behavior: ScraperBehavior = None
     chapters: list[Chapter] = field(default_factory=list)
     toc_main_url: Optional[str] = None
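The decorator change above is load-bearing for the new `Novel.load` flow further down: with dataclasses_json, `Undefined.EXCLUDE` makes `from_dict`/`from_json` drop keys that are not declared on the dataclass. A minimal, self-contained sketch of that library behavior (the class name is illustrative):

    from dataclasses import dataclass
    from dataclasses_json import Undefined, dataclass_json

    @dataclass_json(undefined=Undefined.EXCLUDE)
    @dataclass
    class Example:
        title: str = None

    # A payload carrying a field the 2.0.0 schema no longer declares:
    print(Example.from_dict({'title': 't', 'novel_title': 'legacy'}))
    # Example(title='t') -- the undeclared key is simply excluded

This is presumably what lets a novel.json written by 1.x (which stored `novel_title` on `Metadata`) still deserialize under the 2.0.0 schema.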
@@ -116,30 +117,23 @@ class Novel:
     host: str = None
 
     def __init__(self,
-                 novel_title: str = None,
+                 title: str,
                  toc_main_url: str = None,
                  toc_html: str = None,
                  chapters_url_list: list[str] = None,
                  metadata: Metadata = None,
                  chapters: list[Chapter] = None,
-                 novel_base_dir: str = None,
                  scraper_behavior: ScraperBehavior = None,
-                 host: str = None
-
+                 host: str = None
+                 ):
         if toc_main_url and toc_html:
-            logger.
-
+            logger.critical('There can only be one or toc_main_url or toc_html')
+            raise ValueError('There can only be one or toc_main_url or toc_html')
 
+        self.title = title
+        self.metadata = Metadata()
         if metadata is not None:
             self.metadata = metadata
-        elif novel_title is not None:
-            self.metadata = Metadata(novel_title)
-        else:
-            logger.error('You need to set "novel_title" or "metadata".')
-            sys.exit(1)
-
-        self.file_manager = FileManager(novel_title=self.metadata.novel_title,
-                                        novel_base_dir=novel_base_dir)
 
         if toc_html:
             self.file_manager.add_toc(toc_html)
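Taken together with the `Metadata` change above, the constructor now takes the title directly instead of deriving it from `novel_title`. A hedged before/after sketch (argument values are illustrative):

    # 1.1.0: the title lived on Metadata and reached Novel via novel_title
    # novel = Novel(novel_title='My Novel', toc_main_url='https://example.com/toc')

    # 2.0.0: title is the first, required constructor argument
    novel = Novel('My Novel', toc_main_url='https://example.com/toc')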
@@ -155,9 +149,10 @@ class Novel:
             sys.exit(1)
 
         self.host = host if host else utils.obtain_host(self.toc_main_url)
-        self.decoder = Decoder(self.host)
 
-        self.
+        self.config = None
+        self.file_manager = None
+        self.decoder = None
 
     def __str__(self):
         """
@@ -165,7 +160,7 @@ class Novel:
         """
         toc_info = self.toc_main_url if self.toc_main_url else "TOC added manually"
         attributes = [
-            f"Title: {self.metadata.novel_title}",
+            f"Title: {self.title}",
             f"Author: {self.metadata.author}",
             f"Language: {self.metadata.language}",
             f"Description: {self.metadata.description}",
@@ -177,30 +172,57 @@ class Novel:
         return (f"Novel Info: \n"
                 f"{attributes_str}")
 
+    @staticmethod
+    def load(title: str, cfg: ScraperConfig, novel_base_dir: str | None = None):
+        fm = FileManager(title, cfg.base_novels_dir, novel_base_dir, read_only=True)
+        raw = fm.load_novel_json()
+        if raw is None:
+            logger.debug(f'Novel "{title}" was not found.')
+            raise ValueError(f'Novel "{title}" was not found.')
+        novel = Novel.from_json(raw)
+        novel.config = cfg
+        novel.set_config(cfg=cfg, novel_base_dir=novel_base_dir)
+        return novel
+
     # NOVEL PARAMETERS MANAGEMENT
 
-    def set_scraper_behavior(self, **kwargs) -> None:
+    def set_config(self,
+                   cfg: ScraperConfig = None,
+                   config_file: str = None,
+                   base_novels_dir: str = None,
+                   novel_base_dir: str = None,
+                   decode_guide_file: str = None):
+        if cfg is not None:
+            self.config = cfg
+        else:
+            self.config = ScraperConfig(config_file=config_file,
+                                        base_novels_dir=base_novels_dir,
+                                        decode_guide_file=decode_guide_file)
+
+        self.file_manager = FileManager(title=self.title,
+                                        base_novels_dir=self.config.base_novels_dir,
+                                        novel_base_dir=novel_base_dir)
+
+        self.decoder = Decoder(self.host, self.config.decode_guide_file)
+
+    def set_scraper_behavior(self, save: bool = False, **kwargs) -> None:
         self.scraper_behavior.update_behavior(**kwargs)
-        self.save_novel()
 
     def set_metadata(self, **kwargs) -> None:
         self.metadata.update_behavior(**kwargs)
-        self.save_novel()
 
     def add_tag(self, tag: str) -> bool:
         if tag not in self.metadata.tags:
             self.metadata.tags.append(tag)
-            self.save_novel()
             return True
-        logger.warning(f'Tag "{tag}" already exists on novel {self.metadata.novel_title}')
+        logger.warning(f'Tag "{tag}" already exists on novel {self.title}')
         return False
 
     def remove_tag(self, tag: str) -> bool:
         if tag in self.metadata.tags:
             self.metadata.tags.remove(tag)
-            self.save_novel()
             return True
-        logger.warning(f'Tag "{tag}" doesn\'t exist on novel {self.metadata.novel_title}')
+        logger.warning(f'Tag "{tag}" doesn\'t exist on novel {self.title}')
         return False
 
     def set_cover_image(self, cover_image_path: str) -> bool:
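The new `load`/`set_config` pair replaces the constructor-time `FileManager` wiring: a novel is deserialized first and attached to its config, file manager, and decoder afterwards. An illustrative flow based only on the signatures above (paths are placeholders):

    cfg = ScraperConfig(config_file=None,
                        base_novels_dir='/data/novels',
                        decode_guide_file=None)

    # Raises ValueError if no stored novel.json exists for this title.
    novel = Novel.load('my-novel', cfg)

    # Or re-wire an existing Novel to a different location later:
    novel.set_config(base_novels_dir='/tmp/novels')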
@@ -208,10 +230,9 @@ class Novel:
 
     def set_host(self, host: str) -> None:
         self.host = host
-        self.decoder = Decoder(self.host)
-        self.save_novel()
+        self.decoder.set_host(host)
 
-    def save_novel(self) -> None:
+    def save_novel(self, save: bool = True) -> None:
         self.file_manager.save_novel_json(self.to_dict())
 
     # TABLE OF CONTENTS MANAGEMENT
@@ -224,7 +245,6 @@ class Novel:
             self.decoder = Decoder(self.host)
         elif update_host:
             self.decoder = Decoder(utils.obtain_host(self.toc_main_url))
-        self.save_novel()
 
     def add_toc_html(self, html: str, host: str = None) -> None:
         if self.toc_main_url:
@@ -236,13 +256,11 @@ class Novel:
             self.decoder = Decoder(self.host)
         self.file_manager.add_toc(html)
         # Delete toc_main_url since they are exclusive
-        self.save_novel()
 
     def delete_toc(self):
         self.file_manager.delete_toc()
         self.chapters = []
         self.chapters_url_list = []
-        self.save_novel()
 
     def sync_toc(self, reload_files: bool = False) -> bool:
         # Hard reload will request again the toc files from the toc_main_url
@@ -277,9 +295,17 @@ class Novel:
         if chapters_url_from_toc_content is None:
             logger.error('Chapters url not found on toc_content')
             return False
-
-
-
+        # First we save a list of lists in case we need to invert the order
+        self.chapters_url_list.append(chapters_url_from_toc_content)
+
+        invert = self.decoder.is_index_inverted()
+        self.chapters_url_list = [
+            chapter
+            for chapters_url in (self.chapters_url_list[::-1] if invert else self.chapters_url_list)
+            for chapter in chapters_url
+        ]
+        add_host_to_chapter = self.scraper_behavior.auto_add_host or self.decoder.add_host_to_chapter()
+        if add_host_to_chapter:
             self.chapters_url_list = [
                 f'https://{self.host}{chapter_url}' for chapter_url in self.chapters_url_list]
         self.chapters_url_list = utils.delete_duplicates(
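The comprehension above flattens one URL list per TOC page into a single list, reversing the page order first when the site's index is inverted. The same logic in isolation:

    pages = [['ch1', 'ch2'], ['ch3', 'ch4']]  # one sub-list per TOC page
    invert = True  # what decoder.is_index_inverted() would report

    flat = [
        chapter
        for page in (pages[::-1] if invert else pages)
        for chapter in page
    ]
    print(flat)  # ['ch3', 'ch4', 'ch1', 'ch2'] -- page order flipped, order within a page kept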
@@ -329,6 +355,7 @@ class Novel:
         chapter = self.chapters[chapter_idx]
         if update_html:
             logger.debug('HTML will be updated...')
+
         chapter = self._get_chapter(chapter,
                                     reload=update_html)
 
@@ -429,7 +456,7 @@ class Novel:
         return True
 
 
-
+    ## UTILS
 
 
     def clean_files(self, clean_chapters: bool = True, clean_toc: bool = True, hard_clean: bool = False) -> None:
@@ -445,6 +472,9 @@ class Novel:
     def show_novel_dir(self) -> str:
         return self.file_manager.novel_base_dir
 
+
+    ## PRIVATE HELPERS
+
     def _clean_chapter(self, chapter_html_filename: str, hard_clean: bool = False) -> None:
         hard_clean = hard_clean or self.scraper_behavior.hard_clean
         chapter_html = self.file_manager.load_chapter_html(
@@ -464,6 +494,16 @@ class Novel:
             toc = self.decoder.clean_html(toc, hard_clean=hard_clean)
             self.file_manager.update_toc(toc, i)
 
+    def _request_html_content(self, url: str) -> Optional[str]:
+        request_config = self.decoder.request_config
+        force_flaresolver = request_config.get('force_flaresolver') or self.scraper_behavior.force_flaresolver
+        html_content = get_html_content(url,
+                                        retries=request_config.get('request_retries'),
+                                        timeout=request_config.get('request_timeout'),
+                                        time_between_retries=request_config.get('request_time_between_retries'),
+                                        force_flaresolver=force_flaresolver)
+        return html_content
+
     def _get_chapter(self,
                      chapter: Chapter,
                      reload: bool = False) -> Chapter | None:
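`_request_html_content` centralizes per-host tuning: values from the decode guide's `request_config` mapping are passed through, with `scraper_behavior.force_flaresolver` as an override. A sketch of the data it expects (key names are taken from the calls above; the guide's actual JSON layout is an assumption):

    request_config = {
        'request_retries': 5,
        'request_timeout': 30,
        'request_time_between_retries': 5,
        'force_flaresolver': False,
    }

    html = get_html_content('https://example.com/chapter/1',
                            retries=request_config.get('request_retries'),
                            timeout=request_config.get('request_timeout'),
                            time_between_retries=request_config.get('request_time_between_retries'),
                            force_flaresolver=request_config.get('force_flaresolver'))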
@@ -481,8 +521,7 @@ class Novel:
             return chapter
 
         # Fetch fresh content
-        chapter.chapter_html = request_manager.get_html_content(chapter.chapter_url,
-                                                                force_flaresolver=self.scraper_behavior.force_flaresolver)
+        chapter.chapter_html = self._request_html_content(chapter.chapter_url)
         if not chapter.chapter_html:
             logger.error(f'No content found on link {chapter.chapter_url}')
             return chapter
@@ -501,7 +540,11 @@ class Novel:
         if content:
             return content
 
-
+        if utils.check_incomplete_url(url):
+            url = self.toc_main_url + url
+
+        # Fetch fresh content
+        content = self._request_html_content(url)
         if not content:
             logger.warning(f'No content found on link {url}')
             sys.exit(1)
@@ -579,13 +622,14 @@ class Novel:
         chapter_title = self.decoder.get_chapter_title(chapter.chapter_html)
         if not chapter_title:
             logger.debug('No chapter title found, generating one...')
-            chapter_title = f'{self.metadata.novel_title} Chapter {idx_for_chapter_name}'
+            chapter_title = f'{self.title} Chapter {idx_for_chapter_name}'
         chapter.chapter_title = str(chapter_title)
         logger.debug(f'Chapter title: "{chapter_title}"')
 
         logger.debug('Obtaining chapter content...')
+        save_title_to_content = self.scraper_behavior.save_title_to_content or self.decoder.save_title_to_content()
         chapter.chapter_content = self.decoder.get_chapter_content(chapter.chapter_html,
-                                                                   self.scraper_behavior.save_title_to_content,
+                                                                   save_title_to_content,
                                                                    chapter.chapter_title)
         logger.debug('Chapter successfully decoded')
 
@@ -594,7 +638,7 @@ class Novel:
     def _create_epub_book(self, book_title: str = None, calibre_collection: dict = None) -> epub.EpubBook:
         book = epub.EpubBook()
         if not book_title:
-            book_title = self.metadata.novel_title
+            book_title = self.title
         book.set_title(book_title)
         book.set_language(self.metadata.language)
         book.add_metadata('DC', 'description', self.metadata.description)
@@ -679,11 +723,11 @@ class Novel:
         idx_start = start_chapter - 1
         idx_end = end_chapter
         # We create the epub book
-        book_title = f'{self.metadata.novel_title} Chapters {start_chapter} - {end_chapter}'
+        book_title = f'{self.title} Chapters {start_chapter} - {end_chapter}'
         calibre_collection = None
         # If collection_idx is set, we create a calibre collection
         if collection_idx:
-            calibre_collection = {'title': self.metadata.novel_title,
+            calibre_collection = {'title': self.title,
                                   'idx': str(collection_idx)}
         book = self._create_epub_book(book_title, calibre_collection)
 
web_novel_scraper/request_manager.py
CHANGED
@@ -4,6 +4,7 @@ from . import logger_manager
 from dotenv import load_dotenv
 import json
 import time
+from typing import Optional
 
 load_dotenv()
 
@@ -13,45 +14,52 @@ FORCE_FLARESOLVER = os.getenv('FORCE_FLARESOLVER', '0') == '1'
 
 logger = logger_manager.create_logger('GET HTML CONTENT')
 
-
-
-
-
-
+
+def _get_request(url: str,
+                 timeout: int,
+                 retries: int,
+                 time_between_retries: int) -> Optional[requests.Response]:
+    logger.debug(
+        f'Starting get_request for "{url}" with timeout={timeout}, '
+        f'retries={retries}, '
+        f'time_between_retries={time_between_retries}')
     for attempt in range(retries):
-        logger.debug(f'Attempt {attempt + 1} for {url}')
+        logger.debug(f'Attempt {attempt + 1} for "{url}"')
         try:
             response = requests.get(url, timeout=timeout)
             response.raise_for_status()
-            logger.debug(f'Successful response for {url} on attempt {attempt + 1}')
+            logger.debug(f'Successful response for "{url}" on attempt {attempt + 1}')
             return response
         except requests.exceptions.ConnectionError as e:
-            logger.
+            logger.debug(f'Connection error ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.Timeout as e:
-            logger.
+            logger.debug(f'Request timed out ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.HTTPError as e:
-            logger.
+            logger.debug(f'HTTP error ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.InvalidSchema as e:
-            logger.
+            logger.debug(f'Invalid URL schema for "{url}": {e}')
             break  # Don't retry on invalid schema
         except requests.exceptions.RequestException as e:
-            logger.
+            logger.debug(f'Request failed ({attempt + 1}/{retries}): {e}')
 
         if attempt < retries - 1:
             logger.debug(f'Waiting {time_between_retries} seconds before retrying')
             time.sleep(time_between_retries)  # Wait before retrying
-    logger.debug(f'Failed to get a successful response for {url} after {retries} attempts')
+    logger.debug(f'Failed to get a successful response for "{url}" after {retries} attempts using common HTTP Request')
    return None
 
 
-def
-
-
-
-
-    logger.debug(
+def _get_request_flaresolver(url: str,
+                             timeout: int,
+                             retries: int,
+                             time_between_retries: int,
+                             flaresolver_url: str) -> Optional[requests.Response]:
+    logger.debug(
+        f'Starting get_request_flaresolver for "{url}" with timeout={timeout}, '
+        f'retries={retries}, '
+        f'time_between_retries={time_between_retries}')
     for attempt in range(retries):
-        logger.debug(f'Attempt {attempt + 1} for {url} using FlareSolver')
+        logger.debug(f'Attempt {attempt + 1} for "{url}" using FlareSolver')
         try:
             response = requests.post(
                 flaresolver_url,
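Both helpers share the same retry skeleton: a bounded attempt loop, a pause between attempts, and an early `break` for failures that retrying cannot fix. Reduced to its core (a standalone sketch, not the package's code):

    import time
    import requests

    def fetch_with_retries(url: str, retries: int = 3, timeout: int = 20, wait: int = 3):
        for attempt in range(retries):
            try:
                response = requests.get(url, timeout=timeout)
                response.raise_for_status()
                return response
            except requests.exceptions.InvalidSchema:
                break  # a malformed URL will not improve on retry
            except requests.exceptions.RequestException:
                pass  # retryable: connection errors, timeouts, HTTP errors
            if attempt < retries - 1:
                time.sleep(wait)  # back off before the next attempt
        return None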
@@ -64,71 +72,76 @@ def get_request_flaresolver(url: str,
                 timeout=timeout
             )
             response.raise_for_status()
-            logger.debug(f'Successful response for {url} on attempt {attempt + 1} using FlareSolver')
+            logger.debug(f'Successful response for "{url}" on attempt {attempt + 1} using FlareSolver')
             return response
 
         except requests.exceptions.ConnectionError as e:
-            logger.
+            logger.warning(f'Connection error with flaresolver (URL: "{flaresolver_url}"): {e}')
+            logger.warning(f'If the url is incorrect, set the env variable "FLARESOLVER_URL" to the correct value')
+            logger.warning('If FlareSolver is not installed in your machine, consider installing it.')
+            break  # Don't retry on Connection Error
         except requests.exceptions.Timeout as e:
-            logger.
+            logger.debug(f'Request timed out ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.InvalidSchema as e:
-            logger.
+            logger.debug(f'Invalid FlareSolver URL schema "{flaresolver_url}": {e}')
             break  # Don't retry on invalid schema
         except requests.exceptions.HTTPError as e:
-            logger.
+            logger.debug(f'HTTP error ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.RequestException as e:
-            logger.
+            logger.debug(f'Request failed ({attempt + 1}/{retries}): {e}')
         except json.JSONDecodeError as e:
-            logger.
+            logger.debug(f'Invalid JSON response ({attempt + 1}/{retries}): {e}')
 
         if attempt < retries - 1:
             logger.debug(f'Waiting {time_between_retries} seconds before retrying')
             time.sleep(time_between_retries)  # Wait before retrying
-
+
+    logger.debug(f'Failed to get a successful response for "{url}" using FlareSolver after {retries} attempts')
     return None
 
 
 def get_html_content(url: str,
-                     retries: int =
-
+                     retries: int = 3,
+                     timeout: int = 20,
+                     time_between_retries: int = 3,
                      flaresolver_url: str = FLARESOLVER_URL,
-
-
-
+                     force_flaresolver: bool = FORCE_FLARESOLVER) -> Optional[str]:
+    logger.debug(
+        f'Requesting HTML Content for "{url}" with '
+        f'retries: "{retries}", '
+        f'timeout: "{timeout}", '
+        f'time between retries: "{time_between_retries}"')
+    if force_flaresolver:
+        logger.debug('Will directly try with FlareSolver')
+
     # First try with common HTTP request
     if not force_flaresolver:
-        response =
-
-
-
-
-            logger.
-        else:
-            logger.debug(f'Successfully retrieved HTML content from {url} using common HTTP request')
+        response = _get_request(url,
+                                timeout=timeout,
+                                retries=retries,
+                                time_between_retries=time_between_retries)
+        if response and response.ok:
+            logger.debug(f'Successfully retrieved HTML content from "{url}" using common HTTP request')
             return response.text
 
-    # If flaresolver is disabled, return None
-    if not flaresolver:
-        logger.debug(f'Flaresolver is disabled, returning None for {url}')
-        return None
-
     # Try with Flaresolver
-    logger.debug(f'Trying with Flaresolver for {url}')
-    response =
-
-
-
-
-    if not response.ok:
-        logger.
+    logger.debug(f'Trying with Flaresolver for "{url}"')
+    response = _get_request_flaresolver(url,
+                                        timeout=timeout,
+                                        retries=retries,
+                                        time_between_retries=time_between_retries,
+                                        flaresolver_url=flaresolver_url)
+    if not response or not response.ok:
+        logger.warning(f'Failed all attempts to get HTML content from "{url}')
        return None
 
     response_json = response.json()
     if 'solution' not in response_json:
-        logger.
+        logger.warning(f'No solution found in FlareSolver response for "{url}"')
         return None
     if 'response' not in response_json['solution']:
-        logger.
+        logger.warning(f'No response found in FlareSolver solution for "{url}"')
         return None
-
+
+    logger.debug(f'Successfully retrieved HTML content from "{url}" using FlareSolver')
     return response_json['solution']['response']
web_novel_scraper/utils.py
CHANGED
@@ -1,10 +1,140 @@
-
-
+import json
+import shutil
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
 import hashlib
 from urllib.parse import urlparse
 import re
 import unicodedata
 
+class FileManagerError(Exception):
+    """Exception raised for any exception for file operations"""
+
+class FileOps:
+    """Static helper for disc operations."""
+
+    ## HELPERS
+
+    @staticmethod
+    def _atomic_tmp(path: Path) -> Path:
+        """Temporary file path in the same directory as *path*."""
+        return path.with_suffix(path.suffix + ".tmp")
+
+    ## DIRECTORY MANAGEMENT
+    @staticmethod
+    def ensure_dir(path: Path) -> Path:
+        """Create *path* (and parents) if missing."""
+        try:
+            path.mkdir(parents=True, exist_ok=True)
+            return path
+        except Exception as e:
+            raise FileManagerError(str(e)) from e
+
+    ## READ OPERATIONS
+
+    @staticmethod
+    def read_text(path: Path) -> Optional[str]:
+        """Return UTF-8 contents or None if *path* does not exist."""
+        if not path.exists():
+            return None
+        try:
+            return path.read_text(encoding="utf-8")
+        except Exception as e:
+            raise FileManagerError(str(e)) from e
+
+    @staticmethod
+    def read_json(path: Path | str) -> Optional[dict]:
+        """Return JSON object or None if *path* does not exist."""
+        path = Path(path)
+        raw = FileOps.read_text(path)
+        if raw is None:
+            return None
+        try:
+            return json.loads(raw)
+        except Exception as e:
+            raise FileManagerError(str(e)) from e
+
+    @staticmethod
+    def read_binary(path: Path) -> Optional[bytes]:
+        """Return binary contents or None if *path* does not exist."""
+        if not path.exists():
+            return None
+        try:
+            return path.read_bytes()
+        except Exception as e:
+            raise FileManagerError(str(e)) from e
+
+    ## WRITE OPERATION
+
+    @staticmethod
+    def save_text(path: Path, text: str) -> None:
+        """Atomically write UTF-8 text to *path*."""
+        tmp = FileOps._atomic_tmp(path)
+        try:
+            tmp.write_text(text, encoding="utf-8")
+            tmp.replace(path)
+        except Exception as e:
+            FileOps.delete(tmp)
+            raise FileManagerError(str(e)) from e
+
+    @staticmethod
+    def save_json(path: Path, obj: dict) -> None:
+        """Atomically write pretty-printed JSON to *path*."""
+        tmp = FileOps._atomic_tmp(path)
+        try:
+            tmp.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8")
+            tmp.replace(path)
+        except Exception as e:
+            FileOps.delete(tmp)
+            raise FileManagerError(str(e)) from e
+
+    @staticmethod
+    def save_binary(path: Path, data: bytes) -> None:
+        """Atomically write binary data to *path* (e.g., cover images)."""
+        tmp = FileOps._atomic_tmp(path)
+        try:
+            tmp.write_bytes(data)
+            tmp.replace(path)
+        except Exception as e:
+            FileOps.delete(tmp)
+            raise FileManagerError(str(e)) from e
+
+    ## DELETE/COPY OPERATIONS
+
+    @staticmethod
+    def delete(path: Path) -> None:
+        """Delete *path* if it exists."""
+        try:
+            if path.exists():
+                path.unlink()
+        except Exception as e:
+            raise FileManagerError(str(e)) from e
+
+    @staticmethod
+    def copy(src: Path, dst: Path) -> None:
+        """Copy *src* to *dst*."""
+        try:
+            shutil.copy(src, dst)
+        except Exception as e:
+            raise FileManagerError(str(e)) from e
+
+def _normalize_dirname(name: str) -> str:
+    """
+    Keep whitespace as-is while replacing any other unsupported characters
+    with an underscore.
+    Allowed: letters, digits, underscore, hyphen, and spaces.
+    """
+    # Collapse multiple spaces into a single space (optional; comment out if not desired)
+    name = re.sub(r'\s+', ' ', name.strip())
+
+    # Replace any char that is *not* letter, digit, underscore, hyphen, or space.
+    return re.sub(r'[^\w\-\s]', '_', name)
+
+def now_iso() -> str:
+    """Current timestamp in ISO-8601 (seconds precision)."""
+    return datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds")
 
 def generate_file_name_from_url(url: str) -> str:
     # Parse the URL
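Every `FileOps.save_*` method above uses the same write-to-temp-then-replace pattern; `Path.replace` maps to `os.replace`, which is atomic on POSIX, so a reader never observes a half-written novel.json. The pattern in isolation:

    import json
    from pathlib import Path

    path = Path('novel.json')
    tmp = path.with_suffix(path.suffix + '.tmp')  # novel.json.tmp, same directory
    tmp.write_text(json.dumps({'title': 't'}, ensure_ascii=False, indent=2), encoding='utf-8')
    tmp.replace(path)  # atomic swap: either the old or the new file is visible, never a mix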
@@ -64,3 +194,10 @@ def check_exclusive_params(param1: any, param2: any) -> bool:
 
 def create_volume_id(n: int):
     return f'v{n:02}'
+
+def check_incomplete_url(url: str) -> bool:
+    if url.startswith('?') or url.startswith('#'):
+        return True
+
+    parsed = urlparse(url)
+    return not parsed.scheme or not parsed.netloc
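This helper backs the new relative-URL handling in __main__.py (`url = self.toc_main_url + url`). Its behavior on a few representative inputs:

    from urllib.parse import urlparse

    def check_incomplete_url(url: str) -> bool:  # same body as the helper added above
        if url.startswith('?') or url.startswith('#'):
            return True
        parsed = urlparse(url)
        return not parsed.scheme or not parsed.netloc

    print(check_incomplete_url('?page=2'))                  # True  -- query-only fragment
    print(check_incomplete_url('/toc/page/2'))              # True  -- path without scheme/host
    print(check_incomplete_url('https://example.com/toc'))  # False -- already absolute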
web_novel_scraper/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.1.0"
+__version__ = "2.0.0"
{web_novel_scraper-1.1.0.dist-info → web_novel_scraper-2.0.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: web-novel-scraper
-Version: 1.1.0
+Version: 2.0.0
 Summary: Python tool that allows you to scrape web novels from various sources and save them to more readable formats like EPUB.
 Project-URL: Homepage, https://github.com/ImagineBrkr/web-novel-scraper
 Project-URL: Documentation, https://web-novel-scraper.readthedocs.io