web-novel-scraper 1.1.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -10,7 +10,8 @@ from .decode import Decoder
 from .file_manager import FileManager
 from . import utils

-from . import request_manager
+from .request_manager import get_html_content
+from .config_manager import ScraperConfig

 logger = logger_manager.create_logger('NOVEL SCRAPPING')

@@ -18,7 +19,6 @@ logger = logger_manager.create_logger('NOVEL SCRAPPING')
 @dataclass_json
 @dataclass
 class Metadata:
-    novel_title: str
     author: Optional[str] = None
     start_date: Optional[str] = None
     end_date: Optional[str] = None
@@ -105,10 +105,11 @@ class Chapter:
         return self.chapter_title < another.chapter_title


-@dataclass_json
+@dataclass_json(undefined=Undefined.EXCLUDE)
 @dataclass
 class Novel:
-    metadata: Metadata
+    metadata: Metadata = None
+    title: str = None
     scraper_behavior: ScraperBehavior = None
     chapters: list[Chapter] = field(default_factory=list)
     toc_main_url: Optional[str] = None
@@ -116,30 +117,23 @@ class Novel:
     host: str = None

     def __init__(self,
-                 novel_title: str = None,
+                 title: str,
                  toc_main_url: str = None,
                  toc_html: str = None,
                  chapters_url_list: list[str] = None,
                  metadata: Metadata = None,
                  chapters: list[Chapter] = None,
-                 novel_base_dir: str = None,
                  scraper_behavior: ScraperBehavior = None,
-                 host: str = None):
-
+                 host: str = None
+                 ):
         if toc_main_url and toc_html:
-            logger.error('There can only be one or toc_main_url or toc_html')
-            sys.exit(1)
+            logger.critical('There can only be one or toc_main_url or toc_html')
+            raise ValueError('There can only be one or toc_main_url or toc_html')

+        self.title = title
+        self.metadata = Metadata()
         if metadata is not None:
             self.metadata = metadata
-        elif novel_title is not None:
-            self.metadata = Metadata(novel_title)
-        else:
-            logger.error('You need to set "novel_title" or "metadata".')
-            sys.exit(1)
-
-        self.file_manager = FileManager(novel_title=self.metadata.novel_title,
-                                        novel_base_dir=novel_base_dir)

         if toc_html:
             self.file_manager.add_toc(toc_html)
@@ -155,9 +149,10 @@ class Novel:
             sys.exit(1)

         self.host = host if host else utils.obtain_host(self.toc_main_url)
-        self.decoder = Decoder(self.host)

-        self.save_novel()
+        self.config = None
+        self.file_manager = None
+        self.decoder = None

     def __str__(self):
         """
@@ -165,7 +160,7 @@ class Novel:
         """
         toc_info = self.toc_main_url if self.toc_main_url else "TOC added manually"
         attributes = [
-            f"Title: {self.metadata.novel_title}",
+            f"Title: {self.title}",
             f"Author: {self.metadata.author}",
             f"Language: {self.metadata.language}",
             f"Description: {self.metadata.description}",
@@ -177,30 +172,57 @@ class Novel:
         return (f"Novel Info: \n"
                 f"{attributes_str}")

+    @staticmethod
+    def load(title: str, cfg: ScraperConfig, novel_base_dir: str | None = None):
+        fm = FileManager(title, cfg.base_novels_dir, novel_base_dir, read_only=True)
+        raw = fm.load_novel_json()
+        if raw is None:
+            logger.debug(f'Novel "{title}" was not found.')
+            raise ValueError(f'Novel "{title}" was not found.')
+        novel = Novel.from_json(raw)
+        novel.config = cfg
+        novel.set_config(cfg=cfg, novel_base_dir=novel_base_dir)
+        return novel
+
     # NOVEL PARAMETERS MANAGEMENT

-    def set_scraper_behavior(self, **kwargs) -> None:
+    def set_config(self,
+                   cfg: ScraperConfig = None,
+                   config_file: str = None,
+                   base_novels_dir: str = None,
+                   novel_base_dir: str = None,
+                   decode_guide_file: str = None):
+        if cfg is not None:
+            self.config = cfg
+        else:
+            self.config = ScraperConfig(config_file=config_file,
+                                        base_novels_dir=base_novels_dir,
+                                        decode_guide_file=decode_guide_file)
+
+        self.file_manager = FileManager(title=self.title,
+                                        base_novels_dir=self.config.base_novels_dir,
+                                        novel_base_dir=novel_base_dir)
+
+        self.decoder = Decoder(self.host, self.config.decode_guide_file)
+
+    def set_scraper_behavior(self, save: bool = False, **kwargs) -> None:
         self.scraper_behavior.update_behavior(**kwargs)
-        self.save_novel()

     def set_metadata(self, **kwargs) -> None:
         self.metadata.update_behavior(**kwargs)
-        self.save_novel()

     def add_tag(self, tag: str) -> bool:
         if tag not in self.metadata.tags:
             self.metadata.tags.append(tag)
-            self.save_novel()
             return True
-        logger.warning(f'Tag "{tag}" already exists on novel {self.metadata.novel_title}')
+        logger.warning(f'Tag "{tag}" already exists on novel {self.title}')
         return False

     def remove_tag(self, tag: str) -> bool:
         if tag in self.metadata.tags:
             self.metadata.tags.remove(tag)
-            self.save_novel()
             return True
-        logger.warning(f'Tag "{tag}" doesn\'t exist on novel {self.metadata.novel_title}')
+        logger.warning(f'Tag "{tag}" doesn\'t exist on novel {self.title}')
         return False

     def set_cover_image(self, cover_image_path: str) -> bool:
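
The new static load() helper reads the saved novel JSON through a read-only FileManager and then re-attaches the runtime objects via set_config(), raising ValueError rather than exiting when nothing is found. A hedged sketch of reloading a saved novel (assuming Novel and ScraperConfig from the modules above are imported; the directory and title are placeholders):

    cfg = ScraperConfig(base_novels_dir='/data/novels')   # placeholder directory
    try:
        novel = Novel.load('My Novel', cfg)
    except ValueError:
        # load() now raises instead of calling sys.exit() when the novel is missing
        print('Novel "My Novel" has not been scraped yet.')
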
@@ -208,10 +230,9 @@ class Novel:

     def set_host(self, host: str) -> None:
         self.host = host
-        self.decoder = Decoder(self.host)
-        self.save_novel()
+        self.decoder.set_host(host)

-    def save_novel(self) -> None:
+    def save_novel(self, save: bool = True) -> None:
         self.file_manager.save_novel_json(self.to_dict())

     # TABLE OF CONTENTS MANAGEMENT
@@ -224,7 +245,6 @@ class Novel:
             self.decoder = Decoder(self.host)
         elif update_host:
             self.decoder = Decoder(utils.obtain_host(self.toc_main_url))
-        self.save_novel()

     def add_toc_html(self, html: str, host: str = None) -> None:
         if self.toc_main_url:
@@ -236,13 +256,11 @@ class Novel:
             self.decoder = Decoder(self.host)
         self.file_manager.add_toc(html)
         # Delete toc_main_url since they are exclusive
-        self.save_novel()

     def delete_toc(self):
         self.file_manager.delete_toc()
         self.chapters = []
         self.chapters_url_list = []
-        self.save_novel()

     def sync_toc(self, reload_files: bool = False) -> bool:
         # Hard reload will request again the toc files from the toc_main_url
@@ -277,9 +295,17 @@ class Novel:
             if chapters_url_from_toc_content is None:
                 logger.error('Chapters url not found on toc_content')
                 return False
-            self.chapters_url_list = [*self.chapters_url_list,
-                                      *chapters_url_from_toc_content]
-        if self.scraper_behavior.auto_add_host:
+            # First we save a list of lists in case we need to invert the order
+            self.chapters_url_list.append(chapters_url_from_toc_content)
+
+        invert = self.decoder.is_index_inverted()
+        self.chapters_url_list = [
+            chapter
+            for chapters_url in (self.chapters_url_list[::-1] if invert else self.chapters_url_list)
+            for chapter in chapters_url
+        ]
+        add_host_to_chapter = self.scraper_behavior.auto_add_host or self.decoder.add_host_to_chapter()
+        if add_host_to_chapter:
             self.chapters_url_list = [
                 f'https://{self.host}{chapter_url}' for chapter_url in self.chapters_url_list]
         self.chapters_url_list = utils.delete_duplicates(
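
sync_toc now stores one sub-list of chapter URLs per TOC fragment, reverses the fragment order when the decode guide reports an inverted index, and only then flattens. The comprehension in isolation, on sample data:

    # Standalone illustration of the flatten-and-optionally-invert step (sample values).
    pages = [['c1', 'c2'], ['c3', 'c4']]    # one sub-list per TOC page
    invert = True
    flat = [chapter
            for chapters_url in (pages[::-1] if invert else pages)
            for chapter in chapters_url]
    print(flat)  # ['c3', 'c4', 'c1', 'c2'] -- page order reversed, in-page order kept
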
@@ -329,6 +355,7 @@ class Novel:
         chapter = self.chapters[chapter_idx]
         if update_html:
             logger.debug('HTML will be updated...')
+
         chapter = self._get_chapter(chapter,
                                     reload=update_html)

@@ -429,7 +456,7 @@ class Novel:
         return True


-    # UTILS
+    ## UTILS


     def clean_files(self, clean_chapters: bool = True, clean_toc: bool = True, hard_clean: bool = False) -> None:
@@ -445,6 +472,9 @@ class Novel:
     def show_novel_dir(self) -> str:
         return self.file_manager.novel_base_dir

+
+    ## PRIVATE HELPERS
+
     def _clean_chapter(self, chapter_html_filename: str, hard_clean: bool = False) -> None:
         hard_clean = hard_clean or self.scraper_behavior.hard_clean
         chapter_html = self.file_manager.load_chapter_html(
@@ -464,6 +494,16 @@ class Novel:
             toc = self.decoder.clean_html(toc, hard_clean=hard_clean)
             self.file_manager.update_toc(toc, i)

+    def _request_html_content(self, url: str) -> Optional[str]:
+        request_config = self.decoder.request_config
+        force_flaresolver = request_config.get('force_flaresolver') or self.scraper_behavior.force_flaresolver
+        html_content = get_html_content(url,
+                                        retries=request_config.get('request_retries'),
+                                        timeout=request_config.get('request_timeout'),
+                                        time_between_retries=request_config.get('request_time_between_retries'),
+                                        force_flaresolver=force_flaresolver)
+        return html_content
+
     def _get_chapter(self,
                      chapter: Chapter,
                      reload: bool = False) -> Chapter | None:
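
_request_html_content centralises HTTP fetching: it reads per-host settings from the decoder's request_config mapping and ORs the host-specific force_flaresolver flag with the global scraper behaviour. Based on the keys read above, a request_config entry presumably looks something like the following (the shape and values are illustrative only, not taken from the package):

    # Hypothetical request_config mapping consumed by _request_html_content.
    request_config = {
        'request_retries': 3,
        'request_timeout': 20,
        'request_time_between_retries': 3,
        'force_flaresolver': False,
    }
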
@@ -481,8 +521,7 @@ class Novel:
             return chapter

         # Fetch fresh content
-        chapter.chapter_html = request_manager.get_html_content(chapter.chapter_url,
-                                                                force_flaresolver=self.scraper_behavior.force_flaresolver)
+        chapter.chapter_html = self._request_html_content(chapter.chapter_url)
         if not chapter.chapter_html:
             logger.error(f'No content found on link {chapter.chapter_url}')
             return chapter
@@ -501,7 +540,11 @@ class Novel:
         if content:
             return content

-        content = request_manager.get_html_content(url)
+        if utils.check_incomplete_url(url):
+            url = self.toc_main_url + url
+
+        # Fetch fresh content
+        content = self._request_html_content(url)
         if not content:
             logger.warning(f'No content found on link {url}')
             sys.exit(1)
@@ -579,13 +622,14 @@ class Novel:
         chapter_title = self.decoder.get_chapter_title(chapter.chapter_html)
         if not chapter_title:
             logger.debug('No chapter title found, generating one...')
-            chapter_title = f'{self.metadata.novel_title} Chapter {idx_for_chapter_name}'
+            chapter_title = f'{self.title} Chapter {idx_for_chapter_name}'
         chapter.chapter_title = str(chapter_title)
         logger.debug(f'Chapter title: "{chapter_title}"')

         logger.debug('Obtaining chapter content...')
+        save_title_to_content = self.scraper_behavior.save_title_to_content or self.decoder.save_title_to_content()
         chapter.chapter_content = self.decoder.get_chapter_content(chapter.chapter_html,
-                                                                   self.scraper_behavior.save_title_to_content,
+                                                                   save_title_to_content,
                                                                    chapter.chapter_title)
         logger.debug('Chapter successfully decoded')

@@ -594,7 +638,7 @@ class Novel:
     def _create_epub_book(self, book_title: str = None, calibre_collection: dict = None) -> epub.EpubBook:
         book = epub.EpubBook()
         if not book_title:
-            book_title = self.metadata.novel_title
+            book_title = self.title
         book.set_title(book_title)
         book.set_language(self.metadata.language)
         book.add_metadata('DC', 'description', self.metadata.description)
@@ -679,11 +723,11 @@ class Novel:
         idx_start = start_chapter - 1
         idx_end = end_chapter
         # We create the epub book
-        book_title = f'{self.metadata.novel_title} Chapters {start_chapter} - {end_chapter}'
+        book_title = f'{self.title} Chapters {start_chapter} - {end_chapter}'
         calibre_collection = None
         # If collection_idx is set, we create a calibre collection
         if collection_idx:
-            calibre_collection = {'title': self.metadata.novel_title,
+            calibre_collection = {'title': self.title,
                                   'idx': str(collection_idx)}
         book = self._create_epub_book(book_title, calibre_collection)

@@ -4,6 +4,7 @@ from . import logger_manager
 from dotenv import load_dotenv
 import json
 import time
+from typing import Optional

 load_dotenv()

@@ -13,45 +14,52 @@ FORCE_FLARESOLVER = os.getenv('FORCE_FLARESOLVER', '0') == '1'

 logger = logger_manager.create_logger('GET HTML CONTENT')

-def get_request(url: str,
-                timeout: int = 20,
-                retries: int = 3,
-                time_between_retries: int = 1) -> requests.Response | None:
-    logger.debug(f'Starting get_request for {url} with timeout={timeout}, retries={retries}, time_between_retries={time_between_retries}')
+
+def _get_request(url: str,
+                 timeout: int,
+                 retries: int,
+                 time_between_retries: int) -> Optional[requests.Response]:
+    logger.debug(
+        f'Starting get_request for "{url}" with timeout={timeout}, '
+        f'retries={retries}, '
+        f'time_between_retries={time_between_retries}')
     for attempt in range(retries):
-        logger.debug(f'Attempt {attempt + 1} for {url}')
+        logger.debug(f'Attempt {attempt + 1} for "{url}"')
         try:
             response = requests.get(url, timeout=timeout)
             response.raise_for_status()
-            logger.debug(f'Successful response for {url} on attempt {attempt + 1}')
+            logger.debug(f'Successful response for "{url}" on attempt {attempt + 1}')
             return response
         except requests.exceptions.ConnectionError as e:
-            logger.error(f'Connection error ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'Connection error ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.Timeout as e:
-            logger.error(f'Request timed out ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'Request timed out ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.HTTPError as e:
-            logger.error(f'HTTP error ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'HTTP error ({attempt + 1}/{retries}): {e}')
        except requests.exceptions.InvalidSchema as e:
-            logger.error(f'Invalid URL schema for "{url}": {e}')
+            logger.debug(f'Invalid URL schema for "{url}": {e}')
             break  # Don't retry on invalid schema
         except requests.exceptions.RequestException as e:
-            logger.error(f'Request failed ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'Request failed ({attempt + 1}/{retries}): {e}')

         if attempt < retries - 1:
             logger.debug(f'Waiting {time_between_retries} seconds before retrying')
             time.sleep(time_between_retries)  # Wait before retrying
-    logger.debug(f'Failed to get a successful response for {url} after {retries} attempts')
+    logger.debug(f'Failed to get a successful response for "{url}" after {retries} attempts using common HTTP Request')
     return None


-def get_request_flaresolver(url: str,
-                            timeout: int = 20,
-                            flaresolver_url: str = FLARESOLVER_URL,
-                            retries: int = 3,
-                            time_between_retries: int = 1) -> requests.Response | None:
-    logger.debug(f'Starting get_request_flaresolver for {url} with timeout={timeout}, retries={retries}, time_between_retries={time_between_retries}')
+def _get_request_flaresolver(url: str,
+                             timeout: int,
+                             retries: int,
+                             time_between_retries: int,
+                             flaresolver_url: str) -> Optional[requests.Response]:
+    logger.debug(
+        f'Starting get_request_flaresolver for "{url}" with timeout={timeout}, '
+        f'retries={retries}, '
+        f'time_between_retries={time_between_retries}')
     for attempt in range(retries):
-        logger.debug(f'Attempt {attempt + 1} for {url} using FlareSolver')
+        logger.debug(f'Attempt {attempt + 1} for "{url}" using FlareSolver')
         try:
             response = requests.post(
                 flaresolver_url,
@@ -64,71 +72,76 @@ def get_request_flaresolver(url: str,
                 timeout=timeout
             )
             response.raise_for_status()
-            logger.debug(f'Successful response for {url} on attempt {attempt + 1} using FlareSolver')
+            logger.debug(f'Successful response for "{url}" on attempt {attempt + 1} using FlareSolver')
             return response

         except requests.exceptions.ConnectionError as e:
-            logger.error(f'Connection error ({attempt + 1}/{retries}), check FlareSolver host: {flaresolver_url}: {e}')
+            logger.warning(f'Connection error with flaresolver (URL: "{flaresolver_url}"): {e}')
+            logger.warning(f'If the url is incorrect, set the env variable "FLARESOLVER_URL" to the correct value')
+            logger.warning('If FlareSolver is not installed in your machine, consider installing it.')
+            break  # Don't retry on Connection Error
         except requests.exceptions.Timeout as e:
-            logger.error(f'Request timed out ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'Request timed out ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.InvalidSchema as e:
-            logger.error(f'Invalid FlareSolver URL schema "{flaresolver_url}": {e}')
+            logger.debug(f'Invalid FlareSolver URL schema "{flaresolver_url}": {e}')
             break  # Don't retry on invalid schema
         except requests.exceptions.HTTPError as e:
-            logger.error(f'HTTP error ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'HTTP error ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.RequestException as e:
-            logger.error(f'Request failed ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'Request failed ({attempt + 1}/{retries}): {e}')
         except json.JSONDecodeError as e:
-            logger.error(f'Invalid JSON response ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'Invalid JSON response ({attempt + 1}/{retries}): {e}')

         if attempt < retries - 1:
             logger.debug(f'Waiting {time_between_retries} seconds before retrying')
             time.sleep(time_between_retries)  # Wait before retrying
-    logger.debug(f'Failed to get a successful response for {url} using FlareSolver after {retries} attempts')
+
+    logger.debug(f'Failed to get a successful response for "{url}" using FlareSolver after {retries} attempts')
     return None


 def get_html_content(url: str,
-                     retries: int = 5,
-                     flaresolver: bool = True,
+                     retries: int = 3,
+                     timeout: int = 20,
+                     time_between_retries: int = 3,
                      flaresolver_url: str = FLARESOLVER_URL,
-                     time_between_retries: int = 1,
-                     force_flaresolver: bool = FORCE_FLARESOLVER) -> str | None:
-    logger.debug(f'Starting get_html_content for {url} with retries={retries}, flaresolver={flaresolver}, flaresolver_url={flaresolver_url}, time_between_retries={time_between_retries}, force_flaresolver={force_flaresolver}')
+                     force_flaresolver: bool = FORCE_FLARESOLVER) -> Optional[str]:
+    logger.debug(
+        f'Requesting HTML Content for "{url}" with '
+        f'retries: "{retries}", '
+        f'timeout: "{timeout}", '
+        f'time between retries: "{time_between_retries}"')
+    if force_flaresolver:
+        logger.debug('Will directly try with FlareSolver')
+
     # First try with common HTTP request
     if not force_flaresolver:
-        response = get_request(
-            url, timeout=20, retries=retries, time_between_retries=time_between_retries)
-        if not response:
-            logger.warning(f'Failed to get response from {url} using common HTTP request')
-        elif not response.ok:
-            logger.warning(f'Response with errors from {url} using common HTTP request')
-        else:
-            logger.debug(f'Successfully retrieved HTML content from {url} using common HTTP request')
+        response = _get_request(url,
+                                timeout=timeout,
+                                retries=retries,
+                                time_between_retries=time_between_retries)
+        if response and response.ok:
+            logger.debug(f'Successfully retrieved HTML content from "{url}" using common HTTP request')
             return response.text

-    # If flaresolver is disabled, return None
-    if not flaresolver:
-        logger.debug(f'Flaresolver is disabled, returning None for {url}')
-        return None
-
     # Try with Flaresolver
-    logger.debug(f'Trying with Flaresolver for {url}')
-    response = get_request_flaresolver(
-        url, timeout=20, flaresolver_url=flaresolver_url, time_between_retries=time_between_retries)
-    if not response:
-        logger.critical(f'Failed to get response from {url} using FlareSolver')
-        return None
-    if not response.ok:
-        logger.critical(f'Response with errors from {url} using FlareSolver')
+    logger.debug(f'Trying with Flaresolver for "{url}"')
+    response = _get_request_flaresolver(url,
+                                        timeout=timeout,
+                                        retries=retries,
+                                        time_between_retries=time_between_retries,
+                                        flaresolver_url=flaresolver_url)
+    if not response or not response.ok:
+        logger.warning(f'Failed all attempts to get HTML content from "{url}')
         return None

     response_json = response.json()
     if 'solution' not in response_json:
-        logger.critical(f'No solution found in FlareSolver response for {url}')
+        logger.warning(f'No solution found in FlareSolver response for "{url}"')
         return None
     if 'response' not in response_json['solution']:
-        logger.critical(f'No response found in FlareSolver solution for {url}')
+        logger.warning(f'No response found in FlareSolver solution for "{url}"')
         return None
-    logger.debug(f'Successfully retrieved HTML content from {url} using FlareSolver')
+
+    logger.debug(f'Successfully retrieved HTML content from "{url}" using FlareSolver')
     return response_json['solution']['response']
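
The public request API in 2.0.0 drops the flaresolver on/off switch: a plain HTTP request is always tried first (unless force_flaresolver is set) and FlareSolver is the automatic fallback, with failures logged as warnings rather than criticals. A hedged call with the new keyword arguments (the import path is assumed from the package layout above; the URL is a placeholder):

    from web_novel_scraper.request_manager import get_html_content  # path assumed

    html = get_html_content('https://example.com/chapter-1',
                            retries=3,
                            timeout=20,
                            time_between_retries=3)
    if html is None:
        print('Both the plain request and the FlareSolver fallback failed.')
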
@@ -1,10 +1,140 @@
-from .file_manager import FileManager
-from . import request_manager
+import json
+import shutil
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
 import hashlib
 from urllib.parse import urlparse
 import re
 import unicodedata

+class FileManagerError(Exception):
+    """Exception raised for any exception for file operations"""
+
+class FileOps:
+    """Static helper for disc operations."""
+
+    ## HELPERS
+
+    @staticmethod
+    def _atomic_tmp(path: Path) -> Path:
+        """Temporary file path in the same directory as *path*."""
+        return path.with_suffix(path.suffix + ".tmp")
+
+    ## DIRECTORY MANAGEMENT
+    @staticmethod
+    def ensure_dir(path: Path) -> Path:
+        """Create *path* (and parents) if missing."""
+        try:
+            path.mkdir(parents=True, exist_ok=True)
+            return path
+        except Exception as e:
+            raise FileManagerError(str(e)) from e
+
+    ## READ OPERATIONS
+
+    @staticmethod
+    def read_text(path: Path) -> Optional[str]:
+        """Return UTF-8 contents or None if *path* does not exist."""
+        if not path.exists():
+            return None
+        try:
+            return path.read_text(encoding="utf-8")
+        except Exception as e:
+            raise FileManagerError(str(e)) from e
+
+    @staticmethod
+    def read_json(path: Path | str) -> Optional[dict]:
+        """Return JSON object or None if *path* does not exist."""
+        path = Path(path)
+        raw = FileOps.read_text(path)
+        if raw is None:
+            return None
+        try:
+            return json.loads(raw)
+        except Exception as e:
+            raise FileManagerError(str(e)) from e
+
+    @staticmethod
+    def read_binary(path: Path) -> Optional[bytes]:
+        """Return binary contents or None if *path* does not exist."""
+        if not path.exists():
+            return None
+        try:
+            return path.read_bytes()
+        except Exception as e:
+            raise FileManagerError(str(e)) from e
+
+    ## WRITE OPERATION
+
+    @staticmethod
+    def save_text(path: Path, text: str) -> None:
+        """Atomically write UTF-8 text to *path*."""
+        tmp = FileOps._atomic_tmp(path)
+        try:
+            tmp.write_text(text, encoding="utf-8")
+            tmp.replace(path)
+        except Exception as e:
+            FileOps.delete(tmp)
+            raise FileManagerError(str(e)) from e
+
+    @staticmethod
+    def save_json(path: Path, obj: dict) -> None:
+        """Atomically write pretty-printed JSON to *path*."""
+        tmp = FileOps._atomic_tmp(path)
+        try:
+            tmp.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8")
+            tmp.replace(path)
+        except Exception as e:
+            FileOps.delete(tmp)
+            raise FileManagerError(str(e)) from e
+
+    @staticmethod
+    def save_binary(path: Path, data: bytes) -> None:
+        """Atomically write binary data to *path* (e.g., cover images)."""
+        tmp = FileOps._atomic_tmp(path)
+        try:
+            tmp.write_bytes(data)
+            tmp.replace(path)
+        except Exception as e:
+            FileOps.delete(tmp)
+            raise FileManagerError(str(e)) from e
+
+    ## DELETE/COPY OPERATIONS
+
+    @staticmethod
+    def delete(path: Path) -> None:
+        """Delete *path* if it exists."""
+        try:
+            if path.exists():
+                path.unlink()
+        except Exception as e:
+            raise FileManagerError(str(e)) from e
+
+    @staticmethod
+    def copy(src: Path, dst: Path) -> None:
+        """Copy *src* to *dst*."""
+        try:
+            shutil.copy(src, dst)
+        except Exception as e:
+            raise FileManagerError(str(e)) from e
+
+def _normalize_dirname(name: str) -> str:
+    """
+    Keep whitespace as-is while replacing any other unsupported characters
+    with an underscore.
+    Allowed: letters, digits, underscore, hyphen, and spaces.
+    """
+    # Collapse multiple spaces into a single space (optional; comment out if not desired)
+    name = re.sub(r'\s+', ' ', name.strip())
+
+    # Replace any char that is *not* letter, digit, underscore, hyphen, or space.
+    return re.sub(r'[^\w\-\s]', '_', name)
+
+def now_iso() -> str:
+    """Current timestamp in ISO-8601 (seconds precision)."""
+    return datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds")

 def generate_file_name_from_url(url: str) -> str:
     # Parse the URL
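
FileOps routes every write through a sibling *.tmp file and Path.replace(), so an interrupted write cannot leave a truncated target behind, and any OS error resurfaces as FileManagerError. A small round-trip sketch (assuming FileOps from the module above is in scope; the path is a placeholder):

    from pathlib import Path

    target = Path('/tmp/demo-novel/main.json')
    FileOps.ensure_dir(target.parent)
    FileOps.save_json(target, {'title': 'Demo', 'chapters': []})
    data = FileOps.read_json(target)   # returns None only if the file is missing
    print(data['title'])
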
@@ -64,3 +194,10 @@ def check_exclusive_params(param1: any, param2: any) -> bool:

 def create_volume_id(n: int):
     return f'v{n:02}'
+
+def check_incomplete_url(url: str) -> bool:
+    if url.startswith('?') or url.startswith('#'):
+        return True
+
+    parsed = urlparse(url)
+    return not parsed.scheme or not parsed.netloc
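
check_incomplete_url flags query-only, fragment-only, and scheme- or host-less URLs as incomplete, which is what lets the Novel code above prepend toc_main_url to relative chapter links. Expected results, assuming the function above is in scope:

    print(check_incomplete_url('?page=2'))                    # True
    print(check_incomplete_url('/novel/chapter-3'))           # True  (no scheme or netloc)
    print(check_incomplete_url('https://example.com/ch-3'))   # False
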
@@ -1 +1 @@
-__version__ = "1.1.0"
+__version__ = "2.0.0"
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: web-novel-scraper
-Version: 1.1.0
+Version: 2.0.0
 Summary: Python tool that allows you to scrape web novels from various sources and save them to more readable formats like EPUB.
 Project-URL: Homepage, https://github.com/ImagineBrkr/web-novel-scraper
 Project-URL: Documentation, https://web-novel-scraper.readthedocs.io