web-novel-scraper 2.0.3__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,163 +1,86 @@
1
- from dataclasses import dataclass, fields, field
2
- import sys
1
+ from dataclasses import dataclass, field, replace
3
2
 
4
- from dataclasses_json import dataclass_json, config, Undefined
3
+ from dataclasses_json import dataclass_json, Undefined, config
5
4
  from ebooklib import epub
6
5
  from typing import Optional
6
+ from pathlib import Path
7
7
 
8
8
  from . import logger_manager
9
9
  from .decode import Decoder
10
10
  from .file_manager import FileManager
11
11
  from . import utils
12
-
13
12
  from .request_manager import get_html_content
14
13
  from .config_manager import ScraperConfig
14
+ from .models import ScraperBehavior, Metadata, Chapter
15
+ from .utils import _always, ScraperError, FileManagerError, NetworkError, ValidationError, DecodeError
15
16
 
16
17
  logger = logger_manager.create_logger('NOVEL SCRAPPING')
17
18
 
18
19
 
19
- @dataclass_json
20
- @dataclass
21
- class Metadata:
22
- author: Optional[str] = None
23
- start_date: Optional[str] = None
24
- end_date: Optional[str] = None
25
- language: Optional[str] = "en"
26
- description: Optional[str] = None
27
- tags: list[str] = field(default_factory=list)
28
-
29
- def update_behavior(self, **kwargs):
30
- """
31
- Updates the behavior configuration dynamically.
32
- Only updates the attributes provided in kwargs.
33
- """
34
- for key, value in kwargs.items():
35
- if hasattr(self, key) and value is not None:
36
- setattr(self, key, value)
37
-
38
- def __str__(self):
39
- """
40
- Dynamic string representation of the configuration.
41
- """
42
- attributes = [(f"{field.name}="
43
- f"{getattr(self, field.name)}") for field in fields(self)]
44
- attributes_str = '\n'.join(attributes)
45
- return (f"Metadata: \n"
46
- f"{attributes_str}")
47
-
48
-
49
- @dataclass_json
50
- @dataclass
51
- class ScraperBehavior:
52
- # Some novels already have the title in the content.
53
- save_title_to_content: bool = False
54
- # Some novels have the toc link without the host
55
- auto_add_host: bool = False
56
- # Some hosts return 403 when scrapping, this will force the use of FlareSolver
57
- # to save time
58
- force_flaresolver: bool = False
59
- # When you clean the html files, you can use hard clean by default
60
- hard_clean: bool = False
61
-
62
- def update_behavior(self, **kwargs):
63
- """
64
- Updates the behavior configuration dynamically.
65
- Only updates the attributes provided in kwargs.
66
- """
67
- for key, value in kwargs.items():
68
- if hasattr(self, key) and value is not None:
69
- setattr(self, key, value)
70
-
71
- def __str__(self):
72
- """
73
- Dynamic string representation of the configuration.
74
- """
75
- attributes = [(f"{field.name}="
76
- f"{getattr(self, field.name)}") for field in fields(self)]
77
- attributes_str = '\n'.join(attributes)
78
- return (f"Scraper Behavior: \n"
79
- f"{attributes_str}")
80
-
81
-
82
- @dataclass_json(undefined=Undefined.EXCLUDE)
83
- @dataclass
84
- class Chapter:
85
- chapter_url: str
86
- chapter_html_filename: Optional[str] = None
87
- chapter_title: Optional[str] = None
88
-
89
- def __init__(self,
90
- chapter_url: str,
91
- chapter_html: str = None,
92
- chapter_content: str = None,
93
- chapter_html_filename: str = None,
94
- chapter_title: str = None):
95
- self.chapter_url = chapter_url
96
- self.chapter_html = chapter_html
97
- self.chapter_content = chapter_content
98
- self.chapter_html_filename = chapter_html_filename
99
- self.chapter_title = chapter_title
100
-
101
- def __str__(self):
102
- return f'Title: "{self.chapter_title}"\nURL: "{self.chapter_url}"\nFilename: "{self.chapter_html_filename}"'
103
-
104
- def __lt__(self, another):
105
- return self.chapter_title < another.chapter_title
106
-
107
-
108
20
  @dataclass_json(undefined=Undefined.EXCLUDE)
109
21
  @dataclass
110
22
  class Novel:
111
- metadata: Metadata = None
112
- title: str = None
113
- scraper_behavior: ScraperBehavior = None
114
- chapters: list[Chapter] = field(default_factory=list)
23
+ """
24
+ A class representing a web novel with its metadata and content.
25
+
26
+ This class handles all operations related to scraping, storing, and managing web novels,
27
+ including their chapters, table of contents, and metadata.
28
+
29
+ Attributes:
30
+ title (str): The title of the novel.
31
+ host (Optional[str]): The host domain where the novel is located.
32
+ toc_main_url (Optional[str]): The main URL for the table of contents.
33
+ chapters (list[Chapter]): List of chapters in the novel.
34
+ chapters_url_list (list[str]): List of URLs for all chapters.
35
+ metadata (Metadata): Novel metadata like author, language, etc.
36
+ scraper_behavior (ScraperBehavior): Configuration for scraping behavior.
37
+ file_manager (FileManager): Handles file operations for the novel.
38
+ decoder (Decoder): Handles HTML decoding and parsing.
39
+ config (ScraperConfig): General scraper configuration.
40
+ """
41
+
42
+ title: str
43
+ host: Optional[str] = None
115
44
  toc_main_url: Optional[str] = None
45
+ chapters: list[Chapter] = field(default_factory=list)
116
46
  chapters_url_list: list[str] = field(default_factory=list)
117
- host: str = None
118
-
119
- def __init__(self,
120
- title: str,
121
- toc_main_url: str = None,
122
- toc_html: str = None,
123
- chapters_url_list: list[str] = None,
124
- metadata: Metadata = None,
125
- chapters: list[Chapter] = None,
126
- scraper_behavior: ScraperBehavior = None,
127
- host: str = None
128
- ):
129
- if toc_main_url and toc_html:
130
- logger.critical('There can only be one or toc_main_url or toc_html')
131
- raise ValueError('There can only be one or toc_main_url or toc_html')
132
-
133
- self.title = title
134
- self.metadata = Metadata()
135
- if metadata is not None:
136
- self.metadata = metadata
137
-
138
- if toc_html:
139
- self.file_manager.add_toc(toc_html)
140
-
141
- self.toc_main_url = toc_main_url
142
- self.chapters_url_list = chapters_url_list if chapters_url_list else []
143
-
144
- self.chapters = chapters if chapters else []
145
-
146
- self.scraper_behavior = scraper_behavior if scraper_behavior else ScraperBehavior()
147
- if not host and not toc_main_url:
148
- logger.error('You need to set "host" or "toc_main_url".')
149
- sys.exit(1)
47
+ metadata: Metadata = field(default_factory=Metadata)
48
+ scraper_behavior: ScraperBehavior = field(default_factory=ScraperBehavior)
49
+
50
+ file_manager: FileManager = field(default=None,
51
+ repr=False,
52
+ compare=False,
53
+ metadata=config(exclude=_always))
54
+ decoder: Decoder = field(default=None,
55
+ repr=False,
56
+ compare=False,
57
+ metadata=config(exclude=_always))
58
+ config: ScraperConfig = field(default=None,
59
+ repr=False,
60
+ compare=False,
61
+ metadata=config(exclude=_always))
62
+
63
+ def __post_init__(self):
64
+ """
65
+ Validates the novel instance after initialization.
150
66
 
151
- self.host = host if host else utils.obtain_host(self.toc_main_url)
67
+ Raises:
68
+ ValidationError: If the title is empty or neither host nor toc_main_url is provided.
69
+ """
152
70
 
153
- self.config = None
154
- self.file_manager = None
155
- self.decoder = None
71
+ if not self.title:
72
+ raise ValidationError("title can't be empty")
73
+ if not (self.host or self.toc_main_url):
74
+ raise ValidationError('You must provide "host" or "toc_main_url"')
156
75
 
157
76
  def __str__(self):
158
77
  """
159
- Dynamic string representation of the novel.
78
+ Returns a string representation of the novel with its main attributes.
79
+
80
+ Returns:
81
+ str: A formatted string containing the novel's main information.
160
82
  """
83
+
161
84
  toc_info = self.toc_main_url if self.toc_main_url else "TOC added manually"
162
85
  attributes = [
163
86
  f"Title: {self.title}",
@@ -172,99 +95,317 @@ class Novel:
172
95
  return (f"Novel Info: \n"
173
96
  f"{attributes_str}")
174
97
 
175
- @staticmethod
176
- def load(title: str, cfg: ScraperConfig, novel_base_dir: str | None = None):
98
+ @classmethod
99
+ def load(cls, title: str, cfg: ScraperConfig, novel_base_dir: Path = None) -> 'Novel':
100
+ """
101
+ Loads a novel from stored JSON data.
102
+
103
+ Args:
104
+ title (str): Title of the novel to load.
105
+ cfg (ScraperConfig): Scraper configuration.
106
+ novel_base_dir (Path, optional): Base directory for the novel data.
107
+
108
+ Returns:
109
+ Novel: A new Novel instance loaded from stored data.
110
+
111
+ Raises:
112
+ ValidationError: If the novel with the given title is not found.
113
+ """
114
+
177
115
  fm = FileManager(title, cfg.base_novels_dir, novel_base_dir, read_only=True)
178
116
  raw = fm.load_novel_json()
179
117
  if raw is None:
180
118
  logger.debug(f'Novel "{title}" was not found.')
181
- raise ValueError(f'Novel "{title}" was not found.')
182
- novel = Novel.from_json(raw)
183
- novel.config = cfg
119
+ raise ValidationError(f'Novel "{title}" was not found.')
120
+ novel = cls.from_json(raw)
184
121
  novel.set_config(cfg=cfg, novel_base_dir=novel_base_dir)
185
122
  return novel
186
123
 
124
+ @classmethod
125
+ def new(cls, title: str, cfg: ScraperConfig, host: str = None, toc_html: str = None,
126
+ toc_main_url: str = None) -> 'Novel':
127
+ """Creates a new Novel instance.
128
+
129
+ Args:
130
+ title: Title of the novel (required)
131
+ cfg: Scraper configuration (required)
132
+ host: Host URL for the novel content (optional)
133
+ toc_html: HTML content for the table of contents (optional)
134
+ toc_main_url: URL for the table of contents (optional)
135
+
136
+ Note:
137
+ - Either toc_html or toc_main_url must be provided
138
+ - If toc_main_url is provided, host will be extracted from it if not explicitly provided
139
+ - If toc_html is provided, host must be explicitly provided
140
+
141
+ Returns:
142
+ Novel: A new Novel instance
143
+
144
+ Raises:
145
+ ValidationError: If the title is empty, or if neither toc_html nor toc_main_url is provided
146
+ """
147
+ if not title:
148
+ raise ValidationError("Title cannot be empty")
149
+
150
+ if not (toc_html or toc_main_url):
151
+ raise ValidationError("Either toc_html or toc_main_url must be provided")
152
+
153
+ if toc_html and not host:
154
+ raise ValidationError("When providing toc_html, host must be explicitly provided")
155
+
156
+ novel = cls(title=title, host=host, toc_main_url=toc_main_url)
157
+ breakpoint()
158
+ # If toc_main_url is provided and the host isn't, extract host from URL
159
+ if toc_main_url and not host:
160
+ host = utils.obtain_host(toc_main_url)
161
+ novel.host = host
162
+
163
+ # If toc_html is provided, add it to the novel
164
+ if toc_html:
165
+ novel.add_toc_html(toc_html, host)
166
+
167
+ return novel
168
+
187
169
  # NOVEL PARAMETERS MANAGEMENT
188
170
 
189
171
  def set_config(self,
190
- cfg: ScraperConfig = None,
191
- config_file: str = None,
192
- base_novels_dir: str = None,
193
- novel_base_dir: str = None,
194
- decode_guide_file: str = None):
195
- if cfg is not None:
196
- self.config = cfg
197
- else:
198
- self.config = ScraperConfig(config_file=config_file,
199
- base_novels_dir=base_novels_dir,
200
- decode_guide_file=decode_guide_file)
172
+ cfg: ScraperConfig,
173
+ novel_base_dir: str | None = None) -> None:
174
+ """
175
+ Configures the novel with the provided scraper configuration and base directory.
176
+
177
+ Sets up the file manager and decoder for the novel based on the provided configuration.
178
+
179
+ Args:
180
+ cfg (ScraperConfig): The scraper configuration to use.
181
+ novel_base_dir (str | None, optional): Base directory for the novel files.
182
+ If None, it uses the default directory from configuration.
201
183
 
202
- self.file_manager = FileManager(title=self.title,
203
- base_novels_dir=self.config.base_novels_dir,
204
- novel_base_dir=novel_base_dir)
184
+ Raises:
185
+ FileManagerError: If there's an error when reading the config or decoding guide files.
186
+ """
187
+
188
+ try:
189
+ self.config = cfg
190
+ self.file_manager = FileManager(title=self.title,
191
+ base_novels_dir=self.config.base_novels_dir,
192
+ novel_base_dir=novel_base_dir)
193
+ self.decoder = Decoder(self.host, self.config.decode_guide_file)
194
+ except FileManagerError as e:
195
+ logger.error("Could not set configuration. File Manager Error", exc_info=e)
196
+ raise
197
+
198
+ def set_scraper_behavior(self, **kwargs) -> None:
199
+ """
200
+ Updates the scraper behavior configuration with the provided parameters.
205
201
 
206
- self.decoder = Decoder(self.host, self.config.decode_guide_file)
202
+ Args:
203
+ **kwargs: Keyword arguments for updating scraper behavior settings.
204
+ Can include any valid ScraperBehavior attributes.
205
+ """
207
206
 
208
- def set_scraper_behavior(self, save: bool = False, **kwargs) -> None:
209
- self.scraper_behavior.update_behavior(**kwargs)
207
+ filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None}
208
+ self.scraper_behavior = replace(self.scraper_behavior, **filtered_kwargs)
209
+ logger.info(f'Scraper behavior updated')
210
210
 
211
211
  def set_metadata(self, **kwargs) -> None:
212
- self.metadata.update_behavior(**kwargs)
212
+ """
213
+ Updates the novel's metadata with the provided parameters.
214
+
215
+ Args:
216
+ **kwargs: Keyword arguments for updating metadata.
217
+ Can include any valid Metadata attributes like author, language, etc.
218
+ """
219
+ filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None}
220
+ self.metadata = replace(self.metadata, **filtered_kwargs)
221
+ logger.info(f'Metadata updated')
222
+
223
+ def add_tag(self, tag: str) -> None:
224
+ """
225
+ Adds a new tag to the novel's metadata if it doesn't already exist.
226
+
227
+ Args:
228
+ tag (str): The tag to add to the novel's metadata.
229
+ """
213
230
 
214
- def add_tag(self, tag: str) -> bool:
215
231
  if tag not in self.metadata.tags:
216
- self.metadata.tags.append(tag)
217
- return True
218
- logger.warning(f'Tag "{tag}" already exists on novel {self.title}')
219
- return False
232
+ self.metadata = replace(
233
+ self.metadata, tags=(*self.metadata.tags, tag)
234
+ )
235
+ logger.info('Tag %s added to metadata', tag)
236
+ else:
237
+ logger.debug("Tag %s already present in %s", tag, self.title)
238
+
239
+ def remove_tag(self, tag: str) -> None:
240
+ """
241
+ Removes a tag from the novel's metadata if it exists.
242
+
243
+ Args:
244
+ tag (str): The tag to remove from the novel's metadata.
245
+ """
220
246
 
221
- def remove_tag(self, tag: str) -> bool:
222
247
  if tag in self.metadata.tags:
223
- self.metadata.tags.remove(tag)
224
- return True
225
- logger.warning(f'Tag "{tag}" doesn\'t exist on novel {self.title}')
226
- return False
248
+ self.metadata = replace(self.metadata,
249
+ tags=tuple(t for t in self.metadata.tags if t != tag))
250
+ logger.info('Tag %s removed from metadata', tag)
251
+ else:
252
+ logger.debug("Tag %s not present in %s", tag, self.title)
227
253
 
228
254
  def set_cover_image(self, cover_image_path: str) -> None:
229
- self.file_manager.save_novel_cover(cover_image_path)
255
+ """
256
+ Sets or updates the novel's cover image.
257
+
258
+ Args:
259
+ cover_image_path (str): Path to the cover image file.
260
+
261
+ Raises:
262
+ FileManagerError: If there's an error when saving the cover image.
263
+ """
264
+
265
+ try:
266
+ self.file_manager.save_novel_cover(cover_image_path)
267
+ logger.info('Cover image updated')
268
+ except FileManagerError as e:
269
+ logger.error("Could not update cover. File Manager Error", exc_info=e)
270
+ raise
230
271
 
231
272
  def set_host(self, host: str) -> None:
273
+ """
274
+ Sets or updates the novel's host URL and modifies the decoder.
275
+
276
+ Args:
277
+ host (str): The host URL for the novel.
278
+
279
+ Raises:
280
+ DecodeError: If there's an error when setting up the decoder with the new host.
281
+ """
282
+
232
283
  self.host = host
233
- self.decoder.set_host(host)
284
+ try:
285
+ self.decoder.set_host(host)
286
+ logger.info(f'Host updated to "{self.host}"')
287
+ except ValidationError as e:
288
+ logger.error("Could not set host. Decode Error", exc_info=e)
289
+ raise
290
+
291
+ def save_novel(self) -> None:
292
+ """
293
+ Saves the current state of the novel to disk.
294
+
295
+ Persists all novel data including metadata, chapters, and configuration
296
+ to the novel's JSON file.
297
+
298
+ Raises:
299
+ FileManagerError: If there's an error when saving the novel data.
300
+ """
234
301
 
235
- def save_novel(self, save: bool = True) -> None:
236
- self.file_manager.save_novel_json(self.to_dict())
302
+ try:
303
+ self.file_manager.save_novel_json(self.to_dict())
304
+ logger.info(f'Novel data saved to disk on file "{self.file_manager.novel_json_file}".')
305
+ except FileManagerError as e:
306
+ logger.error("Could not save novel. File Manager Error", exc_info=e)
307
+ raise
237
308
 
238
309
  # TABLE OF CONTENTS MANAGEMENT
239
310
 
240
- def set_toc_main_url(self, toc_main_url: str, host: str = None, update_host: bool = False) -> None:
311
+ def set_toc_main_url(self, toc_main_url: str, update_host: bool = True) -> None:
312
+ """
313
+ Sets the main URL for the table of contents and optionally updates the host.
314
+
315
+ Deletes any existing TOC files as they will be refreshed from the new URL.
316
+ If update_host is True, extracts and updates the host from the new URL.
317
+
318
+ Args:
319
+ toc_main_url: Main URL for the table of contents
320
+ update_host: Whether to update the host based on the URL (default: True)
321
+
322
+ Raises:
323
+ ValidationError: If host extraction fails
324
+ FileManagerError: If TOC deletion fails
325
+ """
326
+
241
327
  self.toc_main_url = toc_main_url
242
- self.file_manager.delete_toc()
243
- if host:
244
- self.host = host
245
- self.decoder = Decoder(self.host)
246
- elif update_host:
247
- self.decoder = Decoder(utils.obtain_host(self.toc_main_url))
328
+ logger.info(f'Main URL updated to "{self.toc_main_url}", TOCs already requested will be deleted.')
329
+ try:
330
+ self.file_manager.delete_toc()
331
+ except FileManagerError as e:
332
+ logger.error("Could not delete TOCs. File Manager Error", exc_info=e)
333
+ raise
334
+
335
+ if update_host:
336
+ new_host = utils.obtain_host(self.toc_main_url)
337
+ logger.debug(f'Update Host flag present, new host is "{new_host}".')
338
+ self.set_host(new_host)
248
339
 
249
340
  def add_toc_html(self, html: str, host: str = None) -> None:
341
+ """
342
+ Adds HTML content as a table of contents fragment.
343
+
344
+ This method is mutually exclusive with using toc_main_url - if a main URL exists,
345
+ it will be cleared. Host must be provided either directly or from a previous configuration.
346
+
347
+ Args:
348
+ html: HTML content to add as TOC fragment
349
+ host: Optional host to set for this content
350
+
351
+ Raises:
352
+ ValidationError: If no host is provided when required
353
+ FileManagerError: If saving TOC content fails
354
+ """
355
+
250
356
  if self.toc_main_url:
357
+ logger.debug(f'TOC main URL is exclusive with manual TOC files, TOC main URL will be deleted.')
251
358
  self.delete_toc()
252
359
  self.toc_main_url = None
253
360
 
254
361
  if host:
255
- self.host = host
256
- self.decoder = Decoder(self.host)
362
+ self.set_host(host)
363
+ else:
364
+ if self.host is None:
365
+ logger.error(f'When using TOC files instead of URLs, host must be provided.')
366
+ raise ValidationError('Host must be provided when using TOC files instead of URLs.')
257
367
  self.file_manager.add_toc(html)
258
- # Delete toc_main_url since they are exclusive
368
+ logger.info('New TOC file added to disk.')
259
369
 
260
370
  def delete_toc(self):
371
+ """
372
+ Deletes all table of contents files and resets chapter data.
373
+
374
+ Clears:
375
+ - All TOC files from disk
376
+ - Chapter list
377
+ - Chapter URL list
378
+
379
+ Raises:
380
+ FileManagerError: If deletion of TOC files fails
381
+ """
382
+
261
383
  self.file_manager.delete_toc()
262
384
  self.chapters = []
263
385
  self.chapters_url_list = []
386
+ logger.info('TOC files deleted from disk.')
387
+
388
+ def sync_toc(self, reload_files: bool = True) -> None:
389
+ """
390
+ Synchronizes the table of contents with stored/remote content.
391
+
392
+ Process:
393
+ 1. Checks if TOC content exists (stored or retrievable)
394
+ 2. Optionally reloads TOC files from remote if needed
395
+ 3. Extracts chapter URLs from TOC content
396
+ 4. Creates/updates chapters based on URLs
397
+
398
+ Args:
399
+ reload_files: Whether to force reload of TOC files from remote (default: True)
400
+
401
+ Raises:
402
+ ScraperError: If no TOC content is available
403
+ FileManagerError: If file operations fail
404
+ DecodeError: If TOC parsing fails
405
+ NetworkError: If remote content retrieval fails
406
+ ValidationError: If chapter creation fails
407
+ """
264
408
 
265
- def sync_toc(self, reload_files: bool = False) -> bool:
266
- # Hard reload will request again the toc files from the toc_main_url
267
- # Only works with toc_main_url
268
409
  all_tocs_content = self.file_manager.get_all_toc()
269
410
 
270
411
  # If there is no toc_main_url and no manually added toc, there is no way to sync toc
@@ -272,59 +413,116 @@ class Novel:
272
413
  if toc_not_exists:
273
414
  logger.critical(
274
415
  'There is no toc html and no toc url set, unable to get toc.')
275
- return False
276
-
277
- reload_files = reload_files and self.toc_main_url is not None
278
- if reload_files or not all_tocs_content:
279
- self.chapters = []
280
- self.file_manager.delete_toc()
281
- all_tocs_content = []
282
- toc_content = self._add_toc(self.toc_main_url)
283
- all_tocs_content.append(toc_content)
284
- if self.decoder.has_pagination():
285
- next_page = self.decoder.get_toc_next_page_url(toc_content)
286
- while next_page:
287
- toc_content = self._add_toc(next_page)
288
- next_page = self.decoder.get_toc_next_page_url(toc_content)
289
- all_tocs_content.append(toc_content)
416
+ raise ScraperError('There is no toc html and no toc url set, unable to get toc.')
417
+
418
+ # Will reload files if:
419
+ # Reload_files is True (requested by user) AND there is a toc_main_url present.
420
+ # OR
421
+ # There is a toc_main_url present, but no toc files are saved in the disk.
422
+ reload_files = ((reload_files or
423
+ all_tocs_content is None) or
424
+ self.toc_main_url is not None)
425
+ if reload_files:
426
+ logger.debug('Reloading TOC files.')
427
+ try:
428
+ self._request_toc_files()
429
+ except FileManagerError as e:
430
+ logger.error("Could not request TOC files. File Manager Error", exc_info=e)
431
+ raise
432
+ except DecodeError as e:
433
+ logger.error("Could not request TOC files. Decoder Error", exc_info=e)
434
+ raise
435
+ except NetworkError as e:
436
+ logger.error("Could not request TOC files. Network Error", exc_info=e)
437
+ raise
438
+
439
+ try:
440
+ self._load_or_request_chapter_urls_from_toc()
441
+ except DecodeError as e:
442
+ logger.error("Could not get chapter urls from TOC files. Decoder Error", exc_info=e)
443
+ raise
444
+ except FileManagerError as e:
445
+ logger.error("Could not get chapter urls from TOC files. File Manager Error", exc_info=e)
446
+ raise
447
+
448
+ try:
449
+ self._create_chapters_from_toc()
450
+ except ValidationError as e:
451
+ logger.error("Could not create chapters from TOC files. Validation Error", exc_info=e)
452
+ raise
453
+ logger.info('TOC synced with files, Chapters created from Table of Contents.')
454
+
455
+ def show_toc(self) -> Optional[str]:
456
+ """
457
+ Generates a human-readable representation of the Table Of Contents.
290
458
 
291
- # Now we get the links from the toc content
292
- self.chapters_url_list = []
293
- for toc_content in all_tocs_content:
294
- chapters_url_from_toc_content = self.decoder.get_chapter_urls(toc_content)
295
- if chapters_url_from_toc_content is None:
296
- logger.error('Chapters url not found on toc_content')
297
- return False
298
- # First we save a list of lists in case we need to invert the orderAdd commentMore actions
299
- self.chapters_url_list.append(chapters_url_from_toc_content)
300
-
301
- invert = self.decoder.is_index_inverted()
302
- self.chapters_url_list = [
303
- chapter
304
- for chapters_url in (self.chapters_url_list[::-1] if invert else self.chapters_url_list)
305
- for chapter in chapters_url
306
- ]
307
- add_host_to_chapter = self.scraper_behavior.auto_add_host or self.decoder.add_host_to_chapter()
308
- if add_host_to_chapter:
309
- self.chapters_url_list = [
310
- f'https://{self.host}{chapter_url}' for chapter_url in self.chapters_url_list]
311
- self.chapters_url_list = utils.delete_duplicates(
312
- self.chapters_url_list)
313
- self.save_novel()
314
- self._create_chapters_from_toc()
315
- return True
459
+ Returns:
460
+ Optional[str]: Formatted string showing chapter numbers and URLs, None if no chapters_urls found
461
+ """
316
462
 
317
- def show_toc(self):
318
463
  if not self.chapters_url_list:
319
- return 'No chapters in TOC, reload TOC and try again'
464
+ logger.warning('No chapters in TOC')
465
+ return None
320
466
  toc_str = 'Table Of Contents:'
321
467
  for i, chapter_url in enumerate(self.chapters_url_list):
322
- toc_str += f'\nChapter {i+1}: {chapter_url}'
468
+ toc_str += f'\nChapter {i + 1}: {chapter_url}'
323
469
  return toc_str
324
470
 
325
471
  # CHAPTERS MANAGEMENT
326
472
 
473
+ def get_chapter(self, chapter_index: Optional[int] = None, chapter_url: Optional[str] = None) -> Optional[Chapter]:
474
+ """
475
+ Retrieves a chapter either by its index in the chapter list or by its URL.
476
+
477
+ Args:
478
+ chapter_index (Optional[int]): The index of the chapter in the chapter list
479
+ chapter_url (Optional[str]): The URL of the chapter to retrieve
480
+
481
+ Returns:
482
+ Optional[Chapter]: The requested chapter if found, None otherwise
483
+
484
+ Raises:
485
+ ValidationError: If neither index nor url is provided, or if both are provided
486
+ IndexError: If the provided index is out of range
487
+ """
488
+ if not utils.check_exclusive_params(chapter_index, chapter_url):
489
+ raise ValidationError("Exactly one of 'chapter_index' or 'chapter_url' must be provided")
490
+
491
+ if chapter_url is not None:
492
+ chapter_index = self._find_chapter_index_by_url(chapter_url)
493
+
494
+ if chapter_index is not None:
495
+ if chapter_index < 0:
496
+ raise ValueError("Index must be positive")
497
+ try:
498
+ return self.chapters[chapter_index]
499
+ except IndexError:
500
+ logger.warning(f"No chapter found at index {chapter_index}")
501
+ return None
502
+ logger.warning(f"No chapter found with url {chapter_url}")
503
+ return None
504
+
327
505
  def show_chapters(self) -> str:
506
+ """
507
+ Generates a text representation of all novel chapters.
508
+
509
+ Returns:
510
+ str: Formatted string containing the list of chapters with their information:
511
+ - Chapter number
512
+ - Title (if available)
513
+ - URL
514
+ - HTML filename (if available)
515
+
516
+ Note:
517
+ Output format is:
518
+ Chapters List:
519
+ Chapter 1:
520
+ Title: [title or message]
521
+ URL: [url]
522
+ Filename: [filename or message]
523
+ ...
524
+ """
525
+
328
526
  chapter_list = "Chapters List:\n"
329
527
  for i, chapter in enumerate(self.chapters):
330
528
  chapter_list += f"Chapter {i + 1}:\n"
@@ -333,105 +531,166 @@ class Novel:
333
531
  chapter_list += f" Filename: {chapter.chapter_html_filename if chapter.chapter_html_filename else 'File not yet requested'}\n"
334
532
  return chapter_list
335
533
 
336
def scrap_chapter(self, chapter: Chapter, reload_file: bool = False) -> Chapter:
    """
    Processes and decodes a specific chapter of the novel.

    This method handles the complete scraping process for an individual chapter,
    including HTML loading or requesting and content decoding.

    Args:
        chapter (Chapter): Chapter object to process
        reload_file (bool, optional): If True, forces a new download of the chapter
            even if it already exists locally. Defaults to False.

    Returns:
        Chapter: The updated Chapter object with decoded content

    Raises:
        ValidationError: If there are issues with the values of the provided Chapter object
        ScraperError: If no HTML content could be obtained for the chapter
        DecodeError: If there are issues during content decoding
        NetworkError: If there are issues during HTML request
        FileManagerError: If there are issues during file operations
    """
    logger.debug('Scraping Chapter...')
    if chapter.chapter_url is None:
        logger.error('Chapter trying to be scrapped does not have a URL')
        raise ValidationError('Chapter trying to be scrapped does not have a URL')

    logger.debug(f'Using chapter url: {chapter.chapter_url}')

    if reload_file:
        logger.debug('Reload file Flag present, HTML will be requested...')

    # FIX: these error messages previously read "Could get chapter ..."
    # (missing "not"); corrected to "Could not get chapter ...".
    try:
        chapter = self._load_or_request_chapter(chapter,
                                                reload_file=reload_file)
    except ValidationError as e:
        logger.error(f'Could not get chapter for URL "{chapter.chapter_url}" HTML content. Validation Error',
                     exc_info=e)
        raise
    except FileManagerError as e:
        logger.error(f'Could not get chapter for URL "{chapter.chapter_url}" HTML content. File Manager Error',
                     exc_info=e)
        raise
    except NetworkError as e:
        logger.error(f'Could not get chapter for URL "{chapter.chapter_url}" HTML content. Network Error',
                     exc_info=e)
        raise

    if not chapter.chapter_html:
        logger.error(f'Could not get HTML content for chapter with URL "{chapter.chapter_url}"')
        raise ScraperError(f'Could not get HTML content for chapter with URL "{chapter.chapter_url}"')

    # We get the chapter title and content.
    # The decoder configuration may also require embedding the title in the content.
    save_title_to_content = (self.scraper_behavior.save_title_to_content or
                             self.decoder.save_title_to_content())
    try:
        chapter = self._decode_chapter(chapter=chapter,
                                       save_title_to_content=save_title_to_content)
    except (DecodeError, ValidationError) as e:
        # Both failure modes were logged with the same message; one handler suffices.
        logger.error(f'Could not decode HTML title and content for chapter with URL "{chapter.chapter_url}"',
                     exc_info=e)
        raise

    logger.info(f'Chapter scrapped from link: {chapter.chapter_url}')
    return chapter
 
375
def request_all_chapters(self,
                         sync_toc: bool = True,
                         reload_files: bool = False,
                         clean_chapters: bool = False) -> None:
    """
    Requests and processes all chapters of the novel.

    This method performs scraping of all available chapters in the novel,
    handling the loading and decoding of each one.

    Args:
        sync_toc (bool, optional): If True, syncs the table of contents
        reload_files (bool, optional): If True, forces a new download of all
            chapters, even if they already exist locally. Defaults to False.
        clean_chapters (bool, optional): If True, cleans the HTML content of the files

    Note:
        - Process is performed sequentially for each chapter
        - Errors in individual chapters (file, network or validation errors)
          don't stop the complete process; the chapter is skipped
        - Progress is logged through the logging system
    """
    logger.debug('Requesting all chapters...')
    if sync_toc:
        logger.debug('Sync TOC flag present, syncing TOC...')
        try:
            self.sync_toc(reload_files=False)
        except ScraperError:
            logger.warning('Error when trying to sync TOC, continuing without syncing...')

    if len(self.chapters_url_list) == 0:
        logger.warning('No chapters in TOC, returning without requesting any...')
        return None

    # We request the HTML files of all the chapters
    # The chapter will be requested again if:
    # 1. Reload files flag is True (Requested by user)
    chapters_obtained = 0
    total_chapters = len(self.chapters)
    for i in range(total_chapters):
        logger.info(f'Requesting chapter {i + 1} of {total_chapters}')
        try:
            self.chapters[i] = self._load_or_request_chapter(chapter=self.chapters[i],
                                                             reload_file=reload_files)
        except (FileManagerError, NetworkError):
            # FIX: NetworkError previously propagated out of the loop and aborted
            # the whole run, contradicting the documented per-chapter isolation.
            logger.warning(f'Error requesting chapter {i + 1} with url {self.chapters[i].chapter_url}, Skipping...')
            continue
        except ValidationError:
            logger.warning(f'Error validating chapter {i + 1} with url {self.chapters[i].chapter_url}, Skipping...')
            continue

        if not self.chapters[i].chapter_html:
            logger.warning(f'Error requesting chapter {i + 1} with url {self.chapters[i].chapter_url}')
            continue

        if clean_chapters:
            self._clean_chapter(self.chapters[i].chapter_html_filename)
        # Persist progress after every successful chapter.
        self.save_novel()
        chapters_obtained += 1
    logger.info(f'Successfully requested {chapters_obtained} of {total_chapters} chapters.')
    return None
421
670
 
422
- # EPUB CREATION
671
+ # EPUB CREATION
423
672
 
424
673
  def save_novel_to_epub(self,
425
674
  sync_toc: bool = False,
426
675
  start_chapter: int = 1,
427
676
  end_chapter: int = None,
428
677
  chapters_by_book: int = 100) -> None:
678
+ logger.debug('Saving novel to epub...')
429
679
  if sync_toc:
430
- self.sync_toc()
680
+ logger.debug('Sync TOC flag present, syncing TOC...')
681
+ try:
682
+ self.sync_toc(reload_files=False)
683
+ except ScraperError:
684
+ logger.warning('Error when trying to sync TOC, continuing without syncing...')
685
+
686
+ if start_chapter < 1:
687
+ logger.error('Start chapter is invalid.')
688
+ raise ValidationError('Start chapter is invalid.')
431
689
 
432
690
  if start_chapter > len(self.chapters):
433
- logger.info(f'The start chapter is bigger than the number of chapters saved ({len(self.chapters)})')
434
- return
691
+ logger.error(f'The start chapter is bigger than the number of chapters saved ({len(self.chapters)})')
692
+ raise ValidationError(
693
+ f'The start chapter is bigger than the number of chapters saved ({len(self.chapters)})')
435
694
 
436
695
  if not end_chapter:
437
696
  end_chapter = len(self.chapters)
@@ -443,22 +702,19 @@ class Novel:
443
702
  idx = 1
444
703
  start = start_chapter
445
704
  while start <= end_chapter:
446
- end = min(start + chapters_by_book - 1, end_chapter)
705
+ end = min(start + chapters_by_book - 1,
706
+ end_chapter)
447
707
  result = self._save_chapters_to_epub(start_chapter=start,
448
708
  end_chapter=end,
449
709
  collection_idx=idx)
450
710
  if not result:
451
711
  logger.critical(f'Error with saving novel to epub, with start chapter: '
452
712
  f'{start_chapter} and end chapter: {end_chapter}')
453
- return False
454
713
  start = start + chapters_by_book
455
714
  idx = idx + 1
456
- return True
457
-
458
715
 
459
716
  ## UTILS
460
717
 
461
-
462
718
  def clean_files(self, clean_chapters: bool = True, clean_toc: bool = True, hard_clean: bool = False) -> None:
463
719
  hard_clean = hard_clean or self.scraper_behavior.hard_clean
464
720
  if clean_chapters:
@@ -470,8 +726,7 @@ class Novel:
470
726
  self._clean_toc(hard_clean)
471
727
 
472
728
def show_novel_dir(self) -> str:
    """Return the novel's base directory on disk, rendered as a string."""
    base_dir = self.file_manager.novel_base_dir
    return str(base_dir)
475
730
 
476
731
  ## PRIVATE HELPERS
477
732
 
@@ -492,9 +747,25 @@ class Novel:
492
747
  tocs_content = self.file_manager.get_all_toc()
493
748
  for i, toc in enumerate(tocs_content):
494
749
  toc = self.decoder.clean_html(toc, hard_clean=hard_clean)
495
- self.file_manager.update_toc(toc, i)
750
+ self.file_manager.update_toc(idx=i,
751
+ html=toc)
496
752
 
497
753
  def _request_html_content(self, url: str) -> Optional[str]:
754
+ """
755
+ Performs an HTTP request to retrieve HTML content from a URL.
756
+
757
+ Args:
758
+ url (str): The URL of the webpage to request
759
+
760
+ Returns:
761
+ Optional[str]: The HTML content of the webpage if the request is successful,
762
+ None otherwise
763
+
764
+ Note:
765
+ This method uses the decoder configuration and scraper behavior
766
+ to handle HTTP requests, including retries and timeouts.
767
+ """
768
+
498
769
  request_config = self.decoder.request_config
499
770
  force_flaresolver = request_config.get('force_flaresolver') or self.scraper_behavior.force_flaresolver
500
771
  html_content = get_html_content(url,
@@ -504,135 +775,331 @@ class Novel:
504
775
  force_flaresolver=force_flaresolver)
505
776
  return html_content
506
777
 
507
def _load_or_request_chapter(self,
                             chapter: Chapter,
                             reload_file: bool = False) -> Chapter:
    """
    Loads or requests a chapter's HTML content from a local file or a URL.

    This method first attempts to load the chapter content from a local file.
    If not possible or if reload is requested, it fetches the content from the web.

    Args:
        chapter (Chapter): Chapter object containing chapter information.
        reload_file (bool, optional): If True, forces a new web request
            regardless of local file existence. Defaults to False.

    Returns:
        Chapter: The Chapter object updated with HTML content.

    Raises:
        FileManagerError: If there's an error loading or saving the chapter file.
        ValidationError: If there's a validation error when requesting the chapter.
        NetworkError: If there's a network error when requesting the chapter.

    Note:
        - If the file doesn't exist locally, a web request will be made.
        - If the file exists but is empty, a web request will be made.
        - File saving errors are logged as warnings but don't stop execution.
    """
    # Generate a filename if needed
    if not chapter.chapter_html_filename:
        logger.debug('Generating a filename for the chapter')
        chapter.chapter_html_filename = utils.generate_file_name_from_url(
            chapter.chapter_url)

    # The HTML will be requested again if:
    # 1. "Reload file" flag is True (requested by user)
    # 2. Chapter file does not exist
    # 3. The Chapter file does exist, but there is no content
    reload_file = reload_file or not self.file_manager.chapter_file_exists(chapter.chapter_html_filename)
    # Try loading from the disk first
    if not reload_file:
        try:
            logger.debug(f'Loading chapter HTML from file: "{chapter.chapter_html_filename}"')
            chapter.chapter_html = self.file_manager.load_chapter_html(chapter.chapter_html_filename)
        except FileManagerError as e:
            logger.error(f'Error when trying to load chapter {chapter.chapter_title} from file', exc_info=e)
            raise
        if chapter.chapter_html is not None:
            return chapter

    # Fetch fresh content
    try:
        logger.debug(f'Requesting chapter HTML from URL: "{chapter.chapter_url}"')
        chapter.chapter_html = self._request_html_content(chapter.chapter_url)
    except (ValidationError, NetworkError):
        # FIX: the two separate handlers had byte-identical bodies; merged.
        logger.error(
            f'Error when trying to request chapter {chapter.chapter_title} from url: {chapter.chapter_url}')
        raise

    # If the requests failed, we will let the higher methods decide if they throw an error.
    if not chapter.chapter_html:
        logger.error(f'No content found on link {chapter.chapter_url}')
        return chapter

    # Save content
    try:
        logger.info(f'Saving chapter HTML to file: "{chapter.chapter_html_filename}"')
        self.file_manager.save_chapter_html(chapter.chapter_html_filename,
                                            chapter.chapter_html)
    except FileManagerError as e:
        # We can pass this error and try again later
        logger.warning(f'Error when trying to save chapter {chapter.chapter_title} to file', exc_info=e)

    return chapter
 
534
def _request_toc_files(self):
    """
    Requests and stores all table of contents (TOC) files from the novel's website.

    This method handles both paginated and non-paginated TOCs:
        - For non-paginated TOCs: Downloads and stores a single TOC file
        - For paginated TOCs: Iteratively downloads all TOC pages until no next page is found

    The method first clears any existing TOC files before downloading new ones.

    Raises:
        NetworkError: If there's an error during the HTTP request
        ValidationError: If no content is found at the TOC URL
        DecodeError: If there's an error parsing the next page URL

    Note:
        This is an internal method that uses the decoder configuration to determine
        pagination behavior and to parse TOC content.
    """

    def _get_toc(toc_url: str, get_next_page: bool) -> str | None:
        # Some TOCs next page links have incomplete URLS (e.g., /page/2)
        if utils.check_incomplete_url(toc_url):
            toc_url = self.toc_main_url + toc_url
            logger.debug(f'Toc link is incomplete, trying with toc link: "{toc_url}"')

        # Fetch fresh content
        logger.debug(f'Requesting TOC from link: "{toc_url}"')
        try:
            toc_content = self._request_html_content(toc_url)
        except NetworkError as e:
            logger.error(f'Error with network, error: {e}')
            raise

        if not toc_content:
            logger.error(f'No content found on link "{toc_url}"')
            raise ValidationError(f'No content found on link "{toc_url}"')

        logger.debug('Saving new TOC file to disk.')
        self.file_manager.add_toc(toc_content)

        if get_next_page:
            logger.debug(f'Parsing next page from link: {toc_url}')
            # DecodeError propagates to the caller unchanged (the previous
            # try/except here only re-raised, so it was removed).
            return self.decoder.get_toc_next_page_url(toc_content)
        return None

    # Start from a clean slate so stale TOC fragments never linger.
    self.file_manager.delete_toc()
    has_pagination = self.decoder.has_pagination()

    if not has_pagination:
        logger.debug('TOC does not have pagination, requesting only one file.')
        _get_toc(self.toc_main_url, get_next_page=False)
    else:
        logger.debug('TOC has pagination, requesting all files.')
        next_page_url = self.toc_main_url
        while next_page_url:
            next_page_url = _get_toc(next_page_url, get_next_page=True)
919
def _load_or_request_chapter_urls_from_toc(self) -> None:
    """
    Extracts and processes chapter URLs from the table of contents.

    Raises:
        FileManagerError: If the TOC files cannot be loaded from disk
        DecodeError: If fails to decode chapter URLs from TOC content
    """
    # Get configuration
    is_inverted = self.decoder.is_index_inverted()
    add_host_to_chapter = self.scraper_behavior.auto_add_host or self.decoder.add_host_to_chapter()

    # Get all TOC content at once
    try:
        all_tocs = self.file_manager.get_all_toc()
    except FileManagerError:
        logger.error('Error when trying to load TOC files from disk.')
        raise

    # Extract URLs from all TOC fragments
    self.chapters_url_list = []
    for toc_content in all_tocs:
        try:
            self.chapters_url_list.extend(self.decoder.get_chapter_urls(toc_content))
        except DecodeError as e:
            logger.error('Failed to decode chapter URLs from TOC content', exc_info=e)
            raise

    # Handle inversion if needed
    if is_inverted:
        logger.debug('Inverting chapter URLs order')
        self.chapters_url_list.reverse()  # In-place reversal is more efficient

    # Add host if needed
    if add_host_to_chapter:
        logger.debug('Adding host to chapter URLs')
        self.chapters_url_list = [f'https://{self.host}{url}' for url in self.chapters_url_list]

    # NOTE(review): de-duplication is intentionally disabled — URLs are kept exactly
    # as found in the TOC (duplicates included):
    # self.chapters_url_list = utils.delete_duplicates(self.chapters_url_list)

    # FIX: the message no longer claims the URLs are "unique", since the
    # de-duplication step above is disabled.
    logger.info(f'Successfully extracted {len(self.chapters_url_list)} chapter URLs')
def _create_chapters_from_toc(self):
    """
    Synchronizes existing chapters with the table of contents (TOC) URL list.

    This method performs the following operations:
        1. Removes chapters whose URLs are no longer in the TOC
        2. Adds new chapters for URLs found in the TOC
        3. Reorders chapters according to the TOC sequence

    Raises:
        ValidationError: If there's an error when creating a new chapter

    Note:
        This is an internal method used to maintain consistency
        between chapters and the table of contents.
    """
    existing_urls = {chapter.chapter_url for chapter in self.chapters}
    toc_urls_set = set(self.chapters_url_list)

    # Find chapters to remove and new chapters to add
    urls_to_remove = existing_urls - toc_urls_set
    urls_to_add = toc_urls_set - existing_urls

    if urls_to_remove:
        logger.info(f'Removing {len(urls_to_remove)} chapters not found in TOC')
        self.chapters = [ch for ch in self.chapters if ch.chapter_url not in urls_to_remove]

    if urls_to_add:
        logger.info(f'Adding {len(urls_to_add)} new chapters from TOC')
        # Iterate the TOC list (not the set) so new chapters keep TOC order.
        for url in self.chapters_url_list:
            if url in urls_to_add:
                try:
                    self.chapters.append(Chapter(chapter_url=url))
                except ValidationError as e:
                    logger.error(f'Failed to create chapter for URL {url}: {e}')
                    raise

    # Reorder according to TOC.
    # FIX: sorting with `list.index` in the key was accidentally O(n^2);
    # precompute each URL's first TOC position once so the key is O(1).
    logger.debug('Reordering chapters according to TOC')
    toc_positions = {}
    for position, url in enumerate(self.chapters_url_list):
        toc_positions.setdefault(url, position)  # keep first occurrence, like list.index
    self.chapters.sort(key=lambda ch: toc_positions[ch.chapter_url])

    logger.info(f'Chapter synchronization complete. Total chapters: {len(self.chapters)}')
 
584
def _add_or_update_chapter_data(self, chapter: Chapter, save_in_file: bool = True) -> None:
    """
    Insert *chapter* into the chapter list, or merge its data into the
    existing entry that shares the same URL.

    Only a truthy title/filename overwrites the stored value, so a partial
    chapter object never erases previously gathered data. Optionally
    persists the novel afterwards.
    """
    existing_idx = self._find_chapter_index_by_url(chapter.chapter_url)
    if existing_idx is None:
        # Unknown URL: register the chapter as a brand-new entry.
        self.chapters.append(chapter)
    else:
        stored = self.chapters[existing_idx]
        if chapter.chapter_title:
            stored.chapter_title = chapter.chapter_title
        if chapter.chapter_html_filename:
            stored.chapter_html_filename = chapter.chapter_html_filename

    if save_in_file:
        self.save_novel()
 
611
def _find_chapter_index_by_url(self, chapter_url: str) -> Optional[int]:
    """
    Find the chapter index by its URL in the chapter list.

    Args:
        chapter_url: URL of the chapter to find

    Returns:
        Optional[int]: Index of the first matching chapter, None otherwise
    """
    for position, candidate in enumerate(self.chapters):
        if candidate.chapter_url == chapter_url:
            # Stop at the first match, mirroring a short-circuit search.
            return position
    return None
+ def _decode_chapter(self,
1044
+ chapter: Chapter,
1045
+ save_title_to_content: bool = False) -> Chapter:
1046
+ """
1047
+ Decodes a chapter's HTML content to extract title and content.
1048
+
1049
+ This method processes the HTML content of a chapter to extract its title and content.
1050
+ If no title is found, it auto-generates one using the chapter's index in the URL list.
1051
+
1052
+ Args:
1053
+ chapter (Chapter): Chapter object containing the HTML content to decode.
1054
+ save_title_to_content (bool, optional): Whether to include the title in the
1055
+ chapter content. Defaults to False.
1056
+
1057
+ Returns:
1058
+ Chapter: The updated Chapter object with decoded title and content.
616
1059
 
617
- if not chapter.chapter_html:
618
- raise ValueError(f'Chapter HTML could not be obtained for chapter link "{chapter.chapter_url}" '
619
- f'on file "{chapter.chapter_html_filename}"')
1060
+ Raises:
1061
+ ScraperError: If the chapter's HTML content is None.
1062
+ DecodeError: If there's an error decoding the chapter's title or content.
1063
+
1064
+ Note:
1065
+ - If no title is found, it will be auto-generated as "{novel_title} Chapter {index}".
1066
+ - The chapter's HTML must be loaded before calling this method.
1067
+ """
1068
+
1069
+ logger.debug(f'Decoding chapter with URL {chapter.chapter_url}...')
1070
+ if chapter.chapter_html is None:
1071
+ logger.error(f'Chapter HTML not found for chapter with URL "{chapter.chapter_url}"')
1072
+ raise ScraperError(f'Chapter HTML not found for chapter with URL "{chapter.chapter_url}"')
620
1073
 
621
1074
  logger.debug('Obtaining chapter title...')
622
- chapter_title = self.decoder.get_chapter_title(chapter.chapter_html)
623
- if not chapter_title:
624
- logger.debug('No chapter title found, generating one...')
625
- chapter_title = f'{self.title} Chapter {idx_for_chapter_name}'
626
- chapter.chapter_title = str(chapter_title)
627
- logger.debug(f'Chapter title: "{chapter_title}"')
1075
+ try:
1076
+ chapter_title = self.decoder.get_chapter_title(chapter.chapter_html)
1077
+ except DecodeError as e:
1078
+ logger.error(f'Failed to decode chapter title from HTML content: {e}')
1079
+ raise
628
1080
 
629
- logger.debug('Obtaining chapter content...')
630
- save_title_to_content = self.scraper_behavior.save_title_to_content or self.decoder.save_title_to_content()
631
- chapter.chapter_content = self.decoder.get_chapter_content(chapter.chapter_html,
632
- save_title_to_content,
633
- chapter.chapter_title)
634
- logger.debug('Chapter successfully decoded')
1081
+ if chapter_title is None:
1082
+ logger.debug('No chapter title found, trying to autogenerate one...')
1083
+ try:
1084
+ chapter_idx = self.chapters_url_list.index(chapter.chapter_url)
1085
+ except ValueError:
1086
+ chapter_idx = ""
1087
+
1088
+ chapter_title = f'{self.title} Chapter {chapter_idx}'
635
1089
 
1090
+ chapter.chapter_title = chapter_title
1091
+ logger.info(f'Chapter title: "{chapter_title}"')
1092
+
1093
+ logger.debug('Obtaining chapter content...')
1094
+ try:
1095
+ chapter.chapter_content = self.decoder.get_chapter_content(chapter.chapter_html,
1096
+ save_title_to_content,
1097
+ chapter.chapter_title)
1098
+ except DecodeError:
1099
+ logger.error(f'Failed to decode chapter content for chapter with URL "{chapter.chapter_url}"')
1100
+ raise
1101
+
1102
+ logger.debug('Chapter title and content successfully decoded from HTML')
636
1103
  return chapter
637
1104
 
638
1105
  def _create_epub_book(self, book_title: str = None, calibre_collection: dict = None) -> epub.EpubBook:
@@ -661,7 +1128,7 @@ class Novel:
661
1128
  # date_metadata += f'/{self.metadata.end_date}'
662
1129
  if self.metadata.end_date:
663
1130
  book.add_metadata('OPF', 'meta', self.metadata.end_date, {
664
- 'name': 'end_date', 'content': self.metadata.end_date})
1131
+ 'name': 'end_date', 'content': self.metadata.end_date})
665
1132
  if date_metadata:
666
1133
  logger.debug(f'Using date_metadata {date_metadata}')
667
1134
  book.add_metadata('DC', 'date', date_metadata)
@@ -669,12 +1136,13 @@ class Novel:
669
1136
  # Collections with calibre
670
1137
  if calibre_collection:
671
1138
  book.add_metadata('OPF', 'meta', '', {
672
- 'name': 'calibre:series', 'content': calibre_collection["title"]})
1139
+ 'name': 'calibre:series', 'content': calibre_collection["title"]})
673
1140
  book.add_metadata('OPF', 'meta', '', {
674
- 'name': 'calibre:series_index', 'content': calibre_collection["idx"]})
1141
+ 'name': 'calibre:series_index', 'content': calibre_collection["idx"]})
675
1142
 
676
1143
  cover_image_content = self.file_manager.load_novel_cover()
677
1144
  if cover_image_content:
1145
+ breakpoint()
678
1146
  book.set_cover('cover.jpg', cover_image_content)
679
1147
  book.spine += ['cover']
680
1148
 
@@ -682,11 +1150,10 @@ class Novel:
682
1150
  return book
683
1151
 
684
1152
  def _add_chapter_to_epub_book(self, chapter: Chapter, book: epub.EpubBook):
685
- chapter = self.scrap_chapter(
686
- chapter_url=chapter.chapter_url)
1153
+ chapter = self.scrap_chapter(chapter)
687
1154
  if chapter is None:
688
1155
  logger.warning('Error reading chapter')
689
- return
1156
+ return None
690
1157
  self._add_or_update_chapter_data(
691
1158
  chapter=chapter, save_in_file=False)
692
1159
  file_name = utils.generate_epub_file_name_from_title(
@@ -708,10 +1175,9 @@ class Novel:
708
1175
  start_chapter: int,
709
1176
  end_chapter: int = None,
710
1177
  collection_idx: int = None):
711
-
712
1178
  if start_chapter > len(self.chapters):
713
1179
  logger.error('start_chapter out of range')
714
- return
1180
+ return None
715
1181
  # If end_chapter is not set, we set it to idx_start + chapters_num - 1
716
1182
  if not end_chapter:
717
1183
  end_chapter = len(self.chapters)
@@ -725,7 +1191,7 @@ class Novel:
725
1191
  # We create the epub book
726
1192
  book_title = f'{self.title} Chapters {start_chapter} - {end_chapter}'
727
1193
  calibre_collection = None
728
- # If collection_idx is set, we create a calibre collection
1194
+ # If collection_idx is set, we create a Calibre collection
729
1195
  if collection_idx:
730
1196
  calibre_collection = {'title': self.title,
731
1197
  'idx': str(collection_idx)}
@@ -735,11 +1201,16 @@ class Novel:
735
1201
  book = self._add_chapter_to_epub_book(chapter=chapter,
736
1202
  book=book)
737
1203
  if book is None:
738
- logger.critical(f'Error saving epub {book_title}, could not decode chapter {chapter} using host {self.host}')
1204
+ logger.critical(
1205
+ f'Error saving epub {book_title}, could not decode chapter {chapter} using host {self.host}')
739
1206
  return False
740
1207
 
741
1208
  book.add_item(epub.EpubNcx())
742
1209
  book.add_item(epub.EpubNav())
743
- self.file_manager.save_book(book, f'{book_title}.epub')
1210
+ try:
1211
+ self.file_manager.save_book(book, f'{book_title}.epub')
1212
+ except FileManagerError:
1213
+ logger.error(f'Error saving epub {book_title}')
1214
+ raise
744
1215
  self.save_novel()
745
1216
  return True