web-novel-scraper 2.0.3__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,163 +1,86 @@
1
- from dataclasses import dataclass, fields, field
2
- import sys
1
+ from dataclasses import dataclass, field, replace
3
2
 
4
- from dataclasses_json import dataclass_json, config, Undefined
3
+ from dataclasses_json import dataclass_json, Undefined, config
5
4
  from ebooklib import epub
6
5
  from typing import Optional
6
+ from pathlib import Path
7
7
 
8
8
  from . import logger_manager
9
9
  from .decode import Decoder
10
10
  from .file_manager import FileManager
11
11
  from . import utils
12
-
13
12
  from .request_manager import get_html_content
14
13
  from .config_manager import ScraperConfig
14
+ from .models import ScraperBehavior, Metadata, Chapter
15
+ from .utils import _always, ScraperError, FileManagerError, NetworkError, ValidationError, DecodeError
15
16
 
16
17
  logger = logger_manager.create_logger('NOVEL SCRAPPING')
17
18
 
18
19
 
19
- @dataclass_json
20
- @dataclass
21
- class Metadata:
22
- author: Optional[str] = None
23
- start_date: Optional[str] = None
24
- end_date: Optional[str] = None
25
- language: Optional[str] = "en"
26
- description: Optional[str] = None
27
- tags: list[str] = field(default_factory=list)
28
-
29
- def update_behavior(self, **kwargs):
30
- """
31
- Updates the behavior configuration dynamically.
32
- Only updates the attributes provided in kwargs.
33
- """
34
- for key, value in kwargs.items():
35
- if hasattr(self, key) and value is not None:
36
- setattr(self, key, value)
37
-
38
- def __str__(self):
39
- """
40
- Dynamic string representation of the configuration.
41
- """
42
- attributes = [(f"{field.name}="
43
- f"{getattr(self, field.name)}") for field in fields(self)]
44
- attributes_str = '\n'.join(attributes)
45
- return (f"Metadata: \n"
46
- f"{attributes_str}")
47
-
48
-
49
- @dataclass_json
50
- @dataclass
51
- class ScraperBehavior:
52
- # Some novels already have the title in the content.
53
- save_title_to_content: bool = False
54
- # Some novels have the toc link without the host
55
- auto_add_host: bool = False
56
- # Some hosts return 403 when scraping, this will force the use of FlareSolver
57
- # to save time
58
- force_flaresolver: bool = False
59
- # When you clean the html files, you can use hard clean by default
60
- hard_clean: bool = False
61
-
62
- def update_behavior(self, **kwargs):
63
- """
64
- Updates the behavior configuration dynamically.
65
- Only updates the attributes provided in kwargs.
66
- """
67
- for key, value in kwargs.items():
68
- if hasattr(self, key) and value is not None:
69
- setattr(self, key, value)
70
-
71
- def __str__(self):
72
- """
73
- Dynamic string representation of the configuration.
74
- """
75
- attributes = [(f"{field.name}="
76
- f"{getattr(self, field.name)}") for field in fields(self)]
77
- attributes_str = '\n'.join(attributes)
78
- return (f"Scraper Behavior: \n"
79
- f"{attributes_str}")
80
-
81
-
82
- @dataclass_json(undefined=Undefined.EXCLUDE)
83
- @dataclass
84
- class Chapter:
85
- chapter_url: str
86
- chapter_html_filename: Optional[str] = None
87
- chapter_title: Optional[str] = None
88
-
89
- def __init__(self,
90
- chapter_url: str,
91
- chapter_html: str = None,
92
- chapter_content: str = None,
93
- chapter_html_filename: str = None,
94
- chapter_title: str = None):
95
- self.chapter_url = chapter_url
96
- self.chapter_html = chapter_html
97
- self.chapter_content = chapter_content
98
- self.chapter_html_filename = chapter_html_filename
99
- self.chapter_title = chapter_title
100
-
101
- def __str__(self):
102
- return f'Title: "{self.chapter_title}"\nURL: "{self.chapter_url}"\nFilename: "{self.chapter_html_filename}"'
103
-
104
- def __lt__(self, another):
105
- return self.chapter_title < another.chapter_title
106
-
107
-
108
20
  @dataclass_json(undefined=Undefined.EXCLUDE)
109
21
  @dataclass
110
22
  class Novel:
111
- metadata: Metadata = None
112
- title: str = None
113
- scraper_behavior: ScraperBehavior = None
114
- chapters: list[Chapter] = field(default_factory=list)
23
+ """
24
+ A class representing a web novel with its metadata and content.
25
+
26
+ This class handles all operations related to scraping, storing, and managing web novels,
27
+ including their chapters, table of contents, and metadata.
28
+
29
+ Attributes:
30
+ title (str): The title of the novel.
31
+ host (Optional[str]): The host domain where the novel is located.
32
+ toc_main_url (Optional[str]): The main URL for the table of contents.
33
+ chapters (list[Chapter]): List of chapters in the novel.
34
+ chapters_url_list (list[str]): List of URLs for all chapters.
35
+ metadata (Metadata): Novel metadata like author, language, etc.
36
+ scraper_behavior (ScraperBehavior): Configuration for scraping behavior.
37
+ file_manager (FileManager): Handles file operations for the novel.
38
+ decoder (Decoder): Handles HTML decoding and parsing.
39
+ config (ScraperConfig): General scraper configuration.
40
+ """
41
+
42
+ title: str
43
+ host: Optional[str] = None
115
44
  toc_main_url: Optional[str] = None
45
+ chapters: list[Chapter] = field(default_factory=list)
116
46
  chapters_url_list: list[str] = field(default_factory=list)
117
- host: str = None
118
-
119
- def __init__(self,
120
- title: str,
121
- toc_main_url: str = None,
122
- toc_html: str = None,
123
- chapters_url_list: list[str] = None,
124
- metadata: Metadata = None,
125
- chapters: list[Chapter] = None,
126
- scraper_behavior: ScraperBehavior = None,
127
- host: str = None
128
- ):
129
- if toc_main_url and toc_html:
130
- logger.critical('There can only be one or toc_main_url or toc_html')
131
- raise ValueError('There can only be one or toc_main_url or toc_html')
132
-
133
- self.title = title
134
- self.metadata = Metadata()
135
- if metadata is not None:
136
- self.metadata = metadata
137
-
138
- if toc_html:
139
- self.file_manager.add_toc(toc_html)
140
-
141
- self.toc_main_url = toc_main_url
142
- self.chapters_url_list = chapters_url_list if chapters_url_list else []
143
-
144
- self.chapters = chapters if chapters else []
145
-
146
- self.scraper_behavior = scraper_behavior if scraper_behavior else ScraperBehavior()
147
- if not host and not toc_main_url:
148
- logger.error('You need to set "host" or "toc_main_url".')
149
- sys.exit(1)
47
+ metadata: Metadata = field(default_factory=Metadata)
48
+ scraper_behavior: ScraperBehavior = field(default_factory=ScraperBehavior)
49
+
50
+ file_manager: FileManager = field(default=None,
51
+ repr=False,
52
+ compare=False,
53
+ metadata=config(exclude=_always))
54
+ decoder: Decoder = field(default=None,
55
+ repr=False,
56
+ compare=False,
57
+ metadata=config(exclude=_always))
58
+ config: ScraperConfig = field(default=None,
59
+ repr=False,
60
+ compare=False,
61
+ metadata=config(exclude=_always))
62
+
63
+ def __post_init__(self):
64
+ """
65
+ Validates the novel instance after initialization.
150
66
 
151
- self.host = host if host else utils.obtain_host(self.toc_main_url)
67
+ Raises:
68
+ ValidationError: If the title is empty or neither host nor toc_main_url is provided.
69
+ """
152
70
 
153
- self.config = None
154
- self.file_manager = None
155
- self.decoder = None
71
+ if not self.title:
72
+ raise ValidationError("title can't be empty")
73
+ if not (self.host or self.toc_main_url):
74
+ raise ValidationError('You must provide "host" or "toc_main_url"')
156
75
 
157
76
  def __str__(self):
158
77
  """
159
- Dynamic string representation of the novel.
78
+ Returns a string representation of the novel with its main attributes.
79
+
80
+ Returns:
81
+ str: A formatted string containing the novel's main information.
160
82
  """
83
+
161
84
  toc_info = self.toc_main_url if self.toc_main_url else "TOC added manually"
162
85
  attributes = [
163
86
  f"Title: {self.title}",
@@ -172,99 +95,316 @@ class Novel:
172
95
  return (f"Novel Info: \n"
173
96
  f"{attributes_str}")
174
97
 
175
- @staticmethod
176
- def load(title: str, cfg: ScraperConfig, novel_base_dir: str | None = None):
98
+ @classmethod
99
+ def load(cls, title: str, cfg: ScraperConfig, novel_base_dir: Path = None) -> 'Novel':
100
+ """
101
+ Loads a novel from stored JSON data.
102
+
103
+ Args:
104
+ title (str): Title of the novel to load.
105
+ cfg (ScraperConfig): Scraper configuration.
106
+ novel_base_dir (Path, optional): Base directory for the novel data.
107
+
108
+ Returns:
109
+ Novel: A new Novel instance loaded from stored data.
110
+
111
+ Raises:
112
+ ValidationError: If the novel with the given title is not found.
113
+ """
114
+
177
115
  fm = FileManager(title, cfg.base_novels_dir, novel_base_dir, read_only=True)
178
116
  raw = fm.load_novel_json()
179
117
  if raw is None:
180
118
  logger.debug(f'Novel "{title}" was not found.')
181
- raise ValueError(f'Novel "{title}" was not found.')
182
- novel = Novel.from_json(raw)
183
- novel.config = cfg
119
+ raise ValidationError(f'Novel "{title}" was not found.')
120
+ novel = cls.from_json(raw)
184
121
  novel.set_config(cfg=cfg, novel_base_dir=novel_base_dir)
185
122
  return novel
186
123
 
124
+ @classmethod
125
+ def new(cls, title: str, cfg: ScraperConfig, host: str = None, toc_html: str = None,
126
+ toc_main_url: str = None) -> 'Novel':
127
+ """Creates a new Novel instance.
128
+
129
+ Args:
130
+ title: Title of the novel (required)
131
+ cfg: Scraper configuration (required)
132
+ host: Host URL for the novel content (optional)
133
+ toc_html: HTML content for the table of contents (optional)
134
+ toc_main_url: URL for the table of contents (optional)
135
+
136
+ Note:
137
+ - Either toc_html or toc_main_url must be provided
138
+ - If toc_main_url is provided, host will be extracted from it if not explicitly provided
139
+ - If toc_html is provided, host must be explicitly provided
140
+
141
+ Returns:
142
+ Novel: A new Novel instance
143
+
144
+ Raises:
145
+ ValidationError: If the title is empty, if neither toc_html nor toc_main_url is provided, or if toc_html is provided without host
146
+ """
147
+ if not title:
148
+ raise ValidationError("Title cannot be empty")
149
+
150
+ if not (toc_html or toc_main_url):
151
+ raise ValidationError("Either toc_html or toc_main_url must be provided")
152
+
153
+ if toc_html and not host:
154
+ raise ValidationError("When providing toc_html, host must be explicitly provided")
155
+
156
+ novel = cls(title=title, host=host, toc_main_url=toc_main_url)
157
+ # If toc_main_url is provided and the host isn't, extract host from URL
158
+ if toc_main_url and not host:
159
+ host = utils.obtain_host(toc_main_url)
160
+ novel.host = host
161
+
162
+ # If toc_html is provided, add it to the novel
163
+ if toc_html:
164
+ novel.add_toc_html(toc_html, host)
165
+
166
+ return novel
167
+
187
168
  # NOVEL PARAMETERS MANAGEMENT
188
169
 
189
170
  def set_config(self,
190
- cfg: ScraperConfig = None,
191
- config_file: str = None,
192
- base_novels_dir: str = None,
193
- novel_base_dir: str = None,
194
- decode_guide_file: str = None):
195
- if cfg is not None:
196
- self.config = cfg
197
- else:
198
- self.config = ScraperConfig(config_file=config_file,
199
- base_novels_dir=base_novels_dir,
200
- decode_guide_file=decode_guide_file)
171
+ cfg: ScraperConfig,
172
+ novel_base_dir: str | None = None) -> None:
173
+ """
174
+ Configures the novel with the provided scraper configuration and base directory.
175
+
176
+ Sets up the file manager and decoder for the novel based on the provided configuration.
177
+
178
+ Args:
179
+ cfg (ScraperConfig): The scraper configuration to use.
180
+ novel_base_dir (str | None, optional): Base directory for the novel files.
181
+ If None, it uses the default directory from configuration.
201
182
 
202
- self.file_manager = FileManager(title=self.title,
203
- base_novels_dir=self.config.base_novels_dir,
204
- novel_base_dir=novel_base_dir)
183
+ Raises:
184
+ FileManagerError: If there's an error when reading the config or decoding guide files.
185
+ """
186
+
187
+ try:
188
+ self.config = cfg
189
+ self.file_manager = FileManager(title=self.title,
190
+ base_novels_dir=self.config.base_novels_dir,
191
+ novel_base_dir=novel_base_dir)
192
+ self.decoder = Decoder(self.host, self.config.decode_guide_file)
193
+ except FileManagerError as e:
194
+ logger.error("Could not set configuration. File Manager Error", exc_info=e)
195
+ raise
196
+
197
+ def set_scraper_behavior(self, **kwargs) -> None:
198
+ """
199
+ Updates the scraper behavior configuration with the provided parameters.
205
200
 
206
- self.decoder = Decoder(self.host, self.config.decode_guide_file)
201
+ Args:
202
+ **kwargs: Keyword arguments for updating scraper behavior settings.
203
+ Can include any valid ScraperBehavior attributes.
204
+ """
207
205
 
208
- def set_scraper_behavior(self, save: bool = False, **kwargs) -> None:
209
- self.scraper_behavior.update_behavior(**kwargs)
206
+ filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None}
207
+ self.scraper_behavior = replace(self.scraper_behavior, **filtered_kwargs)
208
+ logger.info(f'Scraper behavior updated')
210
209
 
211
210
  def set_metadata(self, **kwargs) -> None:
212
- self.metadata.update_behavior(**kwargs)
211
+ """
212
+ Updates the novel's metadata with the provided parameters.
213
+
214
+ Args:
215
+ **kwargs: Keyword arguments for updating metadata.
216
+ Can include any valid Metadata attributes like author, language, etc.
217
+ """
218
+ filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None}
219
+ self.metadata = replace(self.metadata, **filtered_kwargs)
220
+ logger.info(f'Metadata updated')
221
+
222
+ def add_tag(self, tag: str) -> None:
223
+ """
224
+ Adds a new tag to the novel's metadata if it doesn't already exist.
225
+
226
+ Args:
227
+ tag (str): The tag to add to the novel's metadata.
228
+ """
213
229
 
214
- def add_tag(self, tag: str) -> bool:
215
230
  if tag not in self.metadata.tags:
216
- self.metadata.tags.append(tag)
217
- return True
218
- logger.warning(f'Tag "{tag}" already exists on novel {self.title}')
219
- return False
231
+ self.metadata = replace(
232
+ self.metadata, tags=(*self.metadata.tags, tag)
233
+ )
234
+ logger.info('Tag %s added to metadata', tag)
235
+ else:
236
+ logger.debug("Tag %s already present in %s", tag, self.title)
237
+
238
+ def remove_tag(self, tag: str) -> None:
239
+ """
240
+ Removes a tag from the novel's metadata if it exists.
241
+
242
+ Args:
243
+ tag (str): The tag to remove from the novel's metadata.
244
+ """
220
245
 
221
- def remove_tag(self, tag: str) -> bool:
222
246
  if tag in self.metadata.tags:
223
- self.metadata.tags.remove(tag)
224
- return True
225
- logger.warning(f'Tag "{tag}" doesn\'t exist on novel {self.title}')
226
- return False
247
+ self.metadata = replace(self.metadata,
248
+ tags=tuple(t for t in self.metadata.tags if t != tag))
249
+ logger.info('Tag %s removed from metadata', tag)
250
+ else:
251
+ logger.debug("Tag %s not present in %s", tag, self.title)
227
252
 
228
253
  def set_cover_image(self, cover_image_path: str) -> None:
229
- self.file_manager.save_novel_cover(cover_image_path)
254
+ """
255
+ Sets or updates the novel's cover image.
256
+
257
+ Args:
258
+ cover_image_path (str): Path to the cover image file.
259
+
260
+ Raises:
261
+ FileManagerError: If there's an error when saving the cover image.
262
+ """
263
+
264
+ try:
265
+ self.file_manager.save_novel_cover(cover_image_path)
266
+ logger.info('Cover image updated')
267
+ except FileManagerError as e:
268
+ logger.error("Could not update cover. File Manager Error", exc_info=e)
269
+ raise
230
270
 
231
271
  def set_host(self, host: str) -> None:
272
+ """
273
+ Sets or updates the novel's host URL and modifies the decoder.
274
+
275
+ Args:
276
+ host (str): The host URL for the novel.
277
+
278
+ Raises:
279
+ DecodeError: If there's an error when setting up the decoder with the new host.
280
+ """
281
+
232
282
  self.host = host
233
- self.decoder.set_host(host)
283
+ try:
284
+ self.decoder.set_host(host)
285
+ logger.info(f'Host updated to "{self.host}"')
286
+ except ValidationError as e:
287
+ logger.error("Could not set host. Decode Error", exc_info=e)
288
+ raise
289
+
290
+ def save_novel(self) -> None:
291
+ """
292
+ Saves the current state of the novel to disk.
293
+
294
+ Persists all novel data including metadata, chapters, and configuration
295
+ to the novel's JSON file.
296
+
297
+ Raises:
298
+ FileManagerError: If there's an error when saving the novel data.
299
+ """
234
300
 
235
- def save_novel(self, save: bool = True) -> None:
236
- self.file_manager.save_novel_json(self.to_dict())
301
+ try:
302
+ self.file_manager.save_novel_json(self.to_dict())
303
+ logger.info(f'Novel data saved to disk on file "{self.file_manager.novel_json_file}".')
304
+ except FileManagerError as e:
305
+ logger.error("Could not save novel. File Manager Error", exc_info=e)
306
+ raise
237
307
 
238
308
  # TABLE OF CONTENTS MANAGEMENT
239
309
 
240
- def set_toc_main_url(self, toc_main_url: str, host: str = None, update_host: bool = False) -> None:
310
+ def set_toc_main_url(self, toc_main_url: str, update_host: bool = True) -> None:
311
+ """
312
+ Sets the main URL for the table of contents and optionally updates the host.
313
+
314
+ Deletes any existing TOC files as they will be refreshed from the new URL.
315
+ If update_host is True, extracts and updates the host from the new URL.
316
+
317
+ Args:
318
+ toc_main_url: Main URL for the table of contents
319
+ update_host: Whether to update the host based on the URL (default: True)
320
+
321
+ Raises:
322
+ ValidationError: If host extraction fails
323
+ FileManagerError: If TOC deletion fails
324
+ """
325
+
241
326
  self.toc_main_url = toc_main_url
242
- self.file_manager.delete_toc()
243
- if host:
244
- self.host = host
245
- self.decoder = Decoder(self.host)
246
- elif update_host:
247
- self.decoder = Decoder(utils.obtain_host(self.toc_main_url))
327
+ logger.info(f'Main URL updated to "{self.toc_main_url}", TOCs already requested will be deleted.')
328
+ try:
329
+ self.file_manager.delete_toc()
330
+ except FileManagerError as e:
331
+ logger.error("Could not delete TOCs. File Manager Error", exc_info=e)
332
+ raise
333
+
334
+ if update_host:
335
+ new_host = utils.obtain_host(self.toc_main_url)
336
+ logger.debug(f'Update Host flag present, new host is "{new_host}".')
337
+ self.set_host(new_host)
248
338
 
249
339
  def add_toc_html(self, html: str, host: str = None) -> None:
340
+ """
341
+ Adds HTML content as a table of contents fragment.
342
+
343
+ This method is mutually exclusive with using toc_main_url - if a main URL exists,
344
+ it will be cleared. Host must be provided either directly or from a previous configuration.
345
+
346
+ Args:
347
+ html: HTML content to add as TOC fragment
348
+ host: Optional host to set for this content
349
+
350
+ Raises:
351
+ ValidationError: If no host is provided when required
352
+ FileManagerError: If saving TOC content fails
353
+ """
354
+
250
355
  if self.toc_main_url:
356
+ logger.debug(f'TOC main URL is exclusive with manual TOC files, TOC main URL will be deleted.')
251
357
  self.delete_toc()
252
358
  self.toc_main_url = None
253
359
 
254
360
  if host:
255
- self.host = host
256
- self.decoder = Decoder(self.host)
361
+ self.set_host(host)
362
+ else:
363
+ if self.host is None:
364
+ logger.error(f'When using TOC files instead of URLs, host must be provided.')
365
+ raise ValidationError('Host must be provided when using TOC files instead of URLs.')
257
366
  self.file_manager.add_toc(html)
258
- # Delete toc_main_url since they are exclusive
367
+ logger.info('New TOC file added to disk.')
259
368
 
260
369
  def delete_toc(self):
370
+ """
371
+ Deletes all table of contents files and resets chapter data.
372
+
373
+ Clears:
374
+ - All TOC files from disk
375
+ - Chapter list
376
+ - Chapter URL list
377
+
378
+ Raises:
379
+ FileManagerError: If deletion of TOC files fails
380
+ """
381
+
261
382
  self.file_manager.delete_toc()
262
383
  self.chapters = []
263
384
  self.chapters_url_list = []
385
+ logger.info('TOC files deleted from disk.')
386
+
387
+ def sync_toc(self, reload_files: bool = True) -> None:
388
+ """
389
+ Synchronizes the table of contents with stored/remote content.
390
+
391
+ Process:
392
+ 1. Checks if TOC content exists (stored or retrievable)
393
+ 2. Optionally reloads TOC files from remote if needed
394
+ 3. Extracts chapter URLs from TOC content
395
+ 4. Creates/updates chapters based on URLs
396
+
397
+ Args:
398
+ reload_files: Whether to force reload of TOC files from remote (default: True)
399
+
400
+ Raises:
401
+ ScraperError: If no TOC content is available
402
+ FileManagerError: If file operations fail
403
+ DecodeError: If TOC parsing fails
404
+ NetworkError: If remote content retrieval fails
405
+ ValidationError: If chapter creation fails
406
+ """
264
407
 
265
- def sync_toc(self, reload_files: bool = False) -> bool:
266
- # Hard reload will request again the toc files from the toc_main_url
267
- # Only works with toc_main_url
268
408
  all_tocs_content = self.file_manager.get_all_toc()
269
409
 
270
410
  # If there is no toc_main_url and no manually added toc, there is no way to sync toc
@@ -272,59 +412,116 @@ class Novel:
272
412
  if toc_not_exists:
273
413
  logger.critical(
274
414
  'There is no toc html and no toc url set, unable to get toc.')
275
- return False
276
-
277
- reload_files = reload_files and self.toc_main_url is not None
278
- if reload_files or not all_tocs_content:
279
- self.chapters = []
280
- self.file_manager.delete_toc()
281
- all_tocs_content = []
282
- toc_content = self._add_toc(self.toc_main_url)
283
- all_tocs_content.append(toc_content)
284
- if self.decoder.has_pagination():
285
- next_page = self.decoder.get_toc_next_page_url(toc_content)
286
- while next_page:
287
- toc_content = self._add_toc(next_page)
288
- next_page = self.decoder.get_toc_next_page_url(toc_content)
289
- all_tocs_content.append(toc_content)
415
+ raise ScraperError('There is no toc html and no toc url set, unable to get toc.')
416
+
417
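+ # NOTE(review): the expression below reloads when ANY of these holds: reload_files is True,
418
+ # all_tocs_content is None, OR a toc_main_url is present.
419
+ # The stated intent is narrower: reload only when a toc_main_url is present AND
420
+ # (reload was requested by the user OR no toc files are saved in the disk) - confirm.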
421
+ reload_files = ((reload_files or
422
+ all_tocs_content is None) or
423
+ self.toc_main_url is not None)
424
+ if reload_files:
425
+ logger.debug('Reloading TOC files.')
426
+ try:
427
+ self._request_toc_files()
428
+ except FileManagerError as e:
429
+ logger.error("Could not request TOC files. File Manager Error", exc_info=e)
430
+ raise
431
+ except DecodeError as e:
432
+ logger.error("Could not request TOC files. Decoder Error", exc_info=e)
433
+ raise
434
+ except NetworkError as e:
435
+ logger.error("Could not request TOC files. Network Error", exc_info=e)
436
+ raise
437
+
438
+ try:
439
+ self._load_or_request_chapter_urls_from_toc()
440
+ except DecodeError as e:
441
+ logger.error("Could not get chapter urls from TOC files. Decoder Error", exc_info=e)
442
+ raise
443
+ except FileManagerError as e:
444
+ logger.error("Could not get chapter urls from TOC files. File Manager Error", exc_info=e)
445
+ raise
446
+
447
+ try:
448
+ self._create_chapters_from_toc()
449
+ except ValidationError as e:
450
+ logger.error("Could not create chapters from TOC files. Validation Error", exc_info=e)
451
+ raise
452
+ logger.info('TOC synced with files, Chapters created from Table of Contents.')
453
+
454
+ def show_toc(self) -> Optional[str]:
455
+ """
456
+ Generates a human-readable representation of the Table Of Contents.
290
457
 
291
- # Now we get the links from the toc content
292
- self.chapters_url_list = []
293
- for toc_content in all_tocs_content:
294
- chapters_url_from_toc_content = self.decoder.get_chapter_urls(toc_content)
295
- if chapters_url_from_toc_content is None:
296
- logger.error('Chapters url not found on toc_content')
297
- return False
298
- # First we save a list of lists in case we need to invert the order
299
- self.chapters_url_list.append(chapters_url_from_toc_content)
300
-
301
- invert = self.decoder.is_index_inverted()
302
- self.chapters_url_list = [
303
- chapter
304
- for chapters_url in (self.chapters_url_list[::-1] if invert else self.chapters_url_list)
305
- for chapter in chapters_url
306
- ]
307
- add_host_to_chapter = self.scraper_behavior.auto_add_host or self.decoder.add_host_to_chapter()
308
- if add_host_to_chapter:
309
- self.chapters_url_list = [
310
- f'https://{self.host}{chapter_url}' for chapter_url in self.chapters_url_list]
311
- self.chapters_url_list = utils.delete_duplicates(
312
- self.chapters_url_list)
313
- self.save_novel()
314
- self._create_chapters_from_toc()
315
- return True
458
+ Returns:
459
+ Optional[str]: Formatted string showing chapter numbers and URLs, None if no chapters_urls found
460
+ """
316
461
 
317
- def show_toc(self):
318
462
  if not self.chapters_url_list:
319
- return 'No chapters in TOC, reload TOC and try again'
463
+ logger.warning('No chapters in TOC')
464
+ return None
320
465
  toc_str = 'Table Of Contents:'
321
466
  for i, chapter_url in enumerate(self.chapters_url_list):
322
- toc_str += f'\nChapter {i+1}: {chapter_url}'
467
+ toc_str += f'\nChapter {i + 1}: {chapter_url}'
323
468
  return toc_str
324
469
 
325
470
  # CHAPTERS MANAGEMENT
326
471
 
472
+ def get_chapter(self, chapter_index: Optional[int] = None, chapter_url: Optional[str] = None) -> Optional[Chapter]:
473
+ """
474
+ Retrieves a chapter either by its index in the chapter list or by its URL.
475
+
476
+ Args:
477
+ chapter_index (Optional[int]): The index of the chapter in the chapter list
478
+ chapter_url (Optional[str]): The URL of the chapter to retrieve
479
+
480
+ Returns:
481
+ Optional[Chapter]: The requested chapter if found, None otherwise
482
+
483
+ Raises:
484
+ ValidationError: If neither index nor url is provided, or if both are provided
485
+ ValueError: If the provided index is negative (an out-of-range index returns None instead of raising)
486
+ """
487
+ if not utils.check_exclusive_params(chapter_index, chapter_url):
488
+ raise ValidationError("Exactly one of 'chapter_index' or 'chapter_url' must be provided")
489
+
490
+ if chapter_url is not None:
491
+ chapter_index = self._find_chapter_index_by_url(chapter_url)
492
+
493
+ if chapter_index is not None:
494
+ if chapter_index < 0:
495
+ raise ValueError("Index must be positive")
496
+ try:
497
+ return self.chapters[chapter_index]
498
+ except IndexError:
499
+ logger.warning(f"No chapter found at index {chapter_index}")
500
+ return None
501
+ logger.warning(f"No chapter found with url {chapter_url}")
502
+ return None
503
+
327
504
  def show_chapters(self) -> str:
505
+ """
506
+ Generates a text representation of all novel chapters.
507
+
508
+ Returns:
509
+ str: Formatted string containing the list of chapters with their information:
510
+ - Chapter number
511
+ - Title (if available)
512
+ - URL
513
+ - HTML filename (if available)
514
+
515
+ Note:
516
+ Output format is:
517
+ Chapters List:
518
+ Chapter 1:
519
+ Title: [title or message]
520
+ URL: [url]
521
+ Filename: [filename or message]
522
+ ...
523
+ """
524
+
328
525
  chapter_list = "Chapters List:\n"
329
526
  for i, chapter in enumerate(self.chapters):
330
527
  chapter_list += f"Chapter {i + 1}:\n"
@@ -333,105 +530,166 @@ class Novel:
333
530
  chapter_list += f" Filename: {chapter.chapter_html_filename if chapter.chapter_html_filename else 'File not yet requested'}\n"
334
531
  return chapter_list
335
532
 
336
- def scrap_chapter(self, chapter_url: str = None, chapter_idx: int = None, update_html: bool = False) -> Chapter:
337
- logger.info('Scraping Chapter...')
338
- chapter = None
339
- if not utils.check_exclusive_params(chapter_url, chapter_idx):
340
- raise ValueError("chapter_url and chapter_id, only one needs to be set")
533
+ def scrap_chapter(self, chapter: Chapter, reload_file: bool = False) -> Chapter:
534
+ """
535
+ Processes and decodes a specific chapter of the novel.
341
536
 
342
- if chapter_url is not None:
343
- logger.debug(f'Using chapter url: {chapter_url}')
344
- chapter = self._get_chapter_by_url(chapter_url=chapter_url)
345
- if chapter is None:
346
- logger.warning(f'Chapter with url "{chapter_url}" does not exist, generating one...')
347
- chapter = Chapter(chapter_url=chapter_url)
348
-
349
- if chapter_idx is not None:
350
- logger.debug(f'Using chapter index: {chapter_idx}')
351
- if chapter_idx < 0 or chapter_idx >= len(self.chapters):
352
- logger.critical(f'Could not find chapter with idx {chapter_idx}')
353
- raise ValueError(f'Could not find chapter with idx {chapter_idx}')
354
-
355
- chapter = self.chapters[chapter_idx]
356
- if update_html:
357
- logger.debug('HTML will be updated...')
358
-
359
- chapter = self._get_chapter(chapter,
360
- reload=update_html)
361
-
362
- if not chapter.chapter_html or not chapter.chapter_html_filename:
363
- logger.critical(f'Failed to create chapter on link: "{chapter_url}" '
364
- f'on path "{chapter.chapter_html_filename}"')
365
- raise ValueError(f'Failed to create chapter on link: "{chapter_url}" '
366
- f'on path "{chapter.chapter_html_filename}"')
537
+ This method handles the complete scraping process for an individual chapter,
538
+ including HTML loading or requesting and content decoding.
539
+
540
+ Args:
541
+ chapter (Chapter): Chapter object to process
542
+ reload_file (bool, optional): If True, forces a new download of the chapter
543
+ even if it already exists locally. Defaults to False.
544
+
545
+ Returns:
546
+ Chapter: The updated Chapter object with decoded content
547
+
548
+ Raises:
549
+ ValidationError: If there are issues with the values of the provided Chapter object
550
+ DecodeError: If there are issues during content decoding
551
+ NetworkError: If there are issues during HTML request
552
+ FileManagerError: If there are issues during file operations
553
+ """
554
+
555
+ logger.debug('Scraping Chapter...')
556
+ if chapter.chapter_url is None:
557
+ logger.error('Chapter trying to be scrapped does not have a URL')
558
+ raise ValidationError('Chapter trying to be scrapped does not have a URL')
559
+
560
+ logger.debug(f'Using chapter url: {chapter.chapter_url}')
561
+
562
+ if reload_file:
563
+ logger.debug('Reload file Flag present, HTML will be requested...')
564
+
565
+ try:
566
+ chapter = self._load_or_request_chapter(chapter,
567
+ reload_file=reload_file)
568
+ except ValidationError as e:
569
+ logger.error(f'Could get chapter for URL "{chapter.chapter_url}" HTML content. Validation Error',
570
+ exc_info=e)
571
+ raise
572
+ except FileManagerError as e:
573
+ logger.error(f'Could get chapter for URL "{chapter.chapter_url}" HTML content. File Manager Error',
574
+ exc_info=e)
575
+ raise
576
+ except NetworkError as e:
577
+ logger.error(f'Could get chapter for URL "{chapter.chapter_url}" HTML content. Network Error', exc_info=e)
578
+ raise
579
+
580
+ if not chapter.chapter_html:
581
+ logger.error(f'Could not get HTML content for chapter with URL "{chapter.chapter_url}"')
582
+ raise ScraperError(f'Could not get HTML content for chapter with URL "{chapter.chapter_url}"')
367
583
 
368
584
  # We get the chapter title and content
369
585
  # We pass an index so we can autogenerate a Title
370
- chapter = self._decode_chapter(chapter=chapter, idx_for_chapter_name=chapter_idx)
371
-
372
- logger.info(f'Chapter scrapped from link: {chapter_url}')
586
+ save_title_to_content = (self.scraper_behavior.save_title_to_content or
587
+ self.decoder.save_title_to_content())
588
+ try:
589
+ chapter = self._decode_chapter(chapter=chapter,
590
+ save_title_to_content=save_title_to_content)
591
+ except DecodeError as e:
592
+ logger.error(f'Could not decode HTML title and content for chapter with URL "{chapter.chapter_url}"',
593
+ exc_info=e)
594
+ raise
595
+ except ValidationError as e:
596
+ logger.error(f'Could not decode HTML title and content for chapter with URL "{chapter.chapter_url}"',
597
+ exc_info=e)
598
+ raise
599
+
600
+ logger.info(f'Chapter scrapped from link: {chapter.chapter_url}')
373
601
  return chapter
374
602
 
375
- def scrap_all_chapters(self, sync_toc: bool = False, update_chapters: bool = False, update_html: bool = False) -> None:
376
- if sync_toc:
377
- self.sync_toc()
378
- # We scrap all chapters from our chapter list
379
- if self.chapters_url_list:
380
- for i, chapter in enumerate(len(self.chapters)):
381
-
382
- # If update_chapters is true, we scrap again the chapter info
383
- if update_chapters:
384
- chapter = self.scrap_chapter(chapter_idx=i,
385
- update_html=update_html)
386
- self._add_or_update_chapter_data(
387
- chapter=chapter, link_idx=i)
388
- continue
389
- # If not, we only update if the chapter doesn't have a title or html
390
- if chapter.chapter_html_filename and chapter.chapter_title:
391
- continue
392
- chapter = self.scrap_chapter(chapter_idx=i,
393
- update_html=update_html)
394
- self._add_or_update_chapter_data(chapter=chapter,
395
- save_in_file=True)
396
- else:
397
- logger.warning('No chapters found')
603
+ def request_all_chapters(self,
604
+ sync_toc: bool = True,
605
+ reload_files: bool = False,
606
+ clean_chapters: bool = False) -> None:
607
+ """
608
+ Requests and processes all chapters of the novel.
609
+
610
+ This method performs scraping of all available chapters in the novel,
611
+ handling the loading and decoding of each one.
612
+
613
+ Args:
614
+ sync_toc (bool, optional): If True, syncs the table of contents
615
+ reload_files (bool, optional): If True, forces a new download of all
616
+ chapters, even if they already exist locally. Defaults to False.
617
+ clean_chapters (bool, optional): If True, cleans the HTML content of the files
618
+
619
+ Raises:
620
+ FileManagerError: If there are issues during file operations
621
+ DecodeError: If there are issues during content decoding
622
+ ValidationError: If there are issues during content decoding
623
+
624
+ Note:
625
+ - Process is performed sequentially for each chapter
626
+ - Errors in individual chapters don't stop the complete process
627
+ - Progress is logged through the logging system
628
+ """
398
629
 
399
- def request_all_chapters(self, sync_toc: bool = False, update_html: bool = False, clean_chapters: bool = False) -> None:
630
+ logger.debug('Requesting all chapters...')
400
631
  if sync_toc:
401
- self.sync_toc()
402
- if self.chapters_url_list:
403
- # We request the HTML files of all the chapters
404
- for i, chapter in enumerate(self.chapters):
405
- # If the chapter exists and update_html is false, we can skip
406
- if chapter.chapter_html_filename and not update_html:
407
- continue
408
- chapter = self._get_chapter(
409
- chapter=chapter, reload=update_html)
410
- if not chapter.chapter_html_filename:
411
- logger.critical(f'Error requesting chapter {i} with url {chapter.chapter_url}')
412
- return False
413
-
414
- self._add_or_update_chapter_data(chapter=chapter, link_idx=i,
415
- save_in_file=True)
416
- if clean_chapters:
417
- self._clean_chapter(chapter.chapter_html_filename)
418
- return True
419
- else:
420
- logger.warning('No chapters found')
632
+ logger.debug('Sync TOC flag present, syncing TOC...')
633
+ try:
634
+ self.sync_toc(reload_files=False)
635
+ except ScraperError:
636
+ logger.warning('Error when trying to sync TOC, continuing without syncing...')
637
+
638
+ if len(self.chapters_url_list) == 0:
639
+ logger.warning('No chapters in TOC, returning without requesting any...')
640
+ return None
641
+
642
+ # We request the HTML files of all the chapters
643
+ # The chapter will be requested again if:
644
+ # 1. Reload files flag is True (Requested by user)
645
+ chapters_obtained = 0
646
+ total_chapters = len(self.chapters)
647
+ for i in range(len(self.chapters)):
648
+ logger.info(f'Requesting chapter {i + 1} of {total_chapters}')
649
+ try:
650
+ self.chapters[i] = self._load_or_request_chapter(chapter=self.chapters[i],
651
+ reload_file=reload_files)
652
+ except FileManagerError:
653
+ logger.warning(f'Error requesting chapter {i + 1} with url {self.chapters[i].chapter_url}, Skipping...')
654
+ continue
655
+ except ValidationError:
656
+ logger.warning(f'Error validating chapter {i + 1} with url {self.chapters[i].chapter_url}, Skipping...')
657
+ continue
658
+
659
+ if not self.chapters[i].chapter_html:
660
+ logger.warning(f'Error requesting chapter {i + 1} with url {self.chapters[i].chapter_url}')
661
+ continue
662
+
663
+ if clean_chapters:
664
+ self._clean_chapter(self.chapters[i].chapter_html_filename)
665
+ self.save_novel()
666
+ chapters_obtained += 1
667
+ logger.info(f'Successfully requested {chapters_obtained} of {total_chapters} chapters.')
668
+ return None
421
669
 
422
- # EPUB CREATION
670
+ # EPUB CREATION
423
671
 
424
672
  def save_novel_to_epub(self,
425
673
  sync_toc: bool = False,
426
674
  start_chapter: int = 1,
427
675
  end_chapter: int = None,
428
676
  chapters_by_book: int = 100) -> None:
677
+ logger.debug('Saving novel to epub...')
429
678
  if sync_toc:
430
- self.sync_toc()
679
+ logger.debug('Sync TOC flag present, syncing TOC...')
680
+ try:
681
+ self.sync_toc(reload_files=False)
682
+ except ScraperError:
683
+ logger.warning('Error when trying to sync TOC, continuing without syncing...')
684
+
685
+ if start_chapter < 1:
686
+ logger.error('Start chapter is invalid.')
687
+ raise ValidationError('Start chapter is invalid.')
431
688
 
432
689
  if start_chapter > len(self.chapters):
433
- logger.info(f'The start chapter is bigger than the number of chapters saved ({len(self.chapters)})')
434
- return
690
+ logger.error(f'The start chapter is bigger than the number of chapters saved ({len(self.chapters)})')
691
+ raise ValidationError(
692
+ f'The start chapter is bigger than the number of chapters saved ({len(self.chapters)})')
435
693
 
436
694
  if not end_chapter:
437
695
  end_chapter = len(self.chapters)
@@ -443,22 +701,19 @@ class Novel:
443
701
  idx = 1
444
702
  start = start_chapter
445
703
  while start <= end_chapter:
446
- end = min(start + chapters_by_book - 1, end_chapter)
704
+ end = min(start + chapters_by_book - 1,
705
+ end_chapter)
447
706
  result = self._save_chapters_to_epub(start_chapter=start,
448
707
  end_chapter=end,
449
708
  collection_idx=idx)
450
709
  if not result:
451
710
  logger.critical(f'Error with saving novel to epub, with start chapter: '
452
711
  f'{start_chapter} and end chapter: {end_chapter}')
453
- return False
454
712
  start = start + chapters_by_book
455
713
  idx = idx + 1
456
- return True
457
-
458
714
 
459
715
  ## UTILS
460
716
 
461
-
462
717
  def clean_files(self, clean_chapters: bool = True, clean_toc: bool = True, hard_clean: bool = False) -> None:
463
718
  hard_clean = hard_clean or self.scraper_behavior.hard_clean
464
719
  if clean_chapters:
@@ -470,8 +725,7 @@ class Novel:
470
725
  self._clean_toc(hard_clean)
471
726
 
472
727
  def show_novel_dir(self) -> str:
473
- return self.file_manager.novel_base_dir
474
-
728
+ return str(self.file_manager.novel_base_dir)
475
729
 
476
730
  ## PRIVATE HELPERS
477
731
 
@@ -492,9 +746,25 @@ class Novel:
492
746
  tocs_content = self.file_manager.get_all_toc()
493
747
  for i, toc in enumerate(tocs_content):
494
748
  toc = self.decoder.clean_html(toc, hard_clean=hard_clean)
495
- self.file_manager.update_toc(toc, i)
749
+ self.file_manager.update_toc(idx=i,
750
+ html=toc)
496
751
 
497
752
  def _request_html_content(self, url: str) -> Optional[str]:
753
+ """
754
+ Performs an HTTP request to retrieve HTML content from a URL.
755
+
756
+ Args:
757
+ url (str): The URL of the webpage to request
758
+
759
+ Returns:
760
+ Optional[str]: The HTML content of the webpage if the request is successful,
761
+ None otherwise
762
+
763
+ Note:
764
+ This method uses the decoder configuration and scraper behavior
765
+ to handle HTTP requests, including retries and timeouts.
766
+ """
767
+
498
768
  request_config = self.decoder.request_config
499
769
  force_flaresolver = request_config.get('force_flaresolver') or self.scraper_behavior.force_flaresolver
500
770
  html_content = get_html_content(url,
@@ -504,135 +774,331 @@ class Novel:
504
774
  force_flaresolver=force_flaresolver)
505
775
  return html_content
506
776
 
507
- def _get_chapter(self,
508
- chapter: Chapter,
509
- reload: bool = False) -> Chapter | None:
777
+ def _load_or_request_chapter(self,
778
+ chapter: Chapter,
779
+ reload_file: bool = False) -> Chapter:
780
+ """
781
+ Loads or requests a chapter's HTML content from a local file or a URL.
782
+
783
+ This method first attempts to load the chapter content from a local file.
784
+ If not possible or if reload is requested, it fetches the content from the web.
785
+
786
+ Args:
787
+ chapter (Chapter): Chapter object containing chapter information.
788
+ reload_file (bool, optional): If True, forces a new web request
789
+ regardless of local file existence. Defaults to False.
790
+
791
+ Returns:
792
+ Chapter: The Chapter object updated with HTML content.
793
+
794
+ Raises:
795
+ FileManagerError: If there's an error loading or saving the chapter file.
796
+ ValidationError: If there's a validation error when requesting the chapter.
797
+ NetworkError: If there's a network error when requesting the chapter.
798
+
799
+ Note:
800
+ - If the file doesn't exist locally, a web request will be made.
801
+ - If the file exists but is empty, a web request will be made.
802
+ - File saving errors are logged as warnings but don't stop execution.
803
+ """
510
804
 
511
- # Generate filename if needed
805
+ # Generate a filename if needed
512
806
  if not chapter.chapter_html_filename:
807
+ logger.debug('Generating a filename for the chapter')
513
808
  chapter.chapter_html_filename = utils.generate_file_name_from_url(
514
809
  chapter.chapter_url)
515
810
 
516
- # Try loading from cache first
517
- if not reload:
518
- chapter.chapter_html = self.file_manager.load_chapter_html(
519
- chapter.chapter_html_filename)
520
- if chapter.chapter_html:
811
+ # The HTML will be requested again if:
812
+ # 1. "Reload file" flag is True (requested by user)
813
+ # 2. Chapter file does not exist
814
+ # 3. The Chapter file does exist, but there is no content
815
+ reload_file = reload_file or not self.file_manager.chapter_file_exists(chapter.chapter_html_filename)
816
+ # Try loading from the disk first
817
+ if not reload_file:
818
+ try:
819
+ logger.debug(f'Loading chapter HTML from file: "{chapter.chapter_html_filename}"')
820
+ chapter.chapter_html = self.file_manager.load_chapter_html(chapter.chapter_html_filename)
821
+ except FileManagerError as e:
822
+ logger.error(f'Error when trying to load chapter {chapter.chapter_title} from file', exc_info=e)
823
+ raise
824
+ if chapter.chapter_html is not None:
521
825
  return chapter
522
826
 
523
827
  # Fetch fresh content
524
- chapter.chapter_html = self._request_html_content(chapter.chapter_url)
828
+ try:
829
+ logger.debug(f'Requesting chapter HTML from URL: "{chapter.chapter_url}"')
830
+ chapter.chapter_html = self._request_html_content(chapter.chapter_url)
831
+ except ValidationError:
832
+ logger.error(
833
+ f'Error when trying to request chapter {chapter.chapter_title} from url: {chapter.chapter_url}')
834
+ raise
835
+ except NetworkError:
836
+ logger.error(
837
+ f'Error when trying to request chapter {chapter.chapter_title} from url: {chapter.chapter_url}')
838
+ raise
839
+
840
+ # If the requests failed, we will let the higher methods decide if they throw an error.
525
841
  if not chapter.chapter_html:
526
842
  logger.error(f'No content found on link {chapter.chapter_url}')
527
843
  return chapter
528
844
 
529
845
  # Save content
530
- self.file_manager.save_chapter_html(
531
- chapter.chapter_html_filename, chapter.chapter_html)
846
+ try:
847
+ logger.info(f'Saving chapter HTML to file: "{chapter.chapter_html_filename}"')
848
+ self.file_manager.save_chapter_html(chapter.chapter_html_filename,
849
+ chapter.chapter_html)
850
+ except FileManagerError as e:
851
+ # We can pass this error and try again later
852
+ logger.warning(f'Error when trying to save chapter {chapter.chapter_title} to file', exc_info=e)
853
+
532
854
  return chapter
533
855
 
534
- def _add_toc(self,
535
- url: str,
536
- toc_filename: str = None,
537
- reload: bool = False):
538
- if not reload:
539
- content = self.file_manager.get_toc(toc_filename)
540
- if content:
541
- return content
856
+ def _request_toc_files(self):
857
+ """
858
+ Requests and stores all table of contents (TOC) files from the novel's website.
542
859
 
543
- if utils.check_incomplete_url(url):
544
- url = self.toc_main_url + url
860
+ This method handles both paginated and non-paginated TOCs:
861
+ - For non-paginated TOCs: Downloads and stores a single TOC file
862
+ - For paginated TOCs: Iteratively downloads all TOC pages until no next page is found
545
863
 
546
- # Fetch fresh content
547
- content = self._request_html_content(url)
548
- if not content:
549
- logger.warning(f'No content found on link {url}')
550
- sys.exit(1)
864
+ The method first clears any existing TOC files before downloading new ones.
551
865
 
552
- self.file_manager.add_toc(content)
553
- return content
866
+ Raises:
867
+ NetworkError: If there's an error during the HTTP request
868
+ ValidationError: If no content is found at the TOC URL
869
+ DecodeError: If there's an error parsing the next page URL
870
+
871
+ Note:
872
+ This is an internal method that uses the decoder configuration to determine
873
+ pagination behavior and to parse TOC content.
874
+ """
875
+
876
+ def _get_toc(toc_url: str, get_next_page: bool) -> str | None:
877
+ # Some TOCs next page links have incomplete URLS (e.g., /page/2)
878
+ if utils.check_incomplete_url(toc_url):
879
+ toc_url = self.toc_main_url + toc_url
880
+ logger.debug(f'Toc link is incomplete, trying with toc link: "{toc_url}"')
881
+
882
+ # Fetch fresh content
883
+ logger.debug(f'Requesting TOC from link: "{toc_url}"')
884
+ try:
885
+ toc_content = self._request_html_content(toc_url)
886
+ except NetworkError as E:
887
+ logger.error(f'Error with network, error: {E}')
888
+ raise
889
+
890
+ if not toc_content:
891
+ logger.error(f'No content found on link "{toc_url}"')
892
+ raise ValidationError(f'No content found on link "{toc_url}"')
893
+
894
+ logger.debug('Saving new TOC file to disk.')
895
+ self.file_manager.add_toc(toc_content)
896
+
897
+ if get_next_page:
898
+ try:
899
+ logger.debug(f'Parsing next page from link: {toc_url}')
900
+ next_page = self.decoder.get_toc_next_page_url(toc_content)
901
+ except DecodeError:
902
+ raise
903
+ return next_page
904
+ return None
554
905
 
555
- def _add_or_update_chapter_data(self, chapter: Chapter, link_idx: int = None, save_in_file: bool = True) -> None:
556
- if link_idx:
557
- chapter_idx = link_idx
906
+ self.file_manager.delete_toc()
907
+ has_pagination = self.decoder.has_pagination()
908
+
909
+ if not has_pagination:
910
+ logger.debug('TOC does not have pagination, requesting only one file.')
911
+ _get_toc(self.toc_main_url, get_next_page=False)
558
912
  else:
559
- # Check if the chapter exists
560
- chapter_idx = self._find_chapter_index_by_link(chapter.chapter_url)
561
- if chapter_idx is None:
562
- # If no existing chapter we append it
563
- self.chapters.append(chapter)
564
- chapter_idx = len(self.chapters)
565
- else:
566
- if chapter.chapter_title:
567
- self.chapters[chapter_idx].chapter_title = chapter.chapter_title
568
- if chapter.chapter_html_filename:
569
- self.chapters[chapter_idx].chapter_html_filename = chapter.chapter_html_filename
570
- if save_in_file:
571
- self.save_novel()
572
- return chapter_idx
913
+ logger.debug('TOC has pagination, requesting all files.')
914
+ next_page_url = self.toc_main_url
915
+ while next_page_url:
916
+ next_page_url = _get_toc(next_page_url, get_next_page=True)
917
+
918
+ def _load_or_request_chapter_urls_from_toc(self) -> None:
919
+ """
920
+ Extracts and processes chapter URLs from the table of contents.
921
+
922
+ Raises:
923
+ DecodeError: If fails to decode chapter URLs from TOC content
924
+ """
925
+ # Get configuration
926
+ is_inverted = self.decoder.is_index_inverted()
927
+ add_host_to_chapter = self.scraper_behavior.auto_add_host or self.decoder.add_host_to_chapter()
928
+
929
+ # Get all TOC content at once
930
+ try:
931
+ all_tocs = self.file_manager.get_all_toc()
932
+ except FileManagerError:
933
+ logger.error('Error when trying to load TOC files from disk.')
934
+ raise
935
+
936
+ # Extract URLs from all TOC fragments
937
+ self.chapters_url_list = []
938
+ for toc_content in all_tocs:
939
+ try:
940
+ urls = self.decoder.get_chapter_urls(toc_content)
941
+ self.chapters_url_list.extend(urls) # More efficient than creating intermediate lists
942
+ except DecodeError as e:
943
+ logger.error('Failed to decode chapter URLs from TOC content', exc_info=e)
944
+ raise
945
+
946
+ # Handle inversion if needed
947
+ if is_inverted:
948
+ logger.debug('Inverting chapter URLs order')
949
+ self.chapters_url_list.reverse() # In-place reversal is more efficient
950
+
951
+ # Add host if needed
952
+ if add_host_to_chapter:
953
+ logger.debug('Adding host to chapter URLs')
954
+ self.chapters_url_list = [f'https://{self.host}{url}' for url in self.chapters_url_list]
955
+
956
+ # Remove duplicates while preserving order
957
+ # self.chapters_url_list = utils.delete_duplicates(self.chapters_url_list)
958
+
959
+ logger.info(f'Successfully extracted {len(self.chapters_url_list)} unique chapter URLs')
960
+
961
+ def _create_chapters_from_toc(self):
962
+ """
963
+ Synchronizes existing chapters with the table of contents (TOC) URL list.
964
+
965
+ This method performs the following operations:
966
+ 1. Removes chapters whose URLs are no longer in the TOC
967
+ 2. Adds new chapters for URLs found in the TOC
968
+ 3. Reorders chapters according to the TOC sequence
969
+
970
+ Raises:
971
+ ValidationError: If there's an error when creating a new chapter
972
+
973
+ Note:
974
+ This is an internal method used to maintain consistency
975
+ between chapters and the table of contents.
976
+ """
573
977
 
574
- def _order_chapters_by_link_list(self) -> None:
978
+ existing_urls = {chapter.chapter_url for chapter in self.chapters}
979
+ toc_urls_set = set(self.chapters_url_list)
980
+
981
+ # Find chapters to remove and new chapters to add
982
+ urls_to_remove = existing_urls - toc_urls_set
983
+ urls_to_add = toc_urls_set - existing_urls
984
+
985
+ if urls_to_remove:
986
+ logger.info(f'Removing {len(urls_to_remove)} chapters not found in TOC')
987
+ self.chapters = [ch for ch in self.chapters if ch.chapter_url not in urls_to_remove]
988
+
989
+ if urls_to_add:
990
+ logger.info(f'Adding {len(urls_to_add)} new chapters from TOC')
991
+ for url in self.chapters_url_list:
992
+ if url in urls_to_add:
993
+ try:
994
+ new_chapter = Chapter(chapter_url=url)
995
+ self.chapters.append(new_chapter)
996
+ except ValidationError as e:
997
+ logger.error(f'Failed to create chapter for URL {url}: {e}')
998
+ raise
999
+
1000
+ # Reorder according to TOC
1001
+ logger.debug('Reordering chapters according to TOC')
575
1002
  self.chapters.sort(
576
1003
  key=lambda x: self.chapters_url_list.index(x.chapter_url))
577
1004
 
578
- def _get_chapter_by_url(self, chapter_url: str) -> Chapter:
579
- for chapter in self.chapters:
580
- if chapter_url == chapter.chapter_url:
581
- return chapter
582
- return None
1005
+ logger.info(f'Chapter synchronization complete. Total chapters: {len(self.chapters)}')
583
1006
 
584
- def _find_chapter_index_by_link(self, chapter_url: str) -> str:
585
- for index, chapter in enumerate(self.chapters):
586
- if chapter.chapter_url == chapter_url:
587
- return index
588
- return None
1007
+ def _add_or_update_chapter_data(self, chapter: Chapter, save_in_file: bool = True) -> None:
589
1008
 
590
- def _delete_chapters_not_in_toc(self) -> None:
591
- self.chapters = [
592
- chapter for chapter in self.chapters if chapter.chapter_url in self.chapters_url_list]
1009
+ # Check if the chapter exists
1010
+ chapter_idx = self._find_chapter_index_by_url(chapter.chapter_url)
1011
+ if chapter_idx is None:
1012
+ # If no existing chapter, we append it
1013
+ self.chapters.append(chapter)
1014
+ else:
1015
+ if chapter.chapter_title:
1016
+ self.chapters[chapter_idx].chapter_title = chapter.chapter_title
1017
+ if chapter.chapter_html_filename:
1018
+ self.chapters[chapter_idx].chapter_html_filename = chapter.chapter_html_filename
593
1019
 
594
- def _create_chapters_from_toc(self):
595
- self._delete_chapters_not_in_toc()
596
- increment = 100
597
- aux = 1
598
- for chapter_url in self.chapters_url_list:
599
- aux += 1
600
- chapter_idx = self._find_chapter_index_by_link(chapter_url)
601
- if not chapter_idx:
602
- chapter = Chapter(chapter_url=chapter_url)
603
- self._add_or_update_chapter_data(
604
- chapter=chapter, save_in_file=False)
605
- if aux == increment:
606
- self.save_novel()
607
- aux = 1
608
- self._order_chapters_by_link_list()
609
- self.save_novel()
1020
+ if save_in_file:
1021
+ self.save_novel()
610
1022
 
611
- def _decode_chapter(self, chapter: Chapter, idx_for_chapter_name: str = None) -> Chapter:
612
- logger.debug('Decoding chapter...')
613
- if chapter.chapter_html is None:
614
- logger.debug(f'No HTML content found, requesting HTML content...')
615
- chapter = self._get_chapter(chapter)
1023
+ def _find_chapter_index_by_url(self, chapter_url: str) -> Optional[int]:
1024
+ """
1025
+ Find the chapter index by its URL in the chapter list.
1026
+
1027
+ Args:
1028
+ chapter_url: URL of the chapter to find
1029
+
1030
+ Returns:
1031
+ Optional[int]: Index of the chapter if found, None otherwise
1032
+
1033
+ Note:
1034
+ Uses next() for efficient iteration - stops as soon as a match is found
1035
+ """
1036
+ try:
1037
+ return next(i for i, ch in enumerate(self.chapters)
1038
+ if ch.chapter_url == chapter_url)
1039
+ except StopIteration:
1040
+ return None
1041
+
1042
+ def _decode_chapter(self,
1043
+ chapter: Chapter,
1044
+ save_title_to_content: bool = False) -> Chapter:
1045
+ """
1046
+ Decodes a chapter's HTML content to extract title and content.
1047
+
1048
+ This method processes the HTML content of a chapter to extract its title and content.
1049
+ If no title is found, it auto-generates one using the chapter's index in the URL list.
1050
+
1051
+ Args:
1052
+ chapter (Chapter): Chapter object containing the HTML content to decode.
1053
+ save_title_to_content (bool, optional): Whether to include the title in the
1054
+ chapter content. Defaults to False.
1055
+
1056
+ Returns:
1057
+ Chapter: The updated Chapter object with decoded title and content.
616
1058
 
617
- if not chapter.chapter_html:
618
- raise ValueError(f'Chapter HTML could not be obtained for chapter link "{chapter.chapter_url}" '
619
- f'on file "{chapter.chapter_html_filename}"')
1059
+ Raises:
1060
+ ScraperError: If the chapter's HTML content is None.
1061
+ DecodeError: If there's an error decoding the chapter's title or content.
1062
+
1063
+ Note:
1064
+ - If no title is found, it will be auto-generated as "{novel_title} Chapter {index}".
1065
+ - The chapter's HTML must be loaded before calling this method.
1066
+ """
1067
+
1068
+ logger.debug(f'Decoding chapter with URL {chapter.chapter_url}...')
1069
+ if chapter.chapter_html is None:
1070
+ logger.error(f'Chapter HTML not found for chapter with URL "{chapter.chapter_url}"')
1071
+ raise ScraperError(f'Chapter HTML not found for chapter with URL "{chapter.chapter_url}"')
620
1072
 
621
1073
  logger.debug('Obtaining chapter title...')
622
- chapter_title = self.decoder.get_chapter_title(chapter.chapter_html)
623
- if not chapter_title:
624
- logger.debug('No chapter title found, generating one...')
625
- chapter_title = f'{self.title} Chapter {idx_for_chapter_name}'
626
- chapter.chapter_title = str(chapter_title)
627
- logger.debug(f'Chapter title: "{chapter_title}"')
1074
+ try:
1075
+ chapter_title = self.decoder.get_chapter_title(chapter.chapter_html)
1076
+ except DecodeError as e:
1077
+ logger.error(f'Failed to decode chapter title from HTML content: {e}')
1078
+ raise
628
1079
 
629
- logger.debug('Obtaining chapter content...')
630
- save_title_to_content = self.scraper_behavior.save_title_to_content or self.decoder.save_title_to_content()
631
- chapter.chapter_content = self.decoder.get_chapter_content(chapter.chapter_html,
632
- save_title_to_content,
633
- chapter.chapter_title)
634
- logger.debug('Chapter successfully decoded')
1080
+ if chapter_title is None:
1081
+ logger.debug('No chapter title found, trying to autogenerate one...')
1082
+ try:
1083
+ chapter_idx = self.chapters_url_list.index(chapter.chapter_url)
1084
+ except ValueError:
1085
+ chapter_idx = ""
1086
+
1087
+ chapter_title = f'{self.title} Chapter {chapter_idx}'
635
1088
 
1089
+ chapter.chapter_title = chapter_title
1090
+ logger.info(f'Chapter title: "{chapter_title}"')
1091
+
1092
+ logger.debug('Obtaining chapter content...')
1093
+ try:
1094
+ chapter.chapter_content = self.decoder.get_chapter_content(chapter.chapter_html,
1095
+ save_title_to_content,
1096
+ chapter.chapter_title)
1097
+ except DecodeError:
1098
+ logger.error(f'Failed to decode chapter content for chapter with URL "{chapter.chapter_url}"')
1099
+ raise
1100
+
1101
+ logger.debug('Chapter title and content successfully decoded from HTML')
636
1102
  return chapter
637
1103
 
638
1104
  def _create_epub_book(self, book_title: str = None, calibre_collection: dict = None) -> epub.EpubBook:
@@ -661,7 +1127,7 @@ class Novel:
661
1127
  # date_metadata += f'/{self.metadata.end_date}'
662
1128
  if self.metadata.end_date:
663
1129
  book.add_metadata('OPF', 'meta', self.metadata.end_date, {
664
- 'name': 'end_date', 'content': self.metadata.end_date})
1130
+ 'name': 'end_date', 'content': self.metadata.end_date})
665
1131
  if date_metadata:
666
1132
  logger.debug(f'Using date_metadata {date_metadata}')
667
1133
  book.add_metadata('DC', 'date', date_metadata)
@@ -669,9 +1135,9 @@ class Novel:
669
1135
  # Collections with calibre
670
1136
  if calibre_collection:
671
1137
  book.add_metadata('OPF', 'meta', '', {
672
- 'name': 'calibre:series', 'content': calibre_collection["title"]})
1138
+ 'name': 'calibre:series', 'content': calibre_collection["title"]})
673
1139
  book.add_metadata('OPF', 'meta', '', {
674
- 'name': 'calibre:series_index', 'content': calibre_collection["idx"]})
1140
+ 'name': 'calibre:series_index', 'content': calibre_collection["idx"]})
675
1141
 
676
1142
  cover_image_content = self.file_manager.load_novel_cover()
677
1143
  if cover_image_content:
@@ -682,11 +1148,10 @@ class Novel:
682
1148
  return book
683
1149
 
684
1150
  def _add_chapter_to_epub_book(self, chapter: Chapter, book: epub.EpubBook):
685
- chapter = self.scrap_chapter(
686
- chapter_url=chapter.chapter_url)
1151
+ chapter = self.scrap_chapter(chapter)
687
1152
  if chapter is None:
688
1153
  logger.warning('Error reading chapter')
689
- return
1154
+ return None
690
1155
  self._add_or_update_chapter_data(
691
1156
  chapter=chapter, save_in_file=False)
692
1157
  file_name = utils.generate_epub_file_name_from_title(
@@ -708,10 +1173,9 @@ class Novel:
708
1173
  start_chapter: int,
709
1174
  end_chapter: int = None,
710
1175
  collection_idx: int = None):
711
-
712
1176
  if start_chapter > len(self.chapters):
713
1177
  logger.error('start_chapter out of range')
714
- return
1178
+ return None
715
1179
  # If end_chapter is not set, we set it to idx_start + chapters_num - 1
716
1180
  if not end_chapter:
717
1181
  end_chapter = len(self.chapters)
@@ -725,7 +1189,7 @@ class Novel:
725
1189
  # We create the epub book
726
1190
  book_title = f'{self.title} Chapters {start_chapter} - {end_chapter}'
727
1191
  calibre_collection = None
728
- # If collection_idx is set, we create a calibre collection
1192
+ # If collection_idx is set, we create a Calibre collection
729
1193
  if collection_idx:
730
1194
  calibre_collection = {'title': self.title,
731
1195
  'idx': str(collection_idx)}
@@ -735,11 +1199,16 @@ class Novel:
735
1199
  book = self._add_chapter_to_epub_book(chapter=chapter,
736
1200
  book=book)
737
1201
  if book is None:
738
- logger.critical(f'Error saving epub {book_title}, could not decode chapter {chapter} using host {self.host}')
1202
+ logger.critical(
1203
+ f'Error saving epub {book_title}, could not decode chapter {chapter} using host {self.host}')
739
1204
  return False
740
1205
 
741
1206
  book.add_item(epub.EpubNcx())
742
1207
  book.add_item(epub.EpubNav())
743
- self.file_manager.save_book(book, f'{book_title}.epub')
1208
+ try:
1209
+ self.file_manager.save_book(book, f'{book_title}.epub')
1210
+ except FileManagerError:
1211
+ logger.error(f'Error saving epub {book_title}')
1212
+ raise
744
1213
  self.save_novel()
745
1214
  return True