web-novel-scraper 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,723 @@
1
+ from dataclasses import dataclass, fields, field
2
+ import sys
3
+
4
+ from dataclasses_json import dataclass_json, config, Undefined
5
+ from ebooklib import epub
6
+ from typing import Optional
7
+
8
+ from . import logger_manager
9
+ from .decode import Decoder
10
+ from .file_manager import FileManager
11
+ from . import utils
12
+
13
+ from . import request_manager
14
+
15
# Module-level logger used by every class and method in this file.
logger = logger_manager.create_logger('NOVEL SCRAPPING')
16
+
17
+
18
@dataclass_json
@dataclass
class Metadata:
    """Descriptive information about a novel.

    Only ``novel_title`` is required; every other field has a default.
    """

    novel_title: str
    author: Optional[str] = None
    start_date: Optional[str] = None
    end_date: Optional[str] = None
    language: Optional[str] = "en"
    description: Optional[str] = None
    tags: list[str] = field(default_factory=list)

    def update_behavior(self, **kwargs):
        """
        Updates the metadata fields dynamically.

        Only keyword arguments that name a declared dataclass field and whose
        value is not ``None`` are applied. Anything else is ignored, so a
        stray key can never overwrite a method or a non-field attribute.
        """
        field_names = {item.name for item in fields(self)}
        for key, value in kwargs.items():
            if key in field_names and value is not None:
                setattr(self, key, value)

    def __str__(self):
        """
        Dynamic string representation of the metadata, one field per line.
        """
        # The join is done outside the f-string: a backslash or a line break
        # inside a replacement field is a SyntaxError before Python 3.12.
        attributes = "\n".join(
            f"{item.name}={getattr(self, item.name)}" for item in fields(self))
        return f"Metadata: \n{attributes}"
45
+
46
+
47
@dataclass_json
@dataclass
class ScraperBehavior:
    """Boolean switches that tune how a novel is scraped and cleaned."""

    # Some novels already have the title in the content.
    save_title_to_content: bool = False
    # Some novels have the toc link without the host
    auto_add_host: bool = False
    # Some hosts return 403 when scrapping, this will force the use of FlareSolver
    # to save time
    force_flaresolver: bool = False
    # When you clean the html files, you can use hard clean by default
    hard_clean: bool = False

    def update_behavior(self, **kwargs):
        """
        Updates the behavior configuration dynamically.

        Only keyword arguments that name a declared dataclass field and whose
        value is not ``None`` are applied. Anything else is ignored, so a
        stray key can never overwrite a method or a non-field attribute.
        """
        field_names = {item.name for item in fields(self)}
        for key, value in kwargs.items():
            if key in field_names and value is not None:
                setattr(self, key, value)

    def __str__(self):
        """
        Dynamic string representation of the configuration, one field per line.
        """
        # The join is done outside the f-string: a backslash or a line break
        # inside a replacement field is a SyntaxError before Python 3.12.
        attributes = "\n".join(
            f"{item.name}={getattr(self, item.name)}" for item in fields(self))
        return f"Scraper Behavior: \n{attributes}"
76
+
77
+
78
@dataclass_json(undefined=Undefined.EXCLUDE)
@dataclass
class Chapter:
    """A single chapter of a novel.

    Only the three annotated attributes below are dataclass fields.
    ``chapter_html`` and ``chapter_content`` are plain instance attributes
    set in ``__init__`` and are not declared as fields.
    """

    chapter_url: str
    chapter_html_filename: Optional[str] = None
    chapter_title: Optional[str] = None

    def __init__(self,
                 chapter_url: str,
                 chapter_html: Optional[str] = None,
                 chapter_content: Optional[str] = None,
                 chapter_html_filename: Optional[str] = None,
                 chapter_title: Optional[str] = None):
        """Store the chapter url plus whatever data is already known."""
        self.chapter_url = chapter_url
        self.chapter_html = chapter_html
        self.chapter_content = chapter_content
        self.chapter_html_filename = chapter_html_filename
        self.chapter_title = chapter_title

    def __str__(self):
        """Return the title, url and filename, one per line."""
        return (f'Title: "{self.chapter_title}"\n'
                f'URL: "{self.chapter_url}"\n'
                f'Filename: "{self.chapter_html_filename}"')

    def __lt__(self, another):
        """Order chapters by title.

        A missing title (``None``) is compared as an empty string, so
        chapters that have not been scrapped yet sort first instead of
        raising ``TypeError``.
        """
        try:
            other_title = another.chapter_title
        except AttributeError:
            return NotImplemented
        return (self.chapter_title or "") < (other_title or "")
102
+
103
+
104
@dataclass_json
@dataclass
class Novel:
    """A web novel: metadata, table of contents, chapters and EPUB export.

    The annotated attributes below are the dataclass fields. ``file_manager``
    and ``decoder`` are runtime helpers created in ``__init__`` and are not
    fields.
    """

    metadata: Metadata
    scraper_behavior: ScraperBehavior = None
    chapters: list[Chapter] = field(default_factory=list)
    toc_main_url: Optional[str] = None
    chapters_url_list: list[str] = field(default_factory=list)
    host: str = None

    def __init__(self,
                 novel_title: str = None,
                 toc_main_url: str = None,
                 toc_html: str = None,
                 chapters_url_list: list[str] = None,
                 metadata: Metadata = None,
                 chapters: list[Chapter] = None,
                 novel_base_dir: str = None,
                 scraper_behavior: ScraperBehavior = None,
                 host: str = None):
        """Create the novel, its file manager and decoder, then save it.

        Exits the process with status 1 when the arguments are inconsistent:
        both ``toc_main_url`` and ``toc_html`` given, neither ``novel_title``
        nor ``metadata`` given, or neither ``host`` nor ``toc_main_url`` given.
        """
        # All validation runs before the FileManager is created, so invalid
        # arguments never reach the file manager.
        if toc_main_url and toc_html:
            logger.error('There can only be one or toc_main_url or toc_html')
            sys.exit(1)

        if metadata is None and novel_title is None:
            logger.error('You need to set "novel_title" or "metadata".')
            sys.exit(1)

        if not host and not toc_main_url:
            logger.error('You need to set "host" or "toc_main_url".')
            sys.exit(1)

        self.metadata = metadata if metadata is not None else Metadata(novel_title)

        self.file_manager = FileManager(novel_title=self.metadata.novel_title,
                                        novel_base_dir=novel_base_dir)
        if toc_html:
            self.file_manager.add_toc(toc_html)

        self.toc_main_url = toc_main_url
        self.chapters_url_list = chapters_url_list if chapters_url_list else []
        self.chapters = chapters if chapters else []
        self.scraper_behavior = scraper_behavior if scraper_behavior else ScraperBehavior()

        self.host = host if host else utils.obtain_host(self.toc_main_url)
        self.decoder = Decoder(self.host)

        self.save_novel()

    def __str__(self):
        """
        Dynamic string representation of the novel.
        """
        toc_info = self.toc_main_url if self.toc_main_url else "TOC added manually"
        tags = ', '.join(self.metadata.tags)
        attributes = [
            f"Title: {self.metadata.novel_title}",
            f"Author: {self.metadata.author}",
            f"Language: {self.metadata.language}",
            f"Description: {self.metadata.description}",
            f"Tags: {tags}",
            f"TOC Info: {toc_info}",
            f"Host: {self.host}",
        ]
        # Joined outside the f-string so the module also parses before 3.12.
        body = "\n".join(attributes)
        return f"Novel Info: \n{body}"

    # NOVEL PARAMETERS MANAGEMENT

    def set_scraper_behavior(self, **kwargs) -> None:
        """Update the scraper behavior with ``kwargs`` and save the novel."""
        self.scraper_behavior.update_behavior(**kwargs)
        self.save_novel()

    def set_metadata(self, **kwargs) -> None:
        """Update the metadata with ``kwargs`` and save the novel."""
        self.metadata.update_behavior(**kwargs)
        self.save_novel()

    def add_tag(self, tag: str) -> bool:
        """Add ``tag`` and save; return False (with a warning) if it exists."""
        if tag in self.metadata.tags:
            logger.warning(
                f'Tag "{tag}" already exists on novel {self.metadata.novel_title}')
            return False
        self.metadata.tags.append(tag)
        self.save_novel()
        return True

    def remove_tag(self, tag: str) -> bool:
        """Remove ``tag`` and save; return False (with a warning) if absent."""
        if tag not in self.metadata.tags:
            logger.warning(
                f'Tag "{tag}" doesn\'t exist on novel {self.metadata.novel_title}')
            return False
        self.metadata.tags.remove(tag)
        self.save_novel()
        return True

    def set_cover_image(self, cover_image_path: str) -> bool:
        """Delegate to the file manager's ``save_novel_cover``."""
        return self.file_manager.save_novel_cover(cover_image_path)

    def set_host(self, host: str) -> None:
        """Change the host, rebuild the decoder for it and save the novel."""
        self.host = host
        self.decoder = Decoder(self.host)
        self.save_novel()

    def save_novel(self) -> None:
        """Hand the novel's ``to_dict()`` output to the file manager."""
        self.file_manager.save_novel_json(self.to_dict())

    # TABLE OF CONTENTS MANAGEMENT

    def set_toc_main_url(self, toc_main_url: str, host: str = None, update_host: bool = False) -> None:
        """Set a new TOC url and delete the stored TOC files.

        ``host`` (when given) becomes the new host. Otherwise, when
        ``update_host`` is true, the host is derived from ``toc_main_url``.
        ``self.host`` and ``self.decoder`` are always updated together, and
        the change is saved.
        """
        self.toc_main_url = toc_main_url
        self.file_manager.delete_toc()
        if not host and update_host:
            host = utils.obtain_host(self.toc_main_url)
        if host:
            self.host = host
            self.decoder = Decoder(self.host)
        self.save_novel()

    def add_toc_html(self, html: str, host: str = None) -> None:
        """Add a TOC from raw html, optionally changing the host, and save."""
        if self.toc_main_url:
            # Delete toc_main_url since they are exclusive
            self.delete_toc()
            self.toc_main_url = None

        if host:
            self.host = host
            self.decoder = Decoder(self.host)
        self.file_manager.add_toc(html)
        self.save_novel()

    def delete_toc(self):
        """Delete the stored TOC, clear chapters and chapter urls, and save."""
        self.file_manager.delete_toc()
        self.chapters = []
        self.chapters_url_list = []
        self.save_novel()

    def sync_toc(self, reload_files: bool = False) -> bool:
        """Rebuild ``chapters_url_list`` and ``chapters`` from the TOC.

        ``reload_files`` requests the TOC again from ``toc_main_url``; it is
        ignored when no url is set. Returns False when there is no TOC source
        at all or when no chapter urls can be decoded from a TOC page. In the
        second case ``chapters_url_list`` keeps its previous value.
        """
        all_tocs_content = self.file_manager.get_all_toc()

        # If there is no toc_main_url and no manually added toc, there is no way to sync toc
        if not all_tocs_content and self.toc_main_url is None:
            logger.critical(
                'There is no toc html and no toc url setted, unable to get toc.')
            return False

        reload_files = reload_files and self.toc_main_url is not None
        if reload_files or not all_tocs_content:
            self.chapters = []
            self.file_manager.delete_toc()
            all_tocs_content = self._request_toc_pages()

        # Now we get the links from the toc content. The list is built locally
        # so a failure does not leave a half-filled chapters_url_list behind.
        collected_urls = []
        for toc_content in all_tocs_content:
            page_urls = self._get_chapter_urls_from_toc_content(toc_content)
            if page_urls is None:
                logger.error('Chapters url not found on toc_content')
                return False
            collected_urls.extend(page_urls)

        if self.scraper_behavior.auto_add_host:
            collected_urls = [
                f'https://{self.host}{chapter_url}' for chapter_url in collected_urls]
        self.chapters_url_list = utils.delete_duplicates(collected_urls)
        self.save_novel()
        self._create_chapters_from_toc()
        return True

    def _request_toc_pages(self) -> list:
        """Request the TOC from ``toc_main_url``, following pagination."""
        # reload=True: the stored TOC files were just deleted by the caller,
        # so the lookup in the file manager is skipped.
        toc_content = self._add_toc(self.toc_main_url, reload=True)
        pages = [toc_content]
        if not self.decoder.has_pagination():
            return pages

        # Remember visited urls so a "next page" link that points back to an
        # earlier page cannot loop forever.
        visited = {self.toc_main_url}
        next_page = self._get_next_page_from_toc_content(toc_content)
        while next_page and next_page not in visited:
            visited.add(next_page)
            toc_content = self._add_toc(next_page, reload=True)
            pages.append(toc_content)
            next_page = self._get_next_page_from_toc_content(toc_content)
        return pages

    def show_toc(self):
        """Return the chapter urls as a numbered, human readable list."""
        if not self.chapters_url_list:
            return 'No chapters in TOC, reload TOC and try again'
        lines = ['Table Of Contents:']
        lines.extend(f'Chapter {i + 1}: {chapter_url}'
                     for i, chapter_url in enumerate(self.chapters_url_list))
        return '\n'.join(lines)

    # CHAPTERS MANAGEMENT

    def show_chapters(self) -> str:
        """Return title, url and filename of every chapter as text."""
        pieces = ["Chapters List:\n"]
        for i, chapter in enumerate(self.chapters):
            title = chapter.chapter_title or 'Title not yet scrapped'
            filename = chapter.chapter_html_filename or 'File not yet requested'
            pieces.append(f"Chapter {i + 1}:\n")
            pieces.append(f" Title: {title}\n")
            pieces.append(f" URL: {chapter.chapter_url}\n")
            pieces.append(f" Filename: {filename}\n")
        return "".join(pieces)

    def scrap_chapter(self, chapter_url: str = None, chapter_idx: int = None, update_html: bool = False) -> Optional[Chapter]:
        """Get and decode one chapter, selected by url or by index.

        Returns the decoded chapter, or ``None`` when the arguments are
        invalid, the html cannot be obtained, or no content is decoded.
        """
        if not utils.check_exclusive_params(chapter_url, chapter_idx):
            logger.error(
                'chapter_url and chapter_id, only one needs to be setted')
            return None

        chapter = None
        if chapter_url is not None:
            chapter = self._get_chapter_by_url(chapter_url=chapter_url)
            if chapter is None:
                chapter = Chapter(chapter_url=chapter_url)

        if chapter_idx is not None:
            if chapter_idx < 0 or chapter_idx >= len(self.chapters):
                logger.error(f'Could not find chapter with idx {chapter_idx}')
                return None
            chapter = self.chapters[chapter_idx]

        if chapter is None:
            # Neither selector was usable; avoid using an unbound chapter.
            logger.error(
                'chapter_url and chapter_id, only one needs to be setted')
            return None

        chapter = self._get_chapter(chapter, reload=update_html)

        if not chapter.chapter_html or not chapter.chapter_html_filename:
            # Log the chapter's own url: the chapter_url argument is None
            # when the chapter was selected by index.
            logger.warning(
                f'Failed to create chapter on link: "{chapter.chapter_url}" '
                f'on path "{chapter.chapter_html_filename}"')
            return None

        # We get the title and content, if there's no title, we autogenerate one.
        decoded = self._decode_chapter(chapter=chapter,
                                       idx_for_chapter_name=chapter_idx)
        if decoded is None or not decoded.chapter_content:
            logger.error('Content not found')
            return None

        logger.info(f'Chapter scrapped from link: {decoded.chapter_url}')
        return decoded

    def scrap_all_chapters(self, sync_toc: bool = False, update_chapters: bool = False, update_html: bool = False) -> None:
        """Scrap every chapter that still lacks a title or an html file.

        With ``update_chapters`` every chapter is scrapped again. Chapters
        that fail to scrap are logged and skipped.
        """
        if sync_toc:
            self.sync_toc()
        if not self.chapters_url_list:
            logger.warning('No chapters found')
            return

        # We scrap all chapters from our chapter list
        for i, chapter in enumerate(self.chapters):
            complete = chapter.chapter_html_filename and chapter.chapter_title
            # If update_chapters is false we only scrap incomplete chapters
            if complete and not update_chapters:
                continue
            scrapped = self.scrap_chapter(chapter_idx=i,
                                          update_html=update_html)
            if scrapped is None:
                logger.warning(
                    f'Could not scrap chapter {i} with url {chapter.chapter_url}')
                continue
            self._add_or_update_chapter_data(chapter=scrapped, link_idx=i,
                                             save_in_file=True)

    def request_all_chapters(self, sync_toc: bool = False, update_html: bool = False, clean_chapters: bool = False) -> bool:
        """Request the html of every chapter that has no file yet.

        With ``update_html`` every chapter is requested again. Returns False
        when there are no chapters or as soon as one request fails, True
        otherwise.
        """
        if sync_toc:
            self.sync_toc()
        if not self.chapters_url_list:
            logger.warning('No chapters found')
            return False

        # We request the HTML files of all the chapters
        for i, chapter in enumerate(self.chapters):
            # If the chapter exists and update_html is false, we can skip
            if chapter.chapter_html_filename and not update_html:
                continue
            chapter = self._get_chapter(chapter=chapter, reload=update_html)
            # _get_chapter always fills in the filename, so a failed request
            # is only visible through the missing html.
            if not chapter.chapter_html or not chapter.chapter_html_filename:
                logger.critical(
                    f'Error requesting chapter {i} with url {chapter.chapter_url}')
                return False

            self._add_or_update_chapter_data(chapter=chapter, link_idx=i,
                                             save_in_file=True)
            if clean_chapters:
                self._clean_chapter(chapter.chapter_html_filename)
        return True

    # EPUB CREATION

    def save_novel_to_epub(self,
                           sync_toc: bool = False,
                           start_chapter: int = 1,
                           end_chapter: int = None,
                           chapters_by_book: int = 100) -> bool:
        """Save chapters ``start_chapter``..``end_chapter`` (1-based) as epubs.

        The range is split into books of at most ``chapters_by_book``
        chapters. Returns True when every book was saved, False otherwise.
        """
        if sync_toc:
            self.sync_toc()

        total_chapters = len(self.chapters)
        if chapters_by_book < 1:
            # A step of zero or less would never advance the loop below.
            logger.error('chapters_by_book must be at least 1')
            return False
        if start_chapter < 1:
            # Chapter numbers are 1-based; 0 or less would give a negative slice.
            logger.error('start_chapter must be at least 1')
            return False
        if start_chapter > total_chapters:
            logger.info(
                'The start chapter is bigger than the number of chapters saved '
                f'({total_chapters})')
            return False

        if not end_chapter:
            end_chapter = total_chapters
        elif end_chapter > total_chapters:
            end_chapter = total_chapters
            logger.info(
                'The end chapter is bigger than the number of chapters, '
                f'automatically setting it to {end_chapter}.')

        idx = 1
        start = start_chapter
        while start <= end_chapter:
            end = min(start + chapters_by_book - 1, end_chapter)
            result = self._save_chapters_to_epub(start_chapter=start,
                                                 end_chapter=end,
                                                 collection_idx=idx)
            if not result:
                # Report the book that failed, not the overall range.
                logger.critical(
                    'Error with saving novel to epub, with start chapter: '
                    f'{start} and end chapter: {end}')
                return False
            start = start + chapters_by_book
            idx = idx + 1
        return True

    # UTILS

    def clean_files(self, clean_chapters: bool = True, clean_toc: bool = True, hard_clean: bool = False) -> None:
        """Clean the stored chapter html files and/or the stored TOC files."""
        hard_clean = hard_clean or self.scraper_behavior.hard_clean
        if clean_chapters:
            for chapter in self.chapters:
                if chapter.chapter_html_filename:
                    self._clean_chapter(
                        chapter.chapter_html_filename, hard_clean)
        if clean_toc:
            self._clean_toc(hard_clean)

    def show_novel_dir(self) -> str:
        """Return the file manager's ``novel_base_dir``."""
        return self.file_manager.novel_base_dir

    def _clean_chapter(self, chapter_html_filename: str, hard_clean: bool = False) -> None:
        """Load a chapter file, clean its html with the decoder and save it."""
        hard_clean = hard_clean or self.scraper_behavior.hard_clean
        chapter_html = self.file_manager.load_chapter_html(
            chapter_html_filename)
        if not chapter_html:
            logger.warning(f'No content found on file {chapter_html_filename}')
            return
        chapter_html = self.decoder.clean_html(
            chapter_html, hard_clean=hard_clean)
        self.file_manager.save_chapter_html(
            chapter_html_filename, chapter_html)

    def _clean_toc(self, hard_clean: bool = False) -> None:
        """Clean every stored TOC page with the decoder and update it."""
        hard_clean = hard_clean or self.scraper_behavior.hard_clean
        tocs_content = self.file_manager.get_all_toc()
        for i, toc in enumerate(tocs_content):
            cleaned = self.decoder.clean_html(toc, hard_clean=hard_clean)
            self.file_manager.update_toc(cleaned, i)

    def _get_chapter(self,
                     chapter: Chapter,
                     reload: bool = False) -> Chapter | None:
        """Fill ``chapter.chapter_html`` from the stored file or the network.

        The chapter is always returned; callers detect a failed request by an
        empty ``chapter_html``. The filename is generated from the url when
        the chapter has none.
        """
        # Generate filename if needed
        if not chapter.chapter_html_filename:
            chapter.chapter_html_filename = utils.generate_file_name_from_url(
                chapter.chapter_url)

        # Try loading from cache first
        if not reload:
            chapter.chapter_html = self.file_manager.load_chapter_html(
                chapter.chapter_html_filename)
            if chapter.chapter_html:
                return chapter

        # Fetch fresh content
        chapter.chapter_html = request_manager.get_html_content(
            chapter.chapter_url,
            force_flaresolver=self.scraper_behavior.force_flaresolver)
        if not chapter.chapter_html:
            logger.error(f'No content found on link {chapter.chapter_url}')
            return chapter

        # Save content
        self.file_manager.save_chapter_html(
            chapter.chapter_html_filename, chapter.chapter_html)
        return chapter

    def _add_toc(self,
                 url: str,
                 toc_filename: str = None,
                 reload: bool = False):
        """Return TOC html for ``url``, requesting and storing it if needed.

        Unless ``reload`` is true, content the file manager returns for
        ``toc_filename`` is used first. Exits the process with status 1 when
        the request returns no content.
        """
        if not reload:
            content = self.file_manager.get_toc(toc_filename)
            if content:
                return content

        content = request_manager.get_html_content(url)
        if not content:
            logger.warning(f'No content found on link {url}')
            sys.exit(1)

        self.file_manager.add_toc(content)
        return content

    def _get_chapter_urls_from_toc_content(self, toc_content: str) -> Optional[list[str]]:
        """Decode the chapter links of a TOC page.

        Returns the list of ``href`` values, or ``None`` when nothing is
        decoded or an element has no ``href``.
        """
        toc_elements = self.decoder.decode_html(toc_content, 'index')
        if not toc_elements:
            logger.warning('No chapter links found on toc content')
            return None
        try:
            toc_urls = [toc_element['href'] for toc_element in toc_elements]
        except KeyError as e:
            logger.error(f'{e} not found on the Tag elements decoded from TOC')
            return None
        if toc_urls:
            return toc_urls
        logger.warning('No chapter links found on toc content')
        return None

    def _get_next_page_from_toc_content(self, toc_content: str) -> Optional[str]:
        """Return the ``href`` of the next TOC page, or ``None`` if none."""
        next_page = self.decoder.decode_html(toc_content, 'next_page')
        if not next_page:
            return None
        try:
            return next_page[0]['href']
        except KeyError:
            logger.warning('href not found on the next page element of the TOC')
            return None

    def _add_or_update_chapter_data(self, chapter: Chapter, link_idx: int = None, save_in_file: bool = True) -> int:
        """Merge ``chapter`` into ``self.chapters`` and return its index.

        The target is ``link_idx`` when it is a valid index (0 included),
        otherwise the chapter with the same url. An unknown chapter is
        appended. On an existing chapter only a non-empty title and filename
        are copied over.
        """
        if link_idx is not None and 0 <= link_idx < len(self.chapters):
            chapter_idx = link_idx
        else:
            # Check if the chapter exists
            chapter_idx = self._find_chapter_index_by_link(chapter.chapter_url)

        if chapter_idx is None:
            # If no existing chapter we append it
            self.chapters.append(chapter)
            chapter_idx = len(self.chapters) - 1
        else:
            stored = self.chapters[chapter_idx]
            if chapter.chapter_title:
                stored.chapter_title = chapter.chapter_title
            if chapter.chapter_html_filename:
                stored.chapter_html_filename = chapter.chapter_html_filename

        if save_in_file:
            self.save_novel()
        return chapter_idx

    def _order_chapters_by_link_list(self) -> None:
        """Sort ``self.chapters`` in the order of ``chapters_url_list``.

        Chapters whose url is not in the list are placed at the end.
        """
        # First position of every url, built once instead of calling
        # list.index() for each chapter.
        position = {}
        for idx, chapter_url in enumerate(self.chapters_url_list):
            position.setdefault(chapter_url, idx)
        last = len(self.chapters_url_list)
        self.chapters.sort(key=lambda x: position.get(x.chapter_url, last))

    def _get_chapter_by_url(self, chapter_url: str) -> Optional[Chapter]:
        """Return the first chapter with ``chapter_url``, or ``None``."""
        for chapter in self.chapters:
            if chapter_url == chapter.chapter_url:
                return chapter
        return None

    def _find_chapter_index_by_link(self, chapter_url: str) -> Optional[int]:
        """Return the index of the first chapter with ``chapter_url``, or ``None``."""
        for index, chapter in enumerate(self.chapters):
            if chapter.chapter_url == chapter_url:
                return index
        return None

    def _delete_chapters_not_in_toc(self) -> None:
        """Drop every chapter whose url is not in ``chapters_url_list``."""
        toc_urls = set(self.chapters_url_list)
        self.chapters = [
            chapter for chapter in self.chapters if chapter.chapter_url in toc_urls]

    def _create_chapters_from_toc(self):
        """Make ``self.chapters`` match ``chapters_url_list`` and save.

        Chapters missing from the TOC are removed, a new ``Chapter`` is
        appended for every url that has none, and the chapters are sorted in
        TOC order.
        """
        self._delete_chapters_not_in_toc()
        # The novel is saved every `save_interval` processed urls.
        save_interval = 99
        known_urls = {chapter.chapter_url for chapter in self.chapters}
        for processed, chapter_url in enumerate(self.chapters_url_list, start=1):
            if chapter_url not in known_urls:
                self.chapters.append(Chapter(chapter_url=chapter_url))
                known_urls.add(chapter_url)
            if processed % save_interval == 0:
                self.save_novel()
        self._order_chapters_by_link_list()
        self.save_novel()

    def _decode_chapter(self, chapter: Chapter, idx_for_chapter_name: Optional[int] = None) -> Optional[Chapter]:
        """Fill ``chapter_title`` and ``chapter_content`` from the html.

        Returns ``None`` when the chapter html cannot be obtained. When the
        decoder finds no title, one is generated from the novel title and
        ``idx_for_chapter_name``; if that is ``None`` the chapter's position
        in ``self.chapters`` is used.
        """
        if chapter.chapter_html is None:
            chapter = self._get_chapter(chapter)

        if not chapter.chapter_html:
            logger.error(
                f'No chapter content found for chapter link {chapter.chapter_url} '
                f'on file {chapter.chapter_html_filename}')
            return None

        # Treat a falsy decoder result as "no paragraphs", so the len() and
        # the iteration below cannot fail on None.
        paragraphs = self.decoder.decode_html(chapter.chapter_html, 'content') or []
        if not paragraphs:
            logger.warning(
                f'No paragraphs found in chapter link {chapter.chapter_url} '
                f'on file {chapter.chapter_html_filename}')

        chapter_title = self.decoder.decode_html(chapter.chapter_html, 'title')
        if not chapter_title:
            if idx_for_chapter_name is None:
                # Callers that select by url pass no index; without this every
                # generated title would be "<novel> Chapter None".
                idx_for_chapter_name = self._find_chapter_index_by_link(
                    chapter.chapter_url)
            chapter_title = f'{self.metadata.novel_title} Chapter {idx_for_chapter_name}'
        chapter.chapter_title = str(chapter_title)

        content_parts = []
        if self.scraper_behavior.save_title_to_content:
            content_parts.append(f'<h4>{chapter_title}</h4>')
        logger.info(f'{len(paragraphs)} paragraphs found in chapter')
        content_parts.extend(str(paragraph) for paragraph in paragraphs)
        chapter.chapter_content = "".join(content_parts)

        return chapter

    def _create_epub_book(self, book_title: str = None, calibre_collection: dict = None) -> epub.EpubBook:
        """Create an ``EpubBook`` carrying the novel metadata and cover.

        ``calibre_collection`` is expected to have the keys ``"title"`` and
        ``"idx"``; they are written as calibre series metadata.
        """
        book = epub.EpubBook()
        if not book_title:
            book_title = self.metadata.novel_title
        book.set_title(book_title)
        book.set_language(self.metadata.language)
        if self.metadata.description:
            # Only written when set, to avoid an empty description entry.
            book.add_metadata('DC', 'description', self.metadata.description)
        book.add_metadata('DC', 'subject', 'Novela Web')
        book.add_metadata('DC', 'subject', 'Scrapped')
        for tag in self.metadata.tags or []:
            book.add_metadata('DC', 'subject', tag)

        if self.metadata.author:
            book.add_author(self.metadata.author)

        # Calibre specification doesn't use end_date.
        # For now we use a custom metadata
        # https://idpf.org/epub/31/spec/epub-packages.html#sec-opf-dcdate
        if self.metadata.end_date:
            book.add_metadata('OPF', 'meta', self.metadata.end_date, {
                'name': 'end_date', 'content': self.metadata.end_date})
        date_metadata = self.metadata.start_date or ''
        if date_metadata:
            logger.debug(f'Using date_metadata {date_metadata}')
            book.add_metadata('DC', 'date', date_metadata)

        # Collections with calibre
        if calibre_collection:
            book.add_metadata('OPF', 'meta', '', {
                'name': 'calibre:series', 'content': calibre_collection["title"]})
            book.add_metadata('OPF', 'meta', '', {
                'name': 'calibre:series_index', 'content': calibre_collection["idx"]})

        cover_image_content = self.file_manager.load_novel_cover()
        if cover_image_content:
            book.set_cover('cover.jpg', cover_image_content)
            book.spine += ['cover']

        book.spine.append('nav')
        return book

    def _add_chapter_to_epub_book(self, chapter: Chapter, book: epub.EpubBook):
        """Scrap ``chapter`` and add it to ``book`` (item, toc link, spine).

        Returns the book, or ``None`` when the chapter cannot be scrapped.
        """
        chapter = self.scrap_chapter(
            chapter_url=chapter.chapter_url)
        if chapter is None:
            logger.warning('Error reading chapter')
            return None
        self._add_or_update_chapter_data(
            chapter=chapter, save_in_file=False)
        file_name = utils.generate_epub_file_name_from_title(
            chapter.chapter_title)

        chapter_epub = epub.EpubHtml(
            title=chapter.chapter_title, file_name=file_name)
        chapter_epub.set_content(chapter.chapter_content)
        book.add_item(chapter_epub)
        # removesuffix drops only the extension. rstrip('.xhtml') would strip
        # any trailing '.', 'x', 'h', 't', 'm' or 'l' characters from the name.
        link = epub.Link(file_name, chapter.chapter_title,
                         file_name.removesuffix('.xhtml'))
        book.toc.append(link)
        book.spine.append(chapter_epub)
        return book

    def _save_chapters_to_epub(self,
                               start_chapter: int,
                               end_chapter: int = None,
                               collection_idx: int = None) -> bool:
        """Write chapters ``start_chapter``..``end_chapter`` (1-based) to one epub.

        Returns True when the book was saved, False when the range is invalid
        or a chapter could not be added.
        """
        total_chapters = len(self.chapters)
        if start_chapter < 1 or start_chapter > total_chapters:
            logger.error('start_chapter out of range')
            return False
        # If end_chapter is not set or out of range, we use the last chapter
        if not end_chapter or end_chapter > total_chapters:
            end_chapter = total_chapters

        # We use a slice so every chapter starting from idx_start and before idx_end
        idx_start = start_chapter - 1
        idx_end = end_chapter
        # We create the epub book
        book_title = f'{self.metadata.novel_title} Chapters {start_chapter} - {end_chapter}'
        calibre_collection = None
        # If collection_idx is set, we create a calibre collection
        if collection_idx:
            calibre_collection = {'title': self.metadata.novel_title,
                                  'idx': str(collection_idx)}
        book = self._create_epub_book(book_title, calibre_collection)

        for chapter in self.chapters[idx_start:idx_end]:
            updated_book = self._add_chapter_to_epub_book(chapter=chapter,
                                                          book=book)
            if updated_book is None:
                logger.critical(
                    f'Error saving epub {book_title}, could not decode chapter '
                    f'{chapter} using host {self.host}')
                return False
            book = updated_book

        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())
        self.file_manager.save_book(book, f'{book_title}.epub')
        self.save_novel()
        return True