web-novel-scraper 1.0.2__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- web_novel_scraper/__init__.py +0 -0
- web_novel_scraper/__main__.py +430 -0
- web_novel_scraper/decode.py +141 -0
- web_novel_scraper/decode_guide/decode_guide.json +213 -0
- web_novel_scraper/file_manager.py +292 -0
- web_novel_scraper/logger_manager.py +72 -0
- web_novel_scraper/novel_scraper.py +723 -0
- web_novel_scraper/request_manager.py +135 -0
- web_novel_scraper/utils.py +66 -0
- web_novel_scraper/version.py +1 -0
- web_novel_scraper-1.0.2.dist-info/METADATA +231 -0
- web_novel_scraper-1.0.2.dist-info/RECORD +14 -0
- web_novel_scraper-1.0.2.dist-info/WHEEL +4 -0
- web_novel_scraper-1.0.2.dist-info/entry_points.txt +2 -0
web_novel_scraper/novel_scraper.py
@@ -0,0 +1,723 @@
+from dataclasses import dataclass, fields, field
+import sys
+
+from dataclasses_json import dataclass_json, config, Undefined
+from ebooklib import epub
+from typing import Optional
+
+from . import logger_manager
+from .decode import Decoder
+from .file_manager import FileManager
+from . import utils
+
+from . import request_manager
+
+logger = logger_manager.create_logger('NOVEL SCRAPING')
+
+
+@dataclass_json
+@dataclass
+class Metadata:
+    novel_title: str
+    author: Optional[str] = None
+    start_date: Optional[str] = None
+    end_date: Optional[str] = None
+    language: Optional[str] = "en"
+    description: Optional[str] = None
+    tags: list[str] = field(default_factory=list)
+
+    def update_behavior(self, **kwargs):
+        """
+        Updates the metadata dynamically.
+        Only updates the attributes provided in kwargs.
+        """
+        for key, value in kwargs.items():
+            if hasattr(self, key) and value is not None:
+                setattr(self, key, value)
+
+    def __str__(self):
+        """
+        Dynamic string representation of the metadata.
+        """
+        attributes = [f"{f.name}={getattr(self, f.name)}" for f in fields(self)]
+        return "Metadata: \n" + "\n".join(attributes)
+
+
+@dataclass_json
+@dataclass
+class ScraperBehavior:
+    # Some novels already include the title in the content.
+    save_title_to_content: bool = False
+    # Some novels list the TOC links without the host.
+    auto_add_host: bool = False
+    # Some hosts return 403 when scraping; this forces the use of
+    # FlareSolverr from the start to save time.
+    force_flaresolver: bool = False
+    # When cleaning the HTML files, use hard clean by default.
+    hard_clean: bool = False
+
+    def update_behavior(self, **kwargs):
+        """
+        Updates the behavior configuration dynamically.
+        Only updates the attributes provided in kwargs.
+        """
+        for key, value in kwargs.items():
+            if hasattr(self, key) and value is not None:
+                setattr(self, key, value)
+
+    def __str__(self):
+        """
+        Dynamic string representation of the configuration.
+        """
+        attributes = [f"{f.name}={getattr(self, f.name)}" for f in fields(self)]
+        return "Scraper Behavior: \n" + "\n".join(attributes)
+
+
+@dataclass_json(undefined=Undefined.EXCLUDE)
+@dataclass
+class Chapter:
+    chapter_url: str
+    chapter_html_filename: Optional[str] = None
+    chapter_title: Optional[str] = None
+
+    def __init__(self,
+                 chapter_url: str,
+                 chapter_html: str = None,
+                 chapter_content: str = None,
+                 chapter_html_filename: str = None,
+                 chapter_title: str = None):
+        self.chapter_url = chapter_url
+        self.chapter_html = chapter_html
+        self.chapter_content = chapter_content
+        self.chapter_html_filename = chapter_html_filename
+        self.chapter_title = chapter_title
+
+    def __str__(self):
+        return f'Title: "{self.chapter_title}"\nURL: "{self.chapter_url}"\nFilename: "{self.chapter_html_filename}"'
+
+    def __lt__(self, another):
+        return self.chapter_title < another.chapter_title
+
+
+@dataclass_json
+@dataclass
+class Novel:
+    metadata: Metadata
+    scraper_behavior: ScraperBehavior = None
+    chapters: list[Chapter] = field(default_factory=list)
+    toc_main_url: Optional[str] = None
+    chapters_url_list: list[str] = field(default_factory=list)
+    host: str = None
+
+    def __init__(self,
+                 novel_title: str = None,
+                 toc_main_url: str = None,
+                 toc_html: str = None,
+                 chapters_url_list: list[str] = None,
+                 metadata: Metadata = None,
+                 chapters: list[Chapter] = None,
+                 novel_base_dir: str = None,
+                 scraper_behavior: ScraperBehavior = None,
+                 host: str = None):
+
+        if toc_main_url and toc_html:
+            logger.error('Only one of toc_main_url or toc_html can be set.')
+            sys.exit(1)
+
+        if metadata is not None:
+            self.metadata = metadata
+        elif novel_title is not None:
+            self.metadata = Metadata(novel_title)
+        else:
+            logger.error('You need to set "novel_title" or "metadata".')
+            sys.exit(1)
+
+        self.file_manager = FileManager(novel_title=self.metadata.novel_title,
+                                        novel_base_dir=novel_base_dir)
+
+        if toc_html:
+            self.file_manager.add_toc(toc_html)
+
+        self.toc_main_url = toc_main_url
+        self.chapters_url_list = chapters_url_list if chapters_url_list else []
+
+        self.chapters = chapters if chapters else []
+
+        self.scraper_behavior = scraper_behavior if scraper_behavior else ScraperBehavior()
+        if not host and not toc_main_url:
+            logger.error('You need to set "host" or "toc_main_url".')
+            sys.exit(1)
+
+        self.host = host if host else utils.obtain_host(self.toc_main_url)
+        self.decoder = Decoder(self.host)
+
+        self.save_novel()
+
+    def __str__(self):
+        """
+        Dynamic string representation of the novel.
+        """
+        toc_info = self.toc_main_url if self.toc_main_url else "TOC added manually"
+        attributes = [
+            f"Title: {self.metadata.novel_title}",
+            f"Author: {self.metadata.author}",
+            f"Language: {self.metadata.language}",
+            f"Description: {self.metadata.description}",
+            f"Tags: {', '.join(self.metadata.tags)}",
+            f"TOC Info: {toc_info}",
+            f"Host: {self.host}"
+        ]
+        return "Novel Info: \n" + "\n".join(attributes)
+
+    # NOVEL PARAMETERS MANAGEMENT
+
+    def set_scraper_behavior(self, **kwargs) -> None:
+        self.scraper_behavior.update_behavior(**kwargs)
+        self.save_novel()
+
+    def set_metadata(self, **kwargs) -> None:
+        self.metadata.update_behavior(**kwargs)
+        self.save_novel()
+
+    def add_tag(self, tag: str) -> bool:
+        if tag not in self.metadata.tags:
+            self.metadata.tags.append(tag)
+            self.save_novel()
+            return True
+        logger.warning(f'Tag "{tag}" already exists on novel {self.metadata.novel_title}')
+        return False
+
+    def remove_tag(self, tag: str) -> bool:
+        if tag in self.metadata.tags:
+            self.metadata.tags.remove(tag)
+            self.save_novel()
+            return True
+        logger.warning(f'Tag "{tag}" doesn\'t exist on novel {self.metadata.novel_title}')
+        return False
+
+    def set_cover_image(self, cover_image_path: str) -> bool:
+        return self.file_manager.save_novel_cover(cover_image_path)
+
+    def set_host(self, host: str) -> None:
+        self.host = host
+        self.decoder = Decoder(self.host)
+        self.save_novel()
+
+    def save_novel(self) -> None:
+        self.file_manager.save_novel_json(self.to_dict())
+
+    # TABLE OF CONTENTS MANAGEMENT
+
+    def set_toc_main_url(self, toc_main_url: str, host: str = None, update_host: bool = False) -> None:
+        self.toc_main_url = toc_main_url
+        self.file_manager.delete_toc()
+        if host:
+            self.host = host
+            self.decoder = Decoder(self.host)
+        elif update_host:
+            self.decoder = Decoder(utils.obtain_host(self.toc_main_url))
+
+    def add_toc_html(self, html: str, host: str = None) -> None:
+        if self.toc_main_url:
+            self.delete_toc()
+            # Unset toc_main_url, since a manual TOC and a TOC URL are exclusive.
+            self.toc_main_url = None
+
+        if host:
+            self.host = host
+            self.decoder = Decoder(self.host)
+        self.file_manager.add_toc(html)
+        self.save_novel()
+
+    def delete_toc(self):
+        self.file_manager.delete_toc()
+        self.chapters = []
+        self.chapters_url_list = []
+        self.save_novel()
+
+    def sync_toc(self, reload_files: bool = False) -> bool:
+        # A hard reload requests the TOC files again from toc_main_url,
+        # so it only works when toc_main_url is set.
+        all_tocs_content = self.file_manager.get_all_toc()
+
+        # If there is no toc_main_url and no manually added TOC, there is no way to sync the TOC.
+        toc_not_exists = not all_tocs_content and self.toc_main_url is None
+        if toc_not_exists:
+            logger.critical(
+                'There is no TOC html and no TOC url set, unable to get TOC.')
+            return False
+
+        reload_files = reload_files and self.toc_main_url is not None
+        if reload_files or not all_tocs_content:
+            self.chapters = []
+            self.file_manager.delete_toc()
+            all_tocs_content = []
+            toc_content = self._add_toc(self.toc_main_url)
+            all_tocs_content.append(toc_content)
+            if self.decoder.has_pagination():
+                next_page = self._get_next_page_from_toc_content(toc_content)
+                while next_page:
+                    toc_content = self._add_toc(next_page)
+                    next_page = self._get_next_page_from_toc_content(toc_content)
+                    all_tocs_content.append(toc_content)
+
+        # Now we get the chapter links from the TOC content.
+        self.chapters_url_list = []
+        for toc_content in all_tocs_content:
+            chapters_url_from_toc_content = self._get_chapter_urls_from_toc_content(toc_content)
+            if chapters_url_from_toc_content is None:
+                logger.error('Chapter urls not found on toc_content')
+                return False
+            self.chapters_url_list = [*self.chapters_url_list,
+                                      *chapters_url_from_toc_content]
+        if self.scraper_behavior.auto_add_host:
+            self.chapters_url_list = [
+                f'https://{self.host}{chapter_url}' for chapter_url in self.chapters_url_list]
+        self.chapters_url_list = utils.delete_duplicates(self.chapters_url_list)
+        self.save_novel()
+        self._create_chapters_from_toc()
+        return True
+
+    def show_toc(self):
+        if not self.chapters_url_list:
+            return 'No chapters in TOC, reload TOC and try again'
+        toc_str = 'Table Of Contents:'
+        for i, chapter_url in enumerate(self.chapters_url_list):
+            toc_str += f'\nChapter {i + 1}: {chapter_url}'
+        return toc_str
+
+    # CHAPTERS MANAGEMENT
+
+    def show_chapters(self) -> str:
+        chapter_list = "Chapters List:\n"
+        for i, chapter in enumerate(self.chapters):
+            chapter_list += f"Chapter {i + 1}:\n"
+            chapter_list += f"  Title: {chapter.chapter_title if chapter.chapter_title else 'Title not yet scraped'}\n"
+            chapter_list += f"  URL: {chapter.chapter_url}\n"
+            chapter_list += f"  Filename: {chapter.chapter_html_filename if chapter.chapter_html_filename else 'File not yet requested'}\n"
+        return chapter_list
+
+    def scrap_chapter(self, chapter_url: str = None, chapter_idx: int = None, update_html: bool = False) -> Optional[Chapter]:
+        if not utils.check_exclusive_params(chapter_url, chapter_idx):
+            logger.error('Exactly one of chapter_url or chapter_idx must be set')
+            return
+
+        if chapter_url is not None:
+            chapter = self._get_chapter_by_url(chapter_url=chapter_url)
+            if chapter is None:
+                chapter = Chapter(chapter_url=chapter_url)
+
+        if chapter_idx is not None:
+            if chapter_idx < 0 or chapter_idx >= len(self.chapters):
+                logger.error(f'Could not find chapter with idx {chapter_idx}')
+                return
+            chapter = self.chapters[chapter_idx]
+
+        chapter = self._get_chapter(chapter, reload=update_html)
+
+        if not chapter.chapter_html or not chapter.chapter_html_filename:
+            logger.warning(f'Failed to create chapter on link: "{chapter.chapter_url}" on path "{chapter.chapter_html_filename}"')
+            return
+
+        # We get the title and content; if there is no title, we autogenerate one.
+        chapter = self._decode_chapter(chapter=chapter, idx_for_chapter_name=chapter_idx)
+        if chapter is None or not chapter.chapter_content:
+            logger.error('Content not found')
+            return
+
+        logger.info(f'Chapter scraped from link: {chapter.chapter_url}')
+        return chapter
+
+    def scrap_all_chapters(self, sync_toc: bool = False, update_chapters: bool = False, update_html: bool = False) -> None:
+        if sync_toc:
+            self.sync_toc()
+        # We scrap all chapters from our chapter list.
+        if self.chapters_url_list:
+            for i, chapter in enumerate(self.chapters):
+
+                # If update_chapters is true, we scrap the chapter info again.
+                if update_chapters:
+                    chapter = self.scrap_chapter(chapter_idx=i,
+                                                 update_html=update_html)
+                    self._add_or_update_chapter_data(chapter=chapter, link_idx=i)
+                    continue
+                # If not, we only update if the chapter doesn't have a title or html.
+                if chapter.chapter_html_filename and chapter.chapter_title:
+                    continue
+                chapter = self.scrap_chapter(chapter_idx=i,
+                                             update_html=update_html)
+                self._add_or_update_chapter_data(chapter=chapter,
+                                                 save_in_file=True)
+        else:
+            logger.warning('No chapters found')
+
+    def request_all_chapters(self, sync_toc: bool = False, update_html: bool = False, clean_chapters: bool = False) -> bool:
+        if sync_toc:
+            self.sync_toc()
+        if self.chapters_url_list:
+            # We request the HTML files of all the chapters.
+            for i, chapter in enumerate(self.chapters):
+                # If the chapter exists and update_html is false, we can skip it.
+                if chapter.chapter_html_filename and not update_html:
+                    continue
+                chapter = self._get_chapter(chapter=chapter, reload=update_html)
+                if not chapter.chapter_html_filename:
+                    logger.critical(f'Error requesting chapter {i} with url {chapter.chapter_url}')
+                    return False
+
+                self._add_or_update_chapter_data(chapter=chapter, link_idx=i,
+                                                 save_in_file=True)
+                if clean_chapters:
+                    self._clean_chapter(chapter.chapter_html_filename)
+            return True
+        else:
+            logger.warning('No chapters found')
+            return False
+
+    # EPUB CREATION
+
+    def save_novel_to_epub(self,
+                           sync_toc: bool = False,
+                           start_chapter: int = 1,
+                           end_chapter: int = None,
+                           chapters_by_book: int = 100) -> bool:
+        if sync_toc:
+            self.sync_toc()
+
+        if start_chapter > len(self.chapters):
+            logger.info(f'The start chapter is greater than the number of saved chapters ({len(self.chapters)})')
+            return False
+
+        if not end_chapter:
+            end_chapter = len(self.chapters)
+        elif end_chapter > len(self.chapters):
+            end_chapter = len(self.chapters)
+            logger.info(f'The end chapter is greater than the number of chapters, automatically setting it to {end_chapter}.')
+
+        idx = 1
+        start = start_chapter
+        while start <= end_chapter:
+            end = min(start + chapters_by_book - 1, end_chapter)
+            result = self._save_chapters_to_epub(start_chapter=start,
+                                                 end_chapter=end,
+                                                 collection_idx=idx)
+            if not result:
+                logger.critical(f'Error saving novel to epub, with start chapter: {start_chapter} and end chapter: {end_chapter}')
+                return False
+            start = start + chapters_by_book
+            idx = idx + 1
+        return True
+
+
+    # UTILS
+
+
+    def clean_files(self, clean_chapters: bool = True, clean_toc: bool = True, hard_clean: bool = False) -> None:
+        hard_clean = hard_clean or self.scraper_behavior.hard_clean
+        if clean_chapters:
+            for chapter in self.chapters:
+                if chapter.chapter_html_filename:
+                    self._clean_chapter(chapter.chapter_html_filename, hard_clean)
+        if clean_toc:
+            self._clean_toc(hard_clean)
+
+    def show_novel_dir(self) -> str:
+        return self.file_manager.novel_base_dir
+
+    def _clean_chapter(self, chapter_html_filename: str, hard_clean: bool = False) -> None:
+        hard_clean = hard_clean or self.scraper_behavior.hard_clean
+        chapter_html = self.file_manager.load_chapter_html(chapter_html_filename)
+        if not chapter_html:
+            logger.warning(f'No content found on file {chapter_html_filename}')
+            return
+        chapter_html = self.decoder.clean_html(chapter_html, hard_clean=hard_clean)
+        self.file_manager.save_chapter_html(chapter_html_filename, chapter_html)
+
+    def _clean_toc(self, hard_clean: bool = False) -> None:
+        hard_clean = hard_clean or self.scraper_behavior.hard_clean
+        tocs_content = self.file_manager.get_all_toc()
+        for i, toc in enumerate(tocs_content):
+            toc = self.decoder.clean_html(toc, hard_clean=hard_clean)
+            self.file_manager.update_toc(toc, i)
+
+    def _get_chapter(self,
+                     chapter: Chapter,
+                     reload: bool = False) -> Chapter | None:
+
+        # Generate the HTML filename if needed.
+        if not chapter.chapter_html_filename:
+            chapter.chapter_html_filename = utils.generate_file_name_from_url(chapter.chapter_url)
+
+        # Try loading from cache first.
+        if not reload:
+            chapter.chapter_html = self.file_manager.load_chapter_html(chapter.chapter_html_filename)
+            if chapter.chapter_html:
+                return chapter
+
+        # Fetch fresh content.
+        chapter.chapter_html = request_manager.get_html_content(chapter.chapter_url,
+                                                                force_flaresolver=self.scraper_behavior.force_flaresolver)
+        if not chapter.chapter_html:
+            logger.error(f'No content found on link {chapter.chapter_url}')
+            return chapter
+
+        # Save the content to disk.
+        self.file_manager.save_chapter_html(chapter.chapter_html_filename, chapter.chapter_html)
+        return chapter
+
+    def _add_toc(self,
+                 url: str,
+                 toc_filename: str = None,
+                 reload: bool = False):
+        if not reload:
+            content = self.file_manager.get_toc(toc_filename)
+            if content:
+                return content
+
+        content = request_manager.get_html_content(url)
+        if not content:
+            logger.warning(f'No content found on link {url}')
+            sys.exit(1)
+
+        self.file_manager.add_toc(content)
+        return content
+
+    def _get_chapter_urls_from_toc_content(self, toc_content: str) -> Optional[list[str]]:
+        toc_elements = self.decoder.decode_html(toc_content, 'index')
+        try:
+            toc_urls = [toc_element['href'] for toc_element in toc_elements]
+        except KeyError as e:
+            logger.error(f'{e} not found on the Tag elements decoded from TOC')
+            return None
+        if toc_urls:
+            return toc_urls
+        logger.warning('No chapter links found on toc content')
+        return None
+
+    def _get_next_page_from_toc_content(self, toc_content: str) -> Optional[str]:
+        next_page = self.decoder.decode_html(toc_content, 'next_page')
+        if next_page:
+            return next_page[0]['href']
+        return None
+
+    def _add_or_update_chapter_data(self, chapter: Chapter, link_idx: int = None, save_in_file: bool = True) -> int:
+        if link_idx is not None:
+            chapter_idx = link_idx
+        else:
+            # Check if the chapter already exists.
+            chapter_idx = self._find_chapter_index_by_link(chapter.chapter_url)
+            if chapter_idx is None:
+                # If there is no existing chapter, we append it.
+                self.chapters.append(chapter)
+                chapter_idx = len(self.chapters) - 1
+            else:
+                if chapter.chapter_title:
+                    self.chapters[chapter_idx].chapter_title = chapter.chapter_title
+                if chapter.chapter_html_filename:
+                    self.chapters[chapter_idx].chapter_html_filename = chapter.chapter_html_filename
+        if save_in_file:
+            self.save_novel()
+        return chapter_idx
+
+    def _order_chapters_by_link_list(self) -> None:
+        self.chapters.sort(key=lambda x: self.chapters_url_list.index(x.chapter_url))
+
+    def _get_chapter_by_url(self, chapter_url: str) -> Optional[Chapter]:
+        for chapter in self.chapters:
+            if chapter_url == chapter.chapter_url:
+                return chapter
+        return None
+
+    def _find_chapter_index_by_link(self, chapter_url: str) -> Optional[int]:
+        for index, chapter in enumerate(self.chapters):
+            if chapter.chapter_url == chapter_url:
+                return index
+        return None
+
+    def _delete_chapters_not_in_toc(self) -> None:
+        self.chapters = [
+            chapter for chapter in self.chapters if chapter.chapter_url in self.chapters_url_list]
+
+    def _create_chapters_from_toc(self):
+        self._delete_chapters_not_in_toc()
+        # Save progress to disk every `increment` chapters.
+        increment = 100
+        aux = 1
+        for chapter_url in self.chapters_url_list:
+            aux += 1
+            chapter_idx = self._find_chapter_index_by_link(chapter_url)
+            if chapter_idx is None:
+                chapter = Chapter(chapter_url=chapter_url)
+                self._add_or_update_chapter_data(chapter=chapter, save_in_file=False)
+            if aux == increment:
+                self.save_novel()
+                aux = 1
+        self._order_chapters_by_link_list()
+        self.save_novel()
+
+    def _decode_chapter(self, chapter: Chapter, idx_for_chapter_name: int = None) -> Optional[Chapter]:
+        chapter_title = None
+
+        if chapter.chapter_html is None:
+            chapter = self._get_chapter(chapter)
+
+        if not chapter.chapter_html:
+            logger.error(f'No chapter content found for chapter link {chapter.chapter_url} on file {chapter.chapter_html_filename}')
+            return None
+
+        paragraphs = self.decoder.decode_html(chapter.chapter_html, 'content')
+
+        if not paragraphs:
+            logger.warning(f'No paragraphs found in chapter link {chapter.chapter_url} on file {chapter.chapter_html_filename}')
+            return chapter
+
+        chapter_title = self.decoder.decode_html(chapter.chapter_html, 'title')
+        if not chapter_title:
+            chapter_title = f'{self.metadata.novel_title} Chapter {idx_for_chapter_name}'
+        chapter.chapter_title = str(chapter_title)
+
+        chapter.chapter_content = ""
+        if self.scraper_behavior.save_title_to_content:
+            chapter.chapter_content += f'<h4>{chapter_title}</h4>'
+        logger.info(f'{len(paragraphs)} paragraphs found in chapter')
+        for paragraph in paragraphs:
+            chapter.chapter_content += str(paragraph)
+
+        return chapter
+
+    def _create_epub_book(self, book_title: str = None, calibre_collection: dict = None) -> epub.EpubBook:
+        book = epub.EpubBook()
+        if not book_title:
+            book_title = self.metadata.novel_title
+        book.set_title(book_title)
+        book.set_language(self.metadata.language)
+        book.add_metadata('DC', 'description', self.metadata.description)
+        book.add_metadata('DC', 'subject', 'Web Novel')
+        book.add_metadata('DC', 'subject', 'Scraped')
+        if self.metadata.tags:
+            for tag in self.metadata.tags:
+                book.add_metadata('DC', 'subject', tag)
+
+        if self.metadata.author:
+            book.add_author(self.metadata.author)
+
+        date_metadata = ''
+        if self.metadata.start_date:
+            date_metadata += self.metadata.start_date
+        # The EPUB dc:date element doesn't cover an end date,
+        # so for now we store end_date as custom metadata.
+        # https://idpf.org/epub/31/spec/epub-packages.html#sec-opf-dcdate
+        # if self.metadata.end_date:
+        #     date_metadata += f'/{self.metadata.end_date}'
+        if self.metadata.end_date:
+            book.add_metadata('OPF', 'meta', self.metadata.end_date, {
+                'name': 'end_date', 'content': self.metadata.end_date})
+        if date_metadata:
+            logger.debug(f'Using date_metadata {date_metadata}')
+            book.add_metadata('DC', 'date', date_metadata)
+
+        # Collections with Calibre.
+        if calibre_collection:
+            book.add_metadata('OPF', 'meta', '', {
+                'name': 'calibre:series', 'content': calibre_collection["title"]})
+            book.add_metadata('OPF', 'meta', '', {
+                'name': 'calibre:series_index', 'content': calibre_collection["idx"]})
+
+        cover_image_content = self.file_manager.load_novel_cover()
+        if cover_image_content:
+            book.set_cover('cover.jpg', cover_image_content)
+            book.spine += ['cover']
+
+        book.spine.append('nav')
+        return book
+
+    def _add_chapter_to_epub_book(self, chapter: Chapter, book: epub.EpubBook):
+        chapter = self.scrap_chapter(chapter_url=chapter.chapter_url)
+        if chapter is None:
+            logger.warning('Error reading chapter')
+            return None
+        self._add_or_update_chapter_data(chapter=chapter, save_in_file=False)
+        file_name = utils.generate_epub_file_name_from_title(chapter.chapter_title)
+
+        chapter_epub = epub.EpubHtml(title=chapter.chapter_title, file_name=file_name)
+        chapter_epub.set_content(chapter.chapter_content)
+        book.add_item(chapter_epub)
+        link = epub.Link(file_name, chapter.chapter_title,
+                         file_name.removesuffix('.xhtml'))
+        book.toc.append(link)
+        book.spine.append(chapter_epub)
+        return book
+
+    def _save_chapters_to_epub(self,
+                               start_chapter: int,
+                               end_chapter: int = None,
+                               collection_idx: int = None):
+
+        if start_chapter > len(self.chapters):
+            logger.error('start_chapter out of range')
+            return False
+        # If end_chapter is not set, we save through the last chapter.
+        if not end_chapter:
+            end_chapter = len(self.chapters)
+        # If end_chapter is out of range, we set it to the last chapter.
+        if end_chapter > len(self.chapters):
+            end_chapter = len(self.chapters)
+
+        # We use a slice, so we take every chapter from idx_start up to (but not including) idx_end.
+        idx_start = start_chapter - 1
+        idx_end = end_chapter
+        # We create the epub book.
+        book_title = f'{self.metadata.novel_title} Chapters {start_chapter} - {end_chapter}'
+        calibre_collection = None
+        # If collection_idx is set, we create a Calibre collection.
+        if collection_idx:
+            calibre_collection = {'title': self.metadata.novel_title,
+                                  'idx': str(collection_idx)}
+        book = self._create_epub_book(book_title, calibre_collection)
+
+        for chapter in self.chapters[idx_start:idx_end]:
+            book = self._add_chapter_to_epub_book(chapter=chapter, book=book)
+            if book is None:
+                logger.critical(f'Error saving epub {book_title}, could not decode chapter {chapter} using host {self.host}')
+                return False
+
+        book.add_item(epub.EpubNcx())
+        book.add_item(epub.EpubNav())
+        self.file_manager.save_book(book, f'{book_title}.epub')
+        self.save_novel()
+        return True
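
The listing above is the package's core module. As a quick orientation, here is a minimal usage sketch based only on the API visible in this diff; the title and TOC URL are placeholders, and it assumes the target site has a matching entry in decode_guide/decode_guide.json:

    from web_novel_scraper.novel_scraper import Novel

    # Placeholder novel; any host supported by the decode guide should work similarly.
    novel = Novel(novel_title='Example Novel',
                  toc_main_url='https://example.com/novel/example-novel/')

    novel.sync_toc()              # fetch the TOC (and paginated TOC pages) and build the chapter list
    novel.request_all_chapters()  # download and cache each chapter's HTML
    novel.save_novel_to_epub(chapters_by_book=100)  # write one EPUB per 100 chapters

Behavior flags such as force_flaresolver or auto_add_host can be toggled afterwards with novel.set_scraper_behavior(force_flaresolver=True), which persists the change through save_novel().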