web-novel-scraper 2.0.3__py3-none-any.whl → 2.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- web_novel_scraper/__main__.py +123 -69
- web_novel_scraper/config_manager.py +12 -12
- web_novel_scraper/custom_processor/__init__.py +1 -1
- web_novel_scraper/custom_processor/sites/fanmtl.py +15 -0
- web_novel_scraper/decode.py +225 -80
- web_novel_scraper/decode_guide/decode_guide.json +28 -0
- web_novel_scraper/file_manager.py +292 -110
- web_novel_scraper/models.py +76 -0
- web_novel_scraper/novel_scraper.py +893 -424
- web_novel_scraper/request_manager.py +50 -17
- web_novel_scraper/utils.py +22 -1
- web_novel_scraper/version.py +1 -1
- {web_novel_scraper-2.0.3.dist-info → web_novel_scraper-2.1.1.dist-info}/METADATA +1 -1
- web_novel_scraper-2.1.1.dist-info/RECORD +21 -0
- web_novel_scraper-2.0.3.dist-info/RECORD +0 -19
- {web_novel_scraper-2.0.3.dist-info → web_novel_scraper-2.1.1.dist-info}/WHEEL +0 -0
- {web_novel_scraper-2.0.3.dist-info → web_novel_scraper-2.1.1.dist-info}/entry_points.txt +0 -0
@@ -1,163 +1,86 @@
|
|
1
|
-
from dataclasses import dataclass,
|
2
|
-
import sys
|
1
|
+
from dataclasses import dataclass, field, replace
|
3
2
|
|
4
|
-
from dataclasses_json import dataclass_json,
|
3
|
+
from dataclasses_json import dataclass_json, Undefined, config
|
5
4
|
from ebooklib import epub
|
6
5
|
from typing import Optional
|
6
|
+
from pathlib import Path
|
7
7
|
|
8
8
|
from . import logger_manager
|
9
9
|
from .decode import Decoder
|
10
10
|
from .file_manager import FileManager
|
11
11
|
from . import utils
|
12
|
-
|
13
12
|
from .request_manager import get_html_content
|
14
13
|
from .config_manager import ScraperConfig
|
14
|
+
from .models import ScraperBehavior, Metadata, Chapter
|
15
|
+
from .utils import _always, ScraperError, FileManagerError, NetworkError, ValidationError, DecodeError
|
15
16
|
|
16
17
|
logger = logger_manager.create_logger('NOVEL SCRAPPING')
|
17
18
|
|
18
19
|
|
19
|
-
@dataclass_json
|
20
|
-
@dataclass
|
21
|
-
class Metadata:
|
22
|
-
author: Optional[str] = None
|
23
|
-
start_date: Optional[str] = None
|
24
|
-
end_date: Optional[str] = None
|
25
|
-
language: Optional[str] = "en"
|
26
|
-
description: Optional[str] = None
|
27
|
-
tags: list[str] = field(default_factory=list)
|
28
|
-
|
29
|
-
def update_behavior(self, **kwargs):
|
30
|
-
"""
|
31
|
-
Updates the behavior configuration dynamically.
|
32
|
-
Only updates the attributes provided in kwargs.
|
33
|
-
"""
|
34
|
-
for key, value in kwargs.items():
|
35
|
-
if hasattr(self, key) and value is not None:
|
36
|
-
setattr(self, key, value)
|
37
|
-
|
38
|
-
def __str__(self):
|
39
|
-
"""
|
40
|
-
Dynamic string representation of the configuration.
|
41
|
-
"""
|
42
|
-
attributes = [(f"{field.name}="
|
43
|
-
f"{getattr(self, field.name)}") for field in fields(self)]
|
44
|
-
attributes_str = '\n'.join(attributes)
|
45
|
-
return (f"Metadata: \n"
|
46
|
-
f"{attributes_str}")
|
47
|
-
|
48
|
-
|
49
|
-
@dataclass_json
|
50
|
-
@dataclass
|
51
|
-
class ScraperBehavior:
|
52
|
-
# Some novels already have the title in the content.
|
53
|
-
save_title_to_content: bool = False
|
54
|
-
# Some novels have the toc link without the host
|
55
|
-
auto_add_host: bool = False
|
56
|
-
# Some hosts return 403 when scrapping, this will force the use of FlareSolver
|
57
|
-
# to save time
|
58
|
-
force_flaresolver: bool = False
|
59
|
-
# When you clean the html files, you can use hard clean by default
|
60
|
-
hard_clean: bool = False
|
61
|
-
|
62
|
-
def update_behavior(self, **kwargs):
|
63
|
-
"""
|
64
|
-
Updates the behavior configuration dynamically.
|
65
|
-
Only updates the attributes provided in kwargs.
|
66
|
-
"""
|
67
|
-
for key, value in kwargs.items():
|
68
|
-
if hasattr(self, key) and value is not None:
|
69
|
-
setattr(self, key, value)
|
70
|
-
|
71
|
-
def __str__(self):
|
72
|
-
"""
|
73
|
-
Dynamic string representation of the configuration.
|
74
|
-
"""
|
75
|
-
attributes = [(f"{field.name}="
|
76
|
-
f"{getattr(self, field.name)}") for field in fields(self)]
|
77
|
-
attributes_str = '\n'.join(attributes)
|
78
|
-
return (f"Scraper Behavior: \n"
|
79
|
-
f"{attributes_str}")
|
80
|
-
|
81
|
-
|
82
|
-
@dataclass_json(undefined=Undefined.EXCLUDE)
|
83
|
-
@dataclass
|
84
|
-
class Chapter:
|
85
|
-
chapter_url: str
|
86
|
-
chapter_html_filename: Optional[str] = None
|
87
|
-
chapter_title: Optional[str] = None
|
88
|
-
|
89
|
-
def __init__(self,
|
90
|
-
chapter_url: str,
|
91
|
-
chapter_html: str = None,
|
92
|
-
chapter_content: str = None,
|
93
|
-
chapter_html_filename: str = None,
|
94
|
-
chapter_title: str = None):
|
95
|
-
self.chapter_url = chapter_url
|
96
|
-
self.chapter_html = chapter_html
|
97
|
-
self.chapter_content = chapter_content
|
98
|
-
self.chapter_html_filename = chapter_html_filename
|
99
|
-
self.chapter_title = chapter_title
|
100
|
-
|
101
|
-
def __str__(self):
|
102
|
-
return f'Title: "{self.chapter_title}"\nURL: "{self.chapter_url}"\nFilename: "{self.chapter_html_filename}"'
|
103
|
-
|
104
|
-
def __lt__(self, another):
|
105
|
-
return self.chapter_title < another.chapter_title
|
106
|
-
|
107
|
-
|
108
20
|
@dataclass_json(undefined=Undefined.EXCLUDE)
|
109
21
|
@dataclass
|
110
22
|
class Novel:
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
23
|
+
"""
|
24
|
+
A class representing a web novel with its metadata and content.
|
25
|
+
|
26
|
+
This class handles all operations related to scraping, storing, and managing web novels,
|
27
|
+
including their chapters, table of contents, and metadata.
|
28
|
+
|
29
|
+
Attributes:
|
30
|
+
title (str): The title of the novel.
|
31
|
+
host (Optional[str]): The host domain where the novel is located.
|
32
|
+
toc_main_url (Optional[str]): The main URL for the table of contents.
|
33
|
+
chapters (list[Chapter]): List of chapters in the novel.
|
34
|
+
chapters_url_list (list[str]): List of URLs for all chapters.
|
35
|
+
metadata (Metadata): Novel metadata like author, language, etc.
|
36
|
+
scraper_behavior (ScraperBehavior): Configuration for scraping behavior.
|
37
|
+
file_manager (FileManager): Handles file operations for the novel.
|
38
|
+
decoder (Decoder): Handles HTML decoding and parsing.
|
39
|
+
config (ScraperConfig): General scraper configuration.
|
40
|
+
"""
|
41
|
+
|
42
|
+
title: str
|
43
|
+
host: Optional[str] = None
|
115
44
|
toc_main_url: Optional[str] = None
|
45
|
+
chapters: list[Chapter] = field(default_factory=list)
|
116
46
|
chapters_url_list: list[str] = field(default_factory=list)
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
self.metadata = metadata
|
137
|
-
|
138
|
-
if toc_html:
|
139
|
-
self.file_manager.add_toc(toc_html)
|
140
|
-
|
141
|
-
self.toc_main_url = toc_main_url
|
142
|
-
self.chapters_url_list = chapters_url_list if chapters_url_list else []
|
143
|
-
|
144
|
-
self.chapters = chapters if chapters else []
|
145
|
-
|
146
|
-
self.scraper_behavior = scraper_behavior if scraper_behavior else ScraperBehavior()
|
147
|
-
if not host and not toc_main_url:
|
148
|
-
logger.error('You need to set "host" or "toc_main_url".')
|
149
|
-
sys.exit(1)
|
47
|
+
metadata: Metadata = field(default_factory=Metadata)
|
48
|
+
scraper_behavior: ScraperBehavior = field(default_factory=ScraperBehavior)
|
49
|
+
|
50
|
+
file_manager: FileManager = field(default=None,
|
51
|
+
repr=False,
|
52
|
+
compare=False,
|
53
|
+
metadata=config(exclude=_always))
|
54
|
+
decoder: Decoder = field(default=None,
|
55
|
+
repr=False,
|
56
|
+
compare=False,
|
57
|
+
metadata=config(exclude=_always))
|
58
|
+
config: ScraperConfig = field(default=None,
|
59
|
+
repr=False,
|
60
|
+
compare=False,
|
61
|
+
metadata=config(exclude=_always))
|
62
|
+
|
63
|
+
def __post_init__(self):
|
64
|
+
"""
|
65
|
+
Validates the novel instance after initialization.
|
150
66
|
|
151
|
-
|
67
|
+
Raises:
|
68
|
+
ValidationError: If the title is empty or neither host nor toc_main_url is provided.
|
69
|
+
"""
|
152
70
|
|
153
|
-
self.
|
154
|
-
|
155
|
-
self.
|
71
|
+
if not self.title:
|
72
|
+
raise ValidationError("title can't be empty")
|
73
|
+
if not (self.host or self.toc_main_url):
|
74
|
+
raise ValidationError('You must provide "host" or "toc_main_url"')
|
156
75
|
|
157
76
|
def __str__(self):
|
158
77
|
"""
|
159
|
-
|
78
|
+
Returns a string representation of the novel with its main attributes.
|
79
|
+
|
80
|
+
Returns:
|
81
|
+
str: A formatted string containing the novel's main information.
|
160
82
|
"""
|
83
|
+
|
161
84
|
toc_info = self.toc_main_url if self.toc_main_url else "TOC added manually"
|
162
85
|
attributes = [
|
163
86
|
f"Title: {self.title}",
|
@@ -172,99 +95,316 @@ class Novel:
|
|
172
95
|
return (f"Novel Info: \n"
|
173
96
|
f"{attributes_str}")
|
174
97
|
|
175
|
-
@
|
176
|
-
def load(title: str, cfg: ScraperConfig, novel_base_dir:
|
98
|
+
@classmethod
|
99
|
+
def load(cls, title: str, cfg: ScraperConfig, novel_base_dir: Path = None) -> 'Novel':
|
100
|
+
"""
|
101
|
+
Loads a novel from stored JSON data.
|
102
|
+
|
103
|
+
Args:
|
104
|
+
title (str): Title of the novel to load.
|
105
|
+
cfg (ScraperConfig): Scraper configuration.
|
106
|
+
novel_base_dir (Path, optional): Base directory for the novel data.
|
107
|
+
|
108
|
+
Returns:
|
109
|
+
Novel: A new Novel instance loaded from stored data.
|
110
|
+
|
111
|
+
Raises:
|
112
|
+
ValidationError: If the novel with the given title is not found.
|
113
|
+
"""
|
114
|
+
|
177
115
|
fm = FileManager(title, cfg.base_novels_dir, novel_base_dir, read_only=True)
|
178
116
|
raw = fm.load_novel_json()
|
179
117
|
if raw is None:
|
180
118
|
logger.debug(f'Novel "{title}" was not found.')
|
181
|
-
raise
|
182
|
-
novel =
|
183
|
-
novel.config = cfg
|
119
|
+
raise ValidationError(f'Novel "{title}" was not found.')
|
120
|
+
novel = cls.from_json(raw)
|
184
121
|
novel.set_config(cfg=cfg, novel_base_dir=novel_base_dir)
|
185
122
|
return novel
|
186
123
|
|
124
|
+
@classmethod
|
125
|
+
def new(cls, title: str, cfg: ScraperConfig, host: str = None, toc_html: str = None,
|
126
|
+
toc_main_url: str = None) -> 'Novel':
|
127
|
+
"""Creates a new Novel instance.
|
128
|
+
|
129
|
+
Args:
|
130
|
+
title: Title of the novel (required)
|
131
|
+
cfg: Scraper configuration (required)
|
132
|
+
host: Host URL for the novel content (optional)
|
133
|
+
toc_html: HTML content for the table of contents (optional)
|
134
|
+
toc_main_url: URL for the table of contents (optional)
|
135
|
+
|
136
|
+
Note:
|
137
|
+
- Either toc_html or toc_main_url must be provided
|
138
|
+
- If toc_main_url is provided, host will be extracted from it if not explicitly provided
|
139
|
+
- If toc_html is provided, host must be explicitly provided
|
140
|
+
|
141
|
+
Returns:
|
142
|
+
Novel: A new Novel instance
|
143
|
+
|
144
|
+
Raises:
|
145
|
+
ValidationError: If the title is empty, or if neither toc_html nor toc_main_url is provided
|
146
|
+
"""
|
147
|
+
if not title:
|
148
|
+
raise ValidationError("Title cannot be empty")
|
149
|
+
|
150
|
+
if not (toc_html or toc_main_url):
|
151
|
+
raise ValidationError("Either toc_html or toc_main_url must be provided")
|
152
|
+
|
153
|
+
if toc_html and not host:
|
154
|
+
raise ValidationError("When providing toc_html, host must be explicitly provided")
|
155
|
+
|
156
|
+
novel = cls(title=title, host=host, toc_main_url=toc_main_url)
|
157
|
+
# If toc_main_url is provided and the host isn't, extract host from URL
|
158
|
+
if toc_main_url and not host:
|
159
|
+
host = utils.obtain_host(toc_main_url)
|
160
|
+
novel.host = host
|
161
|
+
|
162
|
+
# If toc_html is provided, add it to the novel
|
163
|
+
if toc_html:
|
164
|
+
novel.add_toc_html(toc_html, host)
|
165
|
+
|
166
|
+
return novel
|
167
|
+
|
187
168
|
# NOVEL PARAMETERS MANAGEMENT
|
188
169
|
|
189
170
|
def set_config(self,
|
190
|
-
cfg: ScraperConfig
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
171
|
+
cfg: ScraperConfig,
|
172
|
+
novel_base_dir: str | None = None) -> None:
|
173
|
+
"""
|
174
|
+
Configures the novel with the provided scraper configuration and base directory.
|
175
|
+
|
176
|
+
Sets up the file manager and decoder for the novel based on the provided configuration.
|
177
|
+
|
178
|
+
Args:
|
179
|
+
cfg (ScraperConfig): The scraper configuration to use.
|
180
|
+
novel_base_dir (str | None, optional): Base directory for the novel files.
|
181
|
+
If None, it uses the default directory from configuration.
|
201
182
|
|
202
|
-
|
203
|
-
|
204
|
-
|
183
|
+
Raises:
|
184
|
+
FileManagerError: If there's an error when reading the config or decoding guide files.
|
185
|
+
"""
|
186
|
+
|
187
|
+
try:
|
188
|
+
self.config = cfg
|
189
|
+
self.file_manager = FileManager(title=self.title,
|
190
|
+
base_novels_dir=self.config.base_novels_dir,
|
191
|
+
novel_base_dir=novel_base_dir)
|
192
|
+
self.decoder = Decoder(self.host, self.config.decode_guide_file)
|
193
|
+
except FileManagerError as e:
|
194
|
+
logger.error("Could not set configuration. File Manager Error", exc_info=e)
|
195
|
+
raise
|
196
|
+
|
197
|
+
def set_scraper_behavior(self, **kwargs) -> None:
|
198
|
+
"""
|
199
|
+
Updates the scraper behavior configuration with the provided parameters.
|
205
200
|
|
206
|
-
|
201
|
+
Args:
|
202
|
+
**kwargs: Keyword arguments for updating scraper behavior settings.
|
203
|
+
Can include any valid ScraperBehavior attributes.
|
204
|
+
"""
|
207
205
|
|
208
|
-
|
209
|
-
self.scraper_behavior.
|
206
|
+
filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
207
|
+
self.scraper_behavior = replace(self.scraper_behavior, **filtered_kwargs)
|
208
|
+
logger.info(f'Scraper behavior updated')
|
210
209
|
|
211
210
|
def set_metadata(self, **kwargs) -> None:
|
212
|
-
|
211
|
+
"""
|
212
|
+
Updates the novel's metadata with the provided parameters.
|
213
|
+
|
214
|
+
Args:
|
215
|
+
**kwargs: Keyword arguments for updating metadata.
|
216
|
+
Can include any valid Metadata attributes like author, language, etc.
|
217
|
+
"""
|
218
|
+
filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
219
|
+
self.metadata = replace(self.metadata, **filtered_kwargs)
|
220
|
+
logger.info(f'Metadata updated')
|
221
|
+
|
222
|
+
def add_tag(self, tag: str) -> None:
|
223
|
+
"""
|
224
|
+
Adds a new tag to the novel's metadata if it doesn't already exist.
|
225
|
+
|
226
|
+
Args:
|
227
|
+
tag (str): The tag to add to the novel's metadata.
|
228
|
+
"""
|
213
229
|
|
214
|
-
def add_tag(self, tag: str) -> bool:
|
215
230
|
if tag not in self.metadata.tags:
|
216
|
-
self.metadata
|
217
|
-
|
218
|
-
|
219
|
-
|
231
|
+
self.metadata = replace(
|
232
|
+
self.metadata, tags=(*self.metadata.tags, tag)
|
233
|
+
)
|
234
|
+
logger.info('Tag %s added to metadata', tag)
|
235
|
+
else:
|
236
|
+
logger.debug("Tag %s already present in %s", tag, self.title)
|
237
|
+
|
238
|
+
def remove_tag(self, tag: str) -> None:
|
239
|
+
"""
|
240
|
+
Removes a tag from the novel's metadata if it exists.
|
241
|
+
|
242
|
+
Args:
|
243
|
+
tag (str): The tag to remove from the novel's metadata.
|
244
|
+
"""
|
220
245
|
|
221
|
-
def remove_tag(self, tag: str) -> bool:
|
222
246
|
if tag in self.metadata.tags:
|
223
|
-
self.metadata.
|
224
|
-
|
225
|
-
|
226
|
-
|
247
|
+
self.metadata = replace(self.metadata,
|
248
|
+
tags=tuple(t for t in self.metadata.tags if t != tag))
|
249
|
+
logger.info('Tag %s removed from metadata', tag)
|
250
|
+
else:
|
251
|
+
logger.debug("Tag %s not present in %s", tag, self.title)
|
227
252
|
|
228
253
|
def set_cover_image(self, cover_image_path: str) -> None:
|
229
|
-
|
254
|
+
"""
|
255
|
+
Sets or updates the novel's cover image.
|
256
|
+
|
257
|
+
Args:
|
258
|
+
cover_image_path (str): Path to the cover image file.
|
259
|
+
|
260
|
+
Raises:
|
261
|
+
FileManagerError: If there's an error when saving the cover image.
|
262
|
+
"""
|
263
|
+
|
264
|
+
try:
|
265
|
+
self.file_manager.save_novel_cover(cover_image_path)
|
266
|
+
logger.info('Cover image updated')
|
267
|
+
except FileManagerError as e:
|
268
|
+
logger.error("Could not update cover. File Manager Error", exc_info=e)
|
269
|
+
raise
|
230
270
|
|
231
271
|
def set_host(self, host: str) -> None:
|
272
|
+
"""
|
273
|
+
Sets or updates the novel's host URL and modifies the decoder.
|
274
|
+
|
275
|
+
Args:
|
276
|
+
host (str): The host URL for the novel.
|
277
|
+
|
278
|
+
Raises:
|
279
|
+
DecodeError: If there's an error when setting up the decoder with the new host.
|
280
|
+
"""
|
281
|
+
|
232
282
|
self.host = host
|
233
|
-
|
283
|
+
try:
|
284
|
+
self.decoder.set_host(host)
|
285
|
+
logger.info(f'Host updated to "{self.host}"')
|
286
|
+
except ValidationError as e:
|
287
|
+
logger.error("Could not set host. Decode Error", exc_info=e)
|
288
|
+
raise
|
289
|
+
|
290
|
+
def save_novel(self) -> None:
|
291
|
+
"""
|
292
|
+
Saves the current state of the novel to disk.
|
293
|
+
|
294
|
+
Persists all novel data including metadata, chapters, and configuration
|
295
|
+
to the novel's JSON file.
|
296
|
+
|
297
|
+
Raises:
|
298
|
+
FileManagerError: If there's an error when saving the novel data.
|
299
|
+
"""
|
234
300
|
|
235
|
-
|
236
|
-
|
301
|
+
try:
|
302
|
+
self.file_manager.save_novel_json(self.to_dict())
|
303
|
+
logger.info(f'Novel data saved to disk on file "{self.file_manager.novel_json_file}".')
|
304
|
+
except FileManagerError as e:
|
305
|
+
logger.error("Could not save novel. File Manager Error", exc_info=e)
|
306
|
+
raise
|
237
307
|
|
238
308
|
# TABLE OF CONTENTS MANAGEMENT
|
239
309
|
|
240
|
-
def set_toc_main_url(self, toc_main_url: str,
|
310
|
+
def set_toc_main_url(self, toc_main_url: str, update_host: bool = True) -> None:
|
311
|
+
"""
|
312
|
+
Sets the main URL for the table of contents and optionally updates the host.
|
313
|
+
|
314
|
+
Deletes any existing TOC files as they will be refreshed from the new URL.
|
315
|
+
If update_host is True, extracts and updates the host from the new URL.
|
316
|
+
|
317
|
+
Args:
|
318
|
+
toc_main_url: Main URL for the table of contents
|
319
|
+
update_host: Whether to update the host based on the URL (default: True)
|
320
|
+
|
321
|
+
Raises:
|
322
|
+
ValidationError: If host extraction fails
|
323
|
+
FileManagerError: If TOC deletion fails
|
324
|
+
"""
|
325
|
+
|
241
326
|
self.toc_main_url = toc_main_url
|
242
|
-
self.
|
243
|
-
|
244
|
-
self.
|
245
|
-
|
246
|
-
|
247
|
-
|
327
|
+
logger.info(f'Main URL updated to "{self.toc_main_url}", TOCs already requested will be deleted.')
|
328
|
+
try:
|
329
|
+
self.file_manager.delete_toc()
|
330
|
+
except FileManagerError as e:
|
331
|
+
logger.error("Could not delete TOCs. File Manager Error", exc_info=e)
|
332
|
+
raise
|
333
|
+
|
334
|
+
if update_host:
|
335
|
+
new_host = utils.obtain_host(self.toc_main_url)
|
336
|
+
logger.debug(f'Update Host flag present, new host is "{new_host}".')
|
337
|
+
self.set_host(new_host)
|
248
338
|
|
249
339
|
def add_toc_html(self, html: str, host: str = None) -> None:
|
340
|
+
"""
|
341
|
+
Adds HTML content as a table of contents fragment.
|
342
|
+
|
343
|
+
This method is mutually exclusive with using toc_main_url - if a main URL exists,
|
344
|
+
it will be cleared. Host must be provided either directly or from a previous configuration.
|
345
|
+
|
346
|
+
Args:
|
347
|
+
html: HTML content to add as TOC fragment
|
348
|
+
host: Optional host to set for this content
|
349
|
+
|
350
|
+
Raises:
|
351
|
+
ValidationError: If no host is provided when required
|
352
|
+
FileManagerError: If saving TOC content fails
|
353
|
+
"""
|
354
|
+
|
250
355
|
if self.toc_main_url:
|
356
|
+
logger.debug(f'TOC main URL is exclusive with manual TOC files, TOC main URL will be deleted.')
|
251
357
|
self.delete_toc()
|
252
358
|
self.toc_main_url = None
|
253
359
|
|
254
360
|
if host:
|
255
|
-
self.host
|
256
|
-
|
361
|
+
self.set_host(host)
|
362
|
+
else:
|
363
|
+
if self.host is None:
|
364
|
+
logger.error(f'When using TOC files instead of URLs, host must be provided.')
|
365
|
+
raise ValidationError('Host must be provided when using TOC files instead of URLs.')
|
257
366
|
self.file_manager.add_toc(html)
|
258
|
-
|
367
|
+
logger.info('New TOC file added to disk.')
|
259
368
|
|
260
369
|
def delete_toc(self):
|
370
|
+
"""
|
371
|
+
Deletes all table of contents files and resets chapter data.
|
372
|
+
|
373
|
+
Clears:
|
374
|
+
- All TOC files from disk
|
375
|
+
- Chapter list
|
376
|
+
- Chapter URL list
|
377
|
+
|
378
|
+
Raises:
|
379
|
+
FileManagerError: If deletion of TOC files fails
|
380
|
+
"""
|
381
|
+
|
261
382
|
self.file_manager.delete_toc()
|
262
383
|
self.chapters = []
|
263
384
|
self.chapters_url_list = []
|
385
|
+
logger.info('TOC files deleted from disk.')
|
386
|
+
|
387
|
+
def sync_toc(self, reload_files: bool = True) -> None:
|
388
|
+
"""
|
389
|
+
Synchronizes the table of contents with stored/remote content.
|
390
|
+
|
391
|
+
Process:
|
392
|
+
1. Checks if TOC content exists (stored or retrievable)
|
393
|
+
2. Optionally reloads TOC files from remote if needed
|
394
|
+
3. Extracts chapter URLs from TOC content
|
395
|
+
4. Creates/updates chapters based on URLs
|
396
|
+
|
397
|
+
Args:
|
398
|
+
reload_files: Whether to force reload of TOC files from remote (default: True)
|
399
|
+
|
400
|
+
Raises:
|
401
|
+
ScraperError: If no TOC content is available
|
402
|
+
FileManagerError: If file operations fail
|
403
|
+
DecodeError: If TOC parsing fails
|
404
|
+
NetworkError: If remote content retrieval fails
|
405
|
+
ValidationError: If chapter creation fails
|
406
|
+
"""
|
264
407
|
|
265
|
-
def sync_toc(self, reload_files: bool = False) -> bool:
|
266
|
-
# Hard reload will request again the toc files from the toc_main_url
|
267
|
-
# Only works with toc_main_url
|
268
408
|
all_tocs_content = self.file_manager.get_all_toc()
|
269
409
|
|
270
410
|
# If there is no toc_main_url and no manually added toc, there is no way to sync toc
|
@@ -272,59 +412,116 @@ class Novel:
|
|
272
412
|
if toc_not_exists:
|
273
413
|
logger.critical(
|
274
414
|
'There is no toc html and no toc url set, unable to get toc.')
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
415
|
+
raise ScraperError('There is no toc html and no toc url set, unable to get toc.')
|
416
|
+
|
417
|
+
# Will reload files if:
|
418
|
+
# Reload_files is True (requested by user) AND there is a toc_main_url present.
|
419
|
+
# OR
|
420
|
+
# There is a toc_main_url present, but no toc files are saved in the disk.
|
421
|
+
reload_files = ((reload_files or
|
422
|
+
all_tocs_content is None) or
|
423
|
+
self.toc_main_url is not None)
|
424
|
+
if reload_files:
|
425
|
+
logger.debug('Reloading TOC files.')
|
426
|
+
try:
|
427
|
+
self._request_toc_files()
|
428
|
+
except FileManagerError as e:
|
429
|
+
logger.error("Could not request TOC files. File Manager Error", exc_info=e)
|
430
|
+
raise
|
431
|
+
except DecodeError as e:
|
432
|
+
logger.error("Could not request TOC files. Decoder Error", exc_info=e)
|
433
|
+
raise
|
434
|
+
except NetworkError as e:
|
435
|
+
logger.error("Could not request TOC files. Network Error", exc_info=e)
|
436
|
+
raise
|
437
|
+
|
438
|
+
try:
|
439
|
+
self._load_or_request_chapter_urls_from_toc()
|
440
|
+
except DecodeError as e:
|
441
|
+
logger.error("Could not get chapter urls from TOC files. Decoder Error", exc_info=e)
|
442
|
+
raise
|
443
|
+
except FileManagerError as e:
|
444
|
+
logger.error("Could not get chapter urls from TOC files. File Manager Error", exc_info=e)
|
445
|
+
raise
|
446
|
+
|
447
|
+
try:
|
448
|
+
self._create_chapters_from_toc()
|
449
|
+
except ValidationError as e:
|
450
|
+
logger.error("Could not create chapters from TOC files. Validation Error", exc_info=e)
|
451
|
+
raise
|
452
|
+
logger.info('TOC synced with files, Chapters created from Table of Contents.')
|
453
|
+
|
454
|
+
def show_toc(self) -> Optional[str]:
|
455
|
+
"""
|
456
|
+
Generates a human-readable representation of the Table Of Contents.
|
290
457
|
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
chapters_url_from_toc_content = self.decoder.get_chapter_urls(toc_content)
|
295
|
-
if chapters_url_from_toc_content is None:
|
296
|
-
logger.error('Chapters url not found on toc_content')
|
297
|
-
return False
|
298
|
-
# First we save a list of lists in case we need to invert the orderAdd commentMore actions
|
299
|
-
self.chapters_url_list.append(chapters_url_from_toc_content)
|
300
|
-
|
301
|
-
invert = self.decoder.is_index_inverted()
|
302
|
-
self.chapters_url_list = [
|
303
|
-
chapter
|
304
|
-
for chapters_url in (self.chapters_url_list[::-1] if invert else self.chapters_url_list)
|
305
|
-
for chapter in chapters_url
|
306
|
-
]
|
307
|
-
add_host_to_chapter = self.scraper_behavior.auto_add_host or self.decoder.add_host_to_chapter()
|
308
|
-
if add_host_to_chapter:
|
309
|
-
self.chapters_url_list = [
|
310
|
-
f'https://{self.host}{chapter_url}' for chapter_url in self.chapters_url_list]
|
311
|
-
self.chapters_url_list = utils.delete_duplicates(
|
312
|
-
self.chapters_url_list)
|
313
|
-
self.save_novel()
|
314
|
-
self._create_chapters_from_toc()
|
315
|
-
return True
|
458
|
+
Returns:
|
459
|
+
Optional[str]: Formatted string showing chapter numbers and URLs, None if no chapters_urls found
|
460
|
+
"""
|
316
461
|
|
317
|
-
def show_toc(self):
|
318
462
|
if not self.chapters_url_list:
|
319
|
-
|
463
|
+
logger.warning('No chapters in TOC')
|
464
|
+
return None
|
320
465
|
toc_str = 'Table Of Contents:'
|
321
466
|
for i, chapter_url in enumerate(self.chapters_url_list):
|
322
|
-
toc_str += f'\nChapter {i+1}: {chapter_url}'
|
467
|
+
toc_str += f'\nChapter {i + 1}: {chapter_url}'
|
323
468
|
return toc_str
|
324
469
|
|
325
470
|
# CHAPTERS MANAGEMENT
|
326
471
|
|
472
|
+
def get_chapter(self, chapter_index: Optional[int] = None, chapter_url: Optional[str] = None) -> Optional[Chapter]:
|
473
|
+
"""
|
474
|
+
Retrieves a chapter either by its index in the chapter list or by its URL.
|
475
|
+
|
476
|
+
Args:
|
477
|
+
chapter_index (Optional[int]): The index of the chapter in the chapter list
|
478
|
+
chapter_url (Optional[str]): The URL of the chapter to retrieve
|
479
|
+
|
480
|
+
Returns:
|
481
|
+
Optional[Chapter]: The requested chapter if found, None otherwise
|
482
|
+
|
483
|
+
Raises:
|
484
|
+
ValidationError: If neither index nor url is provided, or if both are provided
|
485
|
+
IndexError: If the provided index is out of range
|
486
|
+
"""
|
487
|
+
if not utils.check_exclusive_params(chapter_index, chapter_url):
|
488
|
+
raise ValidationError("Exactly one of 'chapter_index' or 'chapter_url' must be provided")
|
489
|
+
|
490
|
+
if chapter_url is not None:
|
491
|
+
chapter_index = self._find_chapter_index_by_url(chapter_url)
|
492
|
+
|
493
|
+
if chapter_index is not None:
|
494
|
+
if chapter_index < 0:
|
495
|
+
raise ValueError("Index must be positive")
|
496
|
+
try:
|
497
|
+
return self.chapters[chapter_index]
|
498
|
+
except IndexError:
|
499
|
+
logger.warning(f"No chapter found at index {chapter_index}")
|
500
|
+
return None
|
501
|
+
logger.warning(f"No chapter found with url {chapter_url}")
|
502
|
+
return None
|
503
|
+
|
327
504
|
def show_chapters(self) -> str:
|
505
|
+
"""
|
506
|
+
Generates a text representation of all novel chapters.
|
507
|
+
|
508
|
+
Returns:
|
509
|
+
str: Formatted string containing the list of chapters with their information:
|
510
|
+
- Chapter number
|
511
|
+
- Title (if available)
|
512
|
+
- URL
|
513
|
+
- HTML filename (if available)
|
514
|
+
|
515
|
+
Note:
|
516
|
+
Output format is:
|
517
|
+
Chapters List:
|
518
|
+
Chapter 1:
|
519
|
+
Title: [title or message]
|
520
|
+
URL: [url]
|
521
|
+
Filename: [filename or message]
|
522
|
+
...
|
523
|
+
"""
|
524
|
+
|
328
525
|
chapter_list = "Chapters List:\n"
|
329
526
|
for i, chapter in enumerate(self.chapters):
|
330
527
|
chapter_list += f"Chapter {i + 1}:\n"
|
@@ -333,105 +530,166 @@ class Novel:
|
|
333
530
|
chapter_list += f" Filename: {chapter.chapter_html_filename if chapter.chapter_html_filename else 'File not yet requested'}\n"
|
334
531
|
return chapter_list
|
335
532
|
|
336
|
-
def scrap_chapter(self,
|
337
|
-
|
338
|
-
chapter
|
339
|
-
if not utils.check_exclusive_params(chapter_url, chapter_idx):
|
340
|
-
raise ValueError("chapter_url and chapter_id, only one needs to be set")
|
533
|
+
def scrap_chapter(self, chapter: Chapter, reload_file: bool = False) -> Chapter:
|
534
|
+
"""
|
535
|
+
Processes and decodes a specific chapter of the novel.
|
341
536
|
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
537
|
+
This method handles the complete scraping process for an individual chapter,
|
538
|
+
including HTML loading or requesting and content decoding.
|
539
|
+
|
540
|
+
Args:
|
541
|
+
chapter (Chapter): Chapter object to process
|
542
|
+
reload_file (bool, optional): If True, forces a new download of the chapter
|
543
|
+
even if it already exists locally. Defaults to False.
|
544
|
+
|
545
|
+
Returns:
|
546
|
+
Chapter: The updated Chapter object with decoded content
|
547
|
+
|
548
|
+
Raises:
|
549
|
+
ValidationError: If there are issues with the values of the provided Chapter object
|
550
|
+
DecodeError: If there are issues during content decoding
|
551
|
+
NetworkError: If there are issues during HTML request
|
552
|
+
FileManagerError: If there are issues during file operations
|
553
|
+
"""
|
554
|
+
|
555
|
+
logger.debug('Scraping Chapter...')
|
556
|
+
if chapter.chapter_url is None:
|
557
|
+
logger.error('Chapter trying to be scrapped does not have a URL')
|
558
|
+
raise ValidationError('Chapter trying to be scrapped does not have a URL')
|
559
|
+
|
560
|
+
logger.debug(f'Using chapter url: {chapter.chapter_url}')
|
561
|
+
|
562
|
+
if reload_file:
|
563
|
+
logger.debug('Reload file Flag present, HTML will be requested...')
|
564
|
+
|
565
|
+
try:
|
566
|
+
chapter = self._load_or_request_chapter(chapter,
|
567
|
+
reload_file=reload_file)
|
568
|
+
except ValidationError as e:
|
569
|
+
logger.error(f'Could get chapter for URL "{chapter.chapter_url}" HTML content. Validation Error',
|
570
|
+
exc_info=e)
|
571
|
+
raise
|
572
|
+
except FileManagerError as e:
|
573
|
+
logger.error(f'Could get chapter for URL "{chapter.chapter_url}" HTML content. File Manager Error',
|
574
|
+
exc_info=e)
|
575
|
+
raise
|
576
|
+
except NetworkError as e:
|
577
|
+
logger.error(f'Could get chapter for URL "{chapter.chapter_url}" HTML content. Network Error', exc_info=e)
|
578
|
+
raise
|
579
|
+
|
580
|
+
if not chapter.chapter_html:
|
581
|
+
logger.error(f'Could not get HTML content for chapter with URL "{chapter.chapter_url}"')
|
582
|
+
raise ScraperError(f'Could not get HTML content for chapter with URL "{chapter.chapter_url}"')
|
367
583
|
|
368
584
|
# We get the chapter title and content
|
369
585
|
# We pass an index so we can autogenerate a Title
|
370
|
-
|
371
|
-
|
372
|
-
|
586
|
+
save_title_to_content = (self.scraper_behavior.save_title_to_content or
|
587
|
+
self.decoder.save_title_to_content())
|
588
|
+
try:
|
589
|
+
chapter = self._decode_chapter(chapter=chapter,
|
590
|
+
save_title_to_content=save_title_to_content)
|
591
|
+
except DecodeError as e:
|
592
|
+
logger.error(f'Could not decode HTML title and content for chapter with URL "{chapter.chapter_url}"',
|
593
|
+
exc_info=e)
|
594
|
+
raise
|
595
|
+
except ValidationError as e:
|
596
|
+
logger.error(f'Could not decode HTML title and content for chapter with URL "{chapter.chapter_url}"',
|
597
|
+
exc_info=e)
|
598
|
+
raise
|
599
|
+
|
600
|
+
logger.info(f'Chapter scrapped from link: {chapter.chapter_url}')
|
373
601
|
return chapter
|
374
602
|
|
375
|
-
def
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
603
|
+
def request_all_chapters(self,
|
604
|
+
sync_toc: bool = True,
|
605
|
+
reload_files: bool = False,
|
606
|
+
clean_chapters: bool = False) -> None:
|
607
|
+
"""
|
608
|
+
Requests and processes all chapters of the novel.
|
609
|
+
|
610
|
+
This method performs scraping of all available chapters in the novel,
|
611
|
+
handling the loading and decoding of each one.
|
612
|
+
|
613
|
+
Args:
|
614
|
+
sync_toc (bool, optional): If True, syncs the table of contents
|
615
|
+
reload_files (bool, optional): If True, forces a new download of all
|
616
|
+
chapters, even if they already exist locally. Defaults to False.
|
617
|
+
clean_chapters (bool, optional): If True, cleans the HTML content of the files
|
618
|
+
|
619
|
+
Raises:
|
620
|
+
FileManagerError: If there are issues during file operations
|
621
|
+
DecodeError: If there are issues during content decoding
|
622
|
+
ValidationError: If there are issues during content decoding
|
623
|
+
|
624
|
+
Note:
|
625
|
+
- Process is performed sequentially for each chapter
|
626
|
+
- Errors in individual chapters don't stop the complete process
|
627
|
+
- Progress is logged through the logging system
|
628
|
+
"""
|
398
629
|
|
399
|
-
|
630
|
+
logger.debug('Requesting all chapters...')
|
400
631
|
if sync_toc:
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
632
|
+
logger.debug('Sync TOC flag present, syncing TOC...')
|
633
|
+
try:
|
634
|
+
self.sync_toc(reload_files=False)
|
635
|
+
except ScraperError:
|
636
|
+
logger.warning('Error when trying to sync TOC, continuing without syncing...')
|
637
|
+
|
638
|
+
if len(self.chapters_url_list) == 0:
|
639
|
+
logger.warning('No chapters in TOC, returning without requesting any...')
|
640
|
+
return None
|
641
|
+
|
642
|
+
# We request the HTML files of all the chapters
|
643
|
+
# The chapter will be requested again if:
|
644
|
+
# 1. Reload files flag is True (Requested by user)
|
645
|
+
chapters_obtained = 0
|
646
|
+
total_chapters = len(self.chapters)
|
647
|
+
for i in range(len(self.chapters)):
|
648
|
+
logger.info(f'Requesting chapter {i + 1} of {total_chapters}')
|
649
|
+
try:
|
650
|
+
self.chapters[i] = self._load_or_request_chapter(chapter=self.chapters[i],
|
651
|
+
reload_file=reload_files)
|
652
|
+
except FileManagerError:
|
653
|
+
logger.warning(f'Error requesting chapter {i + 1} with url {self.chapters[i].chapter_url}, Skipping...')
|
654
|
+
continue
|
655
|
+
except ValidationError:
|
656
|
+
logger.warning(f'Error validating chapter {i + 1} with url {self.chapters[i].chapter_url}, Skipping...')
|
657
|
+
continue
|
658
|
+
|
659
|
+
if not self.chapters[i].chapter_html:
|
660
|
+
logger.warning(f'Error requesting chapter {i + 1} with url {self.chapters[i].chapter_url}')
|
661
|
+
continue
|
662
|
+
|
663
|
+
if clean_chapters:
|
664
|
+
self._clean_chapter(self.chapters[i].chapter_html_filename)
|
665
|
+
self.save_novel()
|
666
|
+
chapters_obtained += 1
|
667
|
+
logger.info(f'Successfully requested {chapters_obtained} of {total_chapters} chapters.')
|
668
|
+
return None
|
421
669
|
|
422
|
-
# EPUB CREATION
|
670
|
+
# EPUB CREATION
|
423
671
|
|
424
672
|
def save_novel_to_epub(self,
|
425
673
|
sync_toc: bool = False,
|
426
674
|
start_chapter: int = 1,
|
427
675
|
end_chapter: int = None,
|
428
676
|
chapters_by_book: int = 100) -> None:
|
677
|
+
logger.debug('Saving novel to epub...')
|
429
678
|
if sync_toc:
|
430
|
-
|
679
|
+
logger.debug('Sync TOC flag present, syncing TOC...')
|
680
|
+
try:
|
681
|
+
self.sync_toc(reload_files=False)
|
682
|
+
except ScraperError:
|
683
|
+
logger.warning('Error when trying to sync TOC, continuing without syncing...')
|
684
|
+
|
685
|
+
if start_chapter < 1:
|
686
|
+
logger.error('Start chapter is invalid.')
|
687
|
+
raise ValidationError('Start chapter is invalid.')
|
431
688
|
|
432
689
|
if start_chapter > len(self.chapters):
|
433
|
-
logger.
|
434
|
-
|
690
|
+
logger.error(f'The start chapter is bigger than the number of chapters saved ({len(self.chapters)})')
|
691
|
+
raise ValidationError(
|
692
|
+
f'The start chapter is bigger than the number of chapters saved ({len(self.chapters)})')
|
435
693
|
|
436
694
|
if not end_chapter:
|
437
695
|
end_chapter = len(self.chapters)
|
@@ -443,22 +701,19 @@ class Novel:
|
|
443
701
|
idx = 1
|
444
702
|
start = start_chapter
|
445
703
|
while start <= end_chapter:
|
446
|
-
end = min(start + chapters_by_book - 1,
|
704
|
+
end = min(start + chapters_by_book - 1,
|
705
|
+
end_chapter)
|
447
706
|
result = self._save_chapters_to_epub(start_chapter=start,
|
448
707
|
end_chapter=end,
|
449
708
|
collection_idx=idx)
|
450
709
|
if not result:
|
451
710
|
logger.critical(f'Error with saving novel to epub, with start chapter: '
|
452
711
|
f'{start_chapter} and end chapter: {end_chapter}')
|
453
|
-
return False
|
454
712
|
start = start + chapters_by_book
|
455
713
|
idx = idx + 1
|
456
|
-
return True
|
457
|
-
|
458
714
|
|
459
715
|
## UTILS
|
460
716
|
|
461
|
-
|
462
717
|
def clean_files(self, clean_chapters: bool = True, clean_toc: bool = True, hard_clean: bool = False) -> None:
|
463
718
|
hard_clean = hard_clean or self.scraper_behavior.hard_clean
|
464
719
|
if clean_chapters:
|
@@ -470,8 +725,7 @@ class Novel:
|
|
470
725
|
self._clean_toc(hard_clean)
|
471
726
|
|
472
727
|
def show_novel_dir(self) -> str:
|
473
|
-
return self.file_manager.novel_base_dir
|
474
|
-
|
728
|
+
return str(self.file_manager.novel_base_dir)
|
475
729
|
|
476
730
|
## PRIVATE HELPERS
|
477
731
|
|
@@ -492,9 +746,25 @@ class Novel:
|
|
492
746
|
tocs_content = self.file_manager.get_all_toc()
|
493
747
|
for i, toc in enumerate(tocs_content):
|
494
748
|
toc = self.decoder.clean_html(toc, hard_clean=hard_clean)
|
495
|
-
self.file_manager.update_toc(
|
749
|
+
self.file_manager.update_toc(idx=i,
|
750
|
+
html=toc)
|
496
751
|
|
497
752
|
def _request_html_content(self, url: str) -> Optional[str]:
|
753
|
+
"""
|
754
|
+
Performs an HTTP request to retrieve HTML content from a URL.
|
755
|
+
|
756
|
+
Args:
|
757
|
+
url (str): The URL of the webpage to request
|
758
|
+
|
759
|
+
Returns:
|
760
|
+
Optional[str]: The HTML content of the webpage if the request is successful,
|
761
|
+
None otherwise
|
762
|
+
|
763
|
+
Note:
|
764
|
+
This method uses the decoder configuration and scraper behavior
|
765
|
+
to handle HTTP requests, including retries and timeouts.
|
766
|
+
"""
|
767
|
+
|
498
768
|
request_config = self.decoder.request_config
|
499
769
|
force_flaresolver = request_config.get('force_flaresolver') or self.scraper_behavior.force_flaresolver
|
500
770
|
html_content = get_html_content(url,
|
@@ -504,135 +774,331 @@ class Novel:
|
|
504
774
|
force_flaresolver=force_flaresolver)
|
505
775
|
return html_content
|
506
776
|
|
507
|
-
def
|
508
|
-
|
509
|
-
|
777
|
+
def _load_or_request_chapter(self,
|
778
|
+
chapter: Chapter,
|
779
|
+
reload_file: bool = False) -> Chapter:
|
780
|
+
"""
|
781
|
+
Loads or requests a chapter's HTML content from a local file or a URL.
|
782
|
+
|
783
|
+
This method first attempts to load the chapter content from a local file.
|
784
|
+
If not possible or if reload is requested, it fetches the content from the web.
|
785
|
+
|
786
|
+
Args:
|
787
|
+
chapter (Chapter): Chapter object containing chapter information.
|
788
|
+
reload_file (bool, optional): If True, forces a new web request
|
789
|
+
regardless of local file existence. Defaults to False.
|
790
|
+
|
791
|
+
Returns:
|
792
|
+
Chapter: The Chapter object updated with HTML content.
|
793
|
+
|
794
|
+
Raises:
|
795
|
+
FileManagerError: If there's an error loading or saving the chapter file.
|
796
|
+
ValidationError: If there's a validation error when requesting the chapter.
|
797
|
+
NetworkError: If there's a network error when requesting the chapter.
|
798
|
+
|
799
|
+
Note:
|
800
|
+
- If the file doesn't exist locally, a web request will be made.
|
801
|
+
- If the file exists but is empty, a web request will be made.
|
802
|
+
- File saving errors are logged as warnings but don't stop execution.
|
803
|
+
"""
|
510
804
|
|
511
|
-
# Generate filename if needed
|
805
|
+
# Generate a filename if needed
|
512
806
|
if not chapter.chapter_html_filename:
|
807
|
+
logger.debug('Generating a filename for the chapter')
|
513
808
|
chapter.chapter_html_filename = utils.generate_file_name_from_url(
|
514
809
|
chapter.chapter_url)
|
515
810
|
|
516
|
-
#
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
811
|
+
# The HTML will be requested again if:
|
812
|
+
# 1. "Reload file" flag is True (requested by user)
|
813
|
+
# 2. Chapter file does not exist
|
814
|
+
# 3. The Chapter file does exist, but there is no content
|
815
|
+
reload_file = reload_file or not self.file_manager.chapter_file_exists(chapter.chapter_html_filename)
|
816
|
+
# Try loading from the disk first
|
817
|
+
if not reload_file:
|
818
|
+
try:
|
819
|
+
logger.debug(f'Loading chapter HTML from file: "{chapter.chapter_html_filename}"')
|
820
|
+
chapter.chapter_html = self.file_manager.load_chapter_html(chapter.chapter_html_filename)
|
821
|
+
except FileManagerError as e:
|
822
|
+
logger.error(f'Error when trying to load chapter {chapter.chapter_title} from file', exc_info=e)
|
823
|
+
raise
|
824
|
+
if chapter.chapter_html is not None:
|
521
825
|
return chapter
|
522
826
|
|
523
827
|
# Fetch fresh content
|
524
|
-
|
828
|
+
try:
|
829
|
+
logger.debug(f'Requesting chapter HTML from URL: "{chapter.chapter_url}"')
|
830
|
+
chapter.chapter_html = self._request_html_content(chapter.chapter_url)
|
831
|
+
except ValidationError:
|
832
|
+
logger.error(
|
833
|
+
f'Error when trying to request chapter {chapter.chapter_title} from url: {chapter.chapter_url}')
|
834
|
+
raise
|
835
|
+
except NetworkError:
|
836
|
+
logger.error(
|
837
|
+
f'Error when trying to request chapter {chapter.chapter_title} from url: {chapter.chapter_url}')
|
838
|
+
raise
|
839
|
+
|
840
|
+
# If the requests failed, we will let the higher methods decide if they throw an error.
|
525
841
|
if not chapter.chapter_html:
|
526
842
|
logger.error(f'No content found on link {chapter.chapter_url}')
|
527
843
|
return chapter
|
528
844
|
|
529
845
|
# Save content
|
530
|
-
|
531
|
-
|
846
|
+
try:
|
847
|
+
logger.info(f'Saving chapter HTML to file: "{chapter.chapter_html_filename}"')
|
848
|
+
self.file_manager.save_chapter_html(chapter.chapter_html_filename,
|
849
|
+
chapter.chapter_html)
|
850
|
+
except FileManagerError as e:
|
851
|
+
# We can pass this error and try again later
|
852
|
+
logger.warning(f'Error when trying to save chapter {chapter.chapter_title} to file', exc_info=e)
|
853
|
+
|
532
854
|
return chapter
|
533
855
|
|
534
|
-
def
|
535
|
-
|
536
|
-
|
537
|
-
reload: bool = False):
|
538
|
-
if not reload:
|
539
|
-
content = self.file_manager.get_toc(toc_filename)
|
540
|
-
if content:
|
541
|
-
return content
|
856
|
+
def _request_toc_files(self):
|
857
|
+
"""
|
858
|
+
Requests and stores all table of contents (TOC) files from the novel's website.
|
542
859
|
|
543
|
-
|
544
|
-
|
860
|
+
This method handles both paginated and non-paginated TOCs:
|
861
|
+
- For non-paginated TOCs: Downloads and stores a single TOC file
|
862
|
+
- For paginated TOCs: Iteratively downloads all TOC pages until no next page is found
|
545
863
|
|
546
|
-
|
547
|
-
content = self._request_html_content(url)
|
548
|
-
if not content:
|
549
|
-
logger.warning(f'No content found on link {url}')
|
550
|
-
sys.exit(1)
|
864
|
+
The method first clears any existing TOC files before downloading new ones.
|
551
865
|
|
552
|
-
|
553
|
-
|
866
|
+
Raises:
|
867
|
+
NetworkError: If there's an error during the HTTP request
|
868
|
+
ValidationError: If no content is found at the TOC URL
|
869
|
+
DecodeError: If there's an error parsing the next page URL
|
870
|
+
|
871
|
+
Note:
|
872
|
+
This is an internal method that uses the decoder configuration to determine
|
873
|
+
pagination behavior and to parse TOC content.
|
874
|
+
"""
|
875
|
+
|
876
|
+
def _get_toc(toc_url: str, get_next_page: bool) -> str | None:
|
877
|
+
# Some TOCs next page links have incomplete URLS (e.g., /page/2)
|
878
|
+
if utils.check_incomplete_url(toc_url):
|
879
|
+
toc_url = self.toc_main_url + toc_url
|
880
|
+
logger.debug(f'Toc link is incomplete, trying with toc link: "{toc_url}"')
|
881
|
+
|
882
|
+
# Fetch fresh content
|
883
|
+
logger.debug(f'Requesting TOC from link: "{toc_url}"')
|
884
|
+
try:
|
885
|
+
toc_content = self._request_html_content(toc_url)
|
886
|
+
except NetworkError as E:
|
887
|
+
logger.error(f'Error with network, error: {E}')
|
888
|
+
raise
|
889
|
+
|
890
|
+
if not toc_content:
|
891
|
+
logger.error(f'No content found on link "{toc_url}"')
|
892
|
+
raise ValidationError(f'No content found on link "{toc_url}"')
|
893
|
+
|
894
|
+
logger.debug('Saving new TOC file to disk.')
|
895
|
+
self.file_manager.add_toc(toc_content)
|
896
|
+
|
897
|
+
if get_next_page:
|
898
|
+
try:
|
899
|
+
logger.debug(f'Parsing next page from link: {toc_url}')
|
900
|
+
next_page = self.decoder.get_toc_next_page_url(toc_content)
|
901
|
+
except DecodeError:
|
902
|
+
raise
|
903
|
+
return next_page
|
904
|
+
return None
|
554
905
|
|
555
|
-
|
556
|
-
|
557
|
-
|
906
|
+
self.file_manager.delete_toc()
|
907
|
+
has_pagination = self.decoder.has_pagination()
|
908
|
+
|
909
|
+
if not has_pagination:
|
910
|
+
logger.debug('TOC does not have pagination, requesting only one file.')
|
911
|
+
_get_toc(self.toc_main_url, get_next_page=False)
|
558
912
|
else:
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
913
|
+
logger.debug('TOC has pagination, requesting all files.')
|
914
|
+
next_page_url = self.toc_main_url
|
915
|
+
while next_page_url:
|
916
|
+
next_page_url = _get_toc(next_page_url, get_next_page=True)
|
917
|
+
|
918
|
+
def _load_or_request_chapter_urls_from_toc(self) -> None:
|
919
|
+
"""
|
920
|
+
Extracts and processes chapter URLs from the table of contents.
|
921
|
+
|
922
|
+
Raises:
|
923
|
+
DecodeError: If fails to decode chapter URLs from TOC content
|
924
|
+
"""
|
925
|
+
# Get configuration
|
926
|
+
is_inverted = self.decoder.is_index_inverted()
|
927
|
+
add_host_to_chapter = self.scraper_behavior.auto_add_host or self.decoder.add_host_to_chapter()
|
928
|
+
|
929
|
+
# Get all TOC content at once
|
930
|
+
try:
|
931
|
+
all_tocs = self.file_manager.get_all_toc()
|
932
|
+
except FileManagerError:
|
933
|
+
logger.error('Error when trying to load TOC files from disk.')
|
934
|
+
raise
|
935
|
+
|
936
|
+
# Extract URLs from all TOC fragments
|
937
|
+
self.chapters_url_list = []
|
938
|
+
for toc_content in all_tocs:
|
939
|
+
try:
|
940
|
+
urls = self.decoder.get_chapter_urls(toc_content)
|
941
|
+
self.chapters_url_list.extend(urls) # More efficient than creating intermediate lists
|
942
|
+
except DecodeError as e:
|
943
|
+
logger.error('Failed to decode chapter URLs from TOC content', exc_info=e)
|
944
|
+
raise
|
945
|
+
|
946
|
+
# Handle inversion if needed
|
947
|
+
if is_inverted:
|
948
|
+
logger.debug('Inverting chapter URLs order')
|
949
|
+
self.chapters_url_list.reverse() # In-place reversal is more efficient
|
950
|
+
|
951
|
+
# Add host if needed
|
952
|
+
if add_host_to_chapter:
|
953
|
+
logger.debug('Adding host to chapter URLs')
|
954
|
+
self.chapters_url_list = [f'https://{self.host}{url}' for url in self.chapters_url_list]
|
955
|
+
|
956
|
+
# Remove duplicates while preserving order
|
957
|
+
# self.chapters_url_list = utils.delete_duplicates(self.chapters_url_list)
|
958
|
+
|
959
|
+
logger.info(f'Successfully extracted {len(self.chapters_url_list)} unique chapter URLs')
|
960
|
+
|
961
|
+
def _create_chapters_from_toc(self):
|
962
|
+
"""
|
963
|
+
Synchronizes existing chapters with the table of contents (TOC) URL list.
|
964
|
+
|
965
|
+
This method performs the following operations:
|
966
|
+
1. Removes chapters whose URLs are no longer in the TOC
|
967
|
+
2. Adds new chapters for URLs found in the TOC
|
968
|
+
3. Reorders chapters according to the TOC sequence
|
969
|
+
|
970
|
+
Raises:
|
971
|
+
ValidationError: If there's an error when creating a new chapter
|
972
|
+
|
973
|
+
Note:
|
974
|
+
This is an internal method used to maintain consistency
|
975
|
+
between chapters and the table of contents.
|
976
|
+
"""
|
573
977
|
|
574
|
-
|
978
|
+
existing_urls = {chapter.chapter_url for chapter in self.chapters}
|
979
|
+
toc_urls_set = set(self.chapters_url_list)
|
980
|
+
|
981
|
+
# Find chapters to remove and new chapters to add
|
982
|
+
urls_to_remove = existing_urls - toc_urls_set
|
983
|
+
urls_to_add = toc_urls_set - existing_urls
|
984
|
+
|
985
|
+
if urls_to_remove:
|
986
|
+
logger.info(f'Removing {len(urls_to_remove)} chapters not found in TOC')
|
987
|
+
self.chapters = [ch for ch in self.chapters if ch.chapter_url not in urls_to_remove]
|
988
|
+
|
989
|
+
if urls_to_add:
|
990
|
+
logger.info(f'Adding {len(urls_to_add)} new chapters from TOC')
|
991
|
+
for url in self.chapters_url_list:
|
992
|
+
if url in urls_to_add:
|
993
|
+
try:
|
994
|
+
new_chapter = Chapter(chapter_url=url)
|
995
|
+
self.chapters.append(new_chapter)
|
996
|
+
except ValidationError as e:
|
997
|
+
logger.error(f'Failed to create chapter for URL {url}: {e}')
|
998
|
+
raise
|
999
|
+
|
1000
|
+
# Reorder according to TOC
|
1001
|
+
logger.debug('Reordering chapters according to TOC')
|
575
1002
|
self.chapters.sort(
|
576
1003
|
key=lambda x: self.chapters_url_list.index(x.chapter_url))
|
577
1004
|
|
578
|
-
|
579
|
-
for chapter in self.chapters:
|
580
|
-
if chapter_url == chapter.chapter_url:
|
581
|
-
return chapter
|
582
|
-
return None
|
1005
|
+
logger.info(f'Chapter synchronization complete. Total chapters: {len(self.chapters)}')
|
583
1006
|
|
584
|
-
def
|
585
|
-
for index, chapter in enumerate(self.chapters):
|
586
|
-
if chapter.chapter_url == chapter_url:
|
587
|
-
return index
|
588
|
-
return None
|
1007
|
+
def _add_or_update_chapter_data(self, chapter: Chapter, save_in_file: bool = True) -> None:
|
589
1008
|
|
590
|
-
|
591
|
-
|
592
|
-
|
1009
|
+
# Check if the chapter exists
|
1010
|
+
chapter_idx = self._find_chapter_index_by_url(chapter.chapter_url)
|
1011
|
+
if chapter_idx is None:
|
1012
|
+
# If no existing chapter, we append it
|
1013
|
+
self.chapters.append(chapter)
|
1014
|
+
else:
|
1015
|
+
if chapter.chapter_title:
|
1016
|
+
self.chapters[chapter_idx].chapter_title = chapter.chapter_title
|
1017
|
+
if chapter.chapter_html_filename:
|
1018
|
+
self.chapters[chapter_idx].chapter_html_filename = chapter.chapter_html_filename
|
593
1019
|
|
594
|
-
|
595
|
-
|
596
|
-
increment = 100
|
597
|
-
aux = 1
|
598
|
-
for chapter_url in self.chapters_url_list:
|
599
|
-
aux += 1
|
600
|
-
chapter_idx = self._find_chapter_index_by_link(chapter_url)
|
601
|
-
if not chapter_idx:
|
602
|
-
chapter = Chapter(chapter_url=chapter_url)
|
603
|
-
self._add_or_update_chapter_data(
|
604
|
-
chapter=chapter, save_in_file=False)
|
605
|
-
if aux == increment:
|
606
|
-
self.save_novel()
|
607
|
-
aux = 1
|
608
|
-
self._order_chapters_by_link_list()
|
609
|
-
self.save_novel()
|
1020
|
+
if save_in_file:
|
1021
|
+
self.save_novel()
|
610
1022
|
|
611
|
-
def
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
1023
|
+
def _find_chapter_index_by_url(self, chapter_url: str) -> Optional[int]:
|
1024
|
+
"""
|
1025
|
+
Find the chapter index by its URL in the chapter list.
|
1026
|
+
|
1027
|
+
Args:
|
1028
|
+
chapter_url: URL of the chapter to find
|
1029
|
+
|
1030
|
+
Returns:
|
1031
|
+
Optional[int]: Index of the chapter if found, None otherwise
|
1032
|
+
|
1033
|
+
Note:
|
1034
|
+
Uses next() for efficient iteration - stops as soon as a match is found
|
1035
|
+
"""
|
1036
|
+
try:
|
1037
|
+
return next(i for i, ch in enumerate(self.chapters)
|
1038
|
+
if ch.chapter_url == chapter_url)
|
1039
|
+
except StopIteration:
|
1040
|
+
return None
|
1041
|
+
|
1042
|
+
def _decode_chapter(self,
|
1043
|
+
chapter: Chapter,
|
1044
|
+
save_title_to_content: bool = False) -> Chapter:
|
1045
|
+
"""
|
1046
|
+
Decodes a chapter's HTML content to extract title and content.
|
1047
|
+
|
1048
|
+
This method processes the HTML content of a chapter to extract its title and content.
|
1049
|
+
If no title is found, it auto-generates one using the chapter's index in the URL list.
|
1050
|
+
|
1051
|
+
Args:
|
1052
|
+
chapter (Chapter): Chapter object containing the HTML content to decode.
|
1053
|
+
save_title_to_content (bool, optional): Whether to include the title in the
|
1054
|
+
chapter content. Defaults to False.
|
1055
|
+
|
1056
|
+
Returns:
|
1057
|
+
Chapter: The updated Chapter object with decoded title and content.
|
616
1058
|
|
617
|
-
|
618
|
-
|
619
|
-
|
1059
|
+
Raises:
|
1060
|
+
ScraperError: If the chapter's HTML content is None.
|
1061
|
+
DecodeError: If there's an error decoding the chapter's title or content.
|
1062
|
+
|
1063
|
+
Note:
|
1064
|
+
- If no title is found, it will be auto-generated as "{novel_title} Chapter {index}".
|
1065
|
+
- The chapter's HTML must be loaded before calling this method.
|
1066
|
+
"""
|
1067
|
+
|
1068
|
+
logger.debug(f'Decoding chapter with URL {chapter.chapter_url}...')
|
1069
|
+
if chapter.chapter_html is None:
|
1070
|
+
logger.error(f'Chapter HTML not found for chapter with URL "{chapter.chapter_url}"')
|
1071
|
+
raise ScraperError(f'Chapter HTML not found for chapter with URL "{chapter.chapter_url}"')
|
620
1072
|
|
621
1073
|
logger.debug('Obtaining chapter title...')
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
logger.debug(f'Chapter title: "{chapter_title}"')
|
1074
|
+
try:
|
1075
|
+
chapter_title = self.decoder.get_chapter_title(chapter.chapter_html)
|
1076
|
+
except DecodeError as e:
|
1077
|
+
logger.error(f'Failed to decode chapter title from HTML content: {e}')
|
1078
|
+
raise
|
628
1079
|
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
1080
|
+
if chapter_title is None:
|
1081
|
+
logger.debug('No chapter title found, trying to autogenerate one...')
|
1082
|
+
try:
|
1083
|
+
chapter_idx = self.chapters_url_list.index(chapter.chapter_url)
|
1084
|
+
except ValueError:
|
1085
|
+
chapter_idx = ""
|
1086
|
+
|
1087
|
+
chapter_title = f'{self.title} Chapter {chapter_idx}'
|
635
1088
|
|
1089
|
+
chapter.chapter_title = chapter_title
|
1090
|
+
logger.info(f'Chapter title: "{chapter_title}"')
|
1091
|
+
|
1092
|
+
logger.debug('Obtaining chapter content...')
|
1093
|
+
try:
|
1094
|
+
chapter.chapter_content = self.decoder.get_chapter_content(chapter.chapter_html,
|
1095
|
+
save_title_to_content,
|
1096
|
+
chapter.chapter_title)
|
1097
|
+
except DecodeError:
|
1098
|
+
logger.error(f'Failed to decode chapter content for chapter with URL "{chapter.chapter_url}"')
|
1099
|
+
raise
|
1100
|
+
|
1101
|
+
logger.debug('Chapter title and content successfully decoded from HTML')
|
636
1102
|
return chapter
|
637
1103
|
|
638
1104
|
def _create_epub_book(self, book_title: str = None, calibre_collection: dict = None) -> epub.EpubBook:
|
@@ -661,7 +1127,7 @@ class Novel:
|
|
661
1127
|
# date_metadata += f'/{self.metadata.end_date}'
|
662
1128
|
if self.metadata.end_date:
|
663
1129
|
book.add_metadata('OPF', 'meta', self.metadata.end_date, {
|
664
|
-
|
1130
|
+
'name': 'end_date', 'content': self.metadata.end_date})
|
665
1131
|
if date_metadata:
|
666
1132
|
logger.debug(f'Using date_metadata {date_metadata}')
|
667
1133
|
book.add_metadata('DC', 'date', date_metadata)
|
@@ -669,9 +1135,9 @@ class Novel:
|
|
669
1135
|
# Collections with calibre
|
670
1136
|
if calibre_collection:
|
671
1137
|
book.add_metadata('OPF', 'meta', '', {
|
672
|
-
|
1138
|
+
'name': 'calibre:series', 'content': calibre_collection["title"]})
|
673
1139
|
book.add_metadata('OPF', 'meta', '', {
|
674
|
-
|
1140
|
+
'name': 'calibre:series_index', 'content': calibre_collection["idx"]})
|
675
1141
|
|
676
1142
|
cover_image_content = self.file_manager.load_novel_cover()
|
677
1143
|
if cover_image_content:
|
@@ -682,11 +1148,10 @@ class Novel:
|
|
682
1148
|
return book
|
683
1149
|
|
684
1150
|
def _add_chapter_to_epub_book(self, chapter: Chapter, book: epub.EpubBook):
|
685
|
-
chapter = self.scrap_chapter(
|
686
|
-
chapter_url=chapter.chapter_url)
|
1151
|
+
chapter = self.scrap_chapter(chapter)
|
687
1152
|
if chapter is None:
|
688
1153
|
logger.warning('Error reading chapter')
|
689
|
-
return
|
1154
|
+
return None
|
690
1155
|
self._add_or_update_chapter_data(
|
691
1156
|
chapter=chapter, save_in_file=False)
|
692
1157
|
file_name = utils.generate_epub_file_name_from_title(
|
@@ -708,10 +1173,9 @@ class Novel:
|
|
708
1173
|
start_chapter: int,
|
709
1174
|
end_chapter: int = None,
|
710
1175
|
collection_idx: int = None):
|
711
|
-
|
712
1176
|
if start_chapter > len(self.chapters):
|
713
1177
|
logger.error('start_chapter out of range')
|
714
|
-
return
|
1178
|
+
return None
|
715
1179
|
# If end_chapter is not set, we set it to idx_start + chapters_num - 1
|
716
1180
|
if not end_chapter:
|
717
1181
|
end_chapter = len(self.chapters)
|
@@ -725,7 +1189,7 @@ class Novel:
|
|
725
1189
|
# We create the epub book
|
726
1190
|
book_title = f'{self.title} Chapters {start_chapter} - {end_chapter}'
|
727
1191
|
calibre_collection = None
|
728
|
-
# If collection_idx is set, we create a
|
1192
|
+
# If collection_idx is set, we create a Calibre collection
|
729
1193
|
if collection_idx:
|
730
1194
|
calibre_collection = {'title': self.title,
|
731
1195
|
'idx': str(collection_idx)}
|
@@ -735,11 +1199,16 @@ class Novel:
|
|
735
1199
|
book = self._add_chapter_to_epub_book(chapter=chapter,
|
736
1200
|
book=book)
|
737
1201
|
if book is None:
|
738
|
-
logger.critical(
|
1202
|
+
logger.critical(
|
1203
|
+
f'Error saving epub {book_title}, could not decode chapter {chapter} using host {self.host}')
|
739
1204
|
return False
|
740
1205
|
|
741
1206
|
book.add_item(epub.EpubNcx())
|
742
1207
|
book.add_item(epub.EpubNav())
|
743
|
-
|
1208
|
+
try:
|
1209
|
+
self.file_manager.save_book(book, f'{book_title}.epub')
|
1210
|
+
except FileManagerError:
|
1211
|
+
logger.error(f'Error saving epub {book_title}')
|
1212
|
+
raise
|
744
1213
|
self.save_novel()
|
745
1214
|
return True
|