web-novel-scraper 2.0.2__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- web_novel_scraper/__main__.py +123 -68
- web_novel_scraper/config_manager.py +12 -12
- web_novel_scraper/decode.py +225 -80
- web_novel_scraper/decode_guide/decode_guide.json +29 -0
- web_novel_scraper/file_manager.py +292 -110
- web_novel_scraper/models.py +76 -0
- web_novel_scraper/novel_scraper.py +895 -424
- web_novel_scraper/request_manager.py +50 -17
- web_novel_scraper/utils.py +22 -1
- web_novel_scraper/version.py +1 -1
- {web_novel_scraper-2.0.2.dist-info → web_novel_scraper-2.1.0.dist-info}/METADATA +1 -1
- web_novel_scraper-2.1.0.dist-info/RECORD +20 -0
- web_novel_scraper-2.0.2.dist-info/RECORD +0 -19
- {web_novel_scraper-2.0.2.dist-info → web_novel_scraper-2.1.0.dist-info}/WHEEL +0 -0
- {web_novel_scraper-2.0.2.dist-info → web_novel_scraper-2.1.0.dist-info}/entry_points.txt +0 -0
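The bulk of the release is a rewrite of `web_novel_scraper/novel_scraper.py`, shown below: the `Novel` class drops its hand-written `__init__` (which called `sys.exit(1)` on bad input) in favor of a `dataclasses_json` dataclass with `__post_init__` validation, moves `Metadata`, `ScraperBehavior`, and `Chapter` into the new `web_novel_scraper/models.py`, and gains `Novel.load()` / `Novel.new()` factory classmethods that raise typed exceptions (`ValidationError`, `ScraperError`, etc.) instead of exiting. A minimal usage sketch assembled from the signatures in this diff — the `ScraperConfig()` default construction is an assumption, and note that the released 2.1.0 code still contains stray `breakpoint()` calls in `Novel.new()` and `_create_epub_book()` (visible below):

```python
# Sketch only: signatures taken from the 2.1.0 diff below.
from web_novel_scraper.novel_scraper import Novel
from web_novel_scraper.config_manager import ScraperConfig

cfg = ScraperConfig()  # assumed default construction; not confirmed by this diff

# Create a novel from a TOC URL (the host is derived from the URL when omitted) ...
novel = Novel.new(title='My Novel', cfg=cfg,
                  toc_main_url='https://example.com/novel/toc')
novel.set_config(cfg=cfg)      # wires up the FileManager and Decoder
novel.sync_toc()               # fetch TOC pages and build the chapter URL list
novel.request_all_chapters()   # download each chapter's HTML, skipping failures
novel.save_novel_to_epub()     # write EPUB volumes of 100 chapters each

# ... or reload a previously saved novel (raises ValidationError if not found).
novel = Novel.load('My Novel', cfg)
```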
web_novel_scraper/novel_scraper.py
@@ -1,163 +1,86 @@
-from dataclasses import dataclass,
-import sys
+from dataclasses import dataclass, field, replace
 
-from dataclasses_json import dataclass_json,
+from dataclasses_json import dataclass_json, Undefined, config
 from ebooklib import epub
 from typing import Optional
+from pathlib import Path
 
 from . import logger_manager
 from .decode import Decoder
 from .file_manager import FileManager
 from . import utils
-
 from .request_manager import get_html_content
 from .config_manager import ScraperConfig
+from .models import ScraperBehavior, Metadata, Chapter
+from .utils import _always, ScraperError, FileManagerError, NetworkError, ValidationError, DecodeError
 
 logger = logger_manager.create_logger('NOVEL SCRAPPING')
 
 
-@dataclass_json
-@dataclass
-class Metadata:
-    author: Optional[str] = None
-    start_date: Optional[str] = None
-    end_date: Optional[str] = None
-    language: Optional[str] = "en"
-    description: Optional[str] = None
-    tags: list[str] = field(default_factory=list)
-
-    def update_behavior(self, **kwargs):
-        """
-        Updates the behavior configuration dynamically.
-        Only updates the attributes provided in kwargs.
-        """
-        for key, value in kwargs.items():
-            if hasattr(self, key) and value is not None:
-                setattr(self, key, value)
-
-    def __str__(self):
-        """
-        Dynamic string representation of the configuration.
-        """
-        attributes = [(f"{field.name}="
-                       f"{getattr(self, field.name)}") for field in fields(self)]
-        attributes_str = '\n'.join(attributes)
-        return (f"Metadata: \n"
-                f"{attributes_str}")
-
-
-@dataclass_json
-@dataclass
-class ScraperBehavior:
-    # Some novels already have the title in the content.
-    save_title_to_content: bool = False
-    # Some novels have the toc link without the host
-    auto_add_host: bool = False
-    # Some hosts return 403 when scrapping, this will force the use of FlareSolver
-    # to save time
-    force_flaresolver: bool = False
-    # When you clean the html files, you can use hard clean by default
-    hard_clean: bool = False
-
-    def update_behavior(self, **kwargs):
-        """
-        Updates the behavior configuration dynamically.
-        Only updates the attributes provided in kwargs.
-        """
-        for key, value in kwargs.items():
-            if hasattr(self, key) and value is not None:
-                setattr(self, key, value)
-
-    def __str__(self):
-        """
-        Dynamic string representation of the configuration.
-        """
-        attributes = [(f"{field.name}="
-                       f"{getattr(self, field.name)}") for field in fields(self)]
-        attributes_str = '\n'.join(attributes)
-        return (f"Scraper Behavior: \n"
-                f"{attributes_str}")
-
-
-@dataclass_json(undefined=Undefined.EXCLUDE)
-@dataclass
-class Chapter:
-    chapter_url: str
-    chapter_html_filename: Optional[str] = None
-    chapter_title: Optional[str] = None
-
-    def __init__(self,
-                 chapter_url: str,
-                 chapter_html: str = None,
-                 chapter_content: str = None,
-                 chapter_html_filename: str = None,
-                 chapter_title: str = None):
-        self.chapter_url = chapter_url
-        self.chapter_html = chapter_html
-        self.chapter_content = chapter_content
-        self.chapter_html_filename = chapter_html_filename
-        self.chapter_title = chapter_title
-
-    def __str__(self):
-        return f'Title: "{self.chapter_title}"\nURL: "{self.chapter_url}"\nFilename: "{self.chapter_html_filename}"'
-
-    def __lt__(self, another):
-        return self.chapter_title < another.chapter_title
-
-
 @dataclass_json(undefined=Undefined.EXCLUDE)
 @dataclass
 class Novel:
-
-
-
-
+    """
+    A class representing a web novel with its metadata and content.
+
+    This class handles all operations related to scraping, storing, and managing web novels,
+    including their chapters, table of contents, and metadata.
+
+    Attributes:
+        title (str): The title of the novel.
+        host (Optional[str]): The host domain where the novel is located.
+        toc_main_url (Optional[str]): The main URL for the table of contents.
+        chapters (list[Chapter]): List of chapters in the novel.
+        chapters_url_list (list[str]): List of URLs for all chapters.
+        metadata (Metadata): Novel metadata like author, language, etc.
+        scraper_behavior (ScraperBehavior): Configuration for scraping behavior.
+        file_manager (FileManager): Handles file operations for the novel.
+        decoder (Decoder): Handles HTML decoding and parsing.
+        config (ScraperConfig): General scraper configuration.
+    """
+
+    title: str
+    host: Optional[str] = None
     toc_main_url: Optional[str] = None
+    chapters: list[Chapter] = field(default_factory=list)
     chapters_url_list: list[str] = field(default_factory=list)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        self.metadata = metadata
-
-        if toc_html:
-            self.file_manager.add_toc(toc_html)
-
-        self.toc_main_url = toc_main_url
-        self.chapters_url_list = chapters_url_list if chapters_url_list else []
-
-        self.chapters = chapters if chapters else []
-
-        self.scraper_behavior = scraper_behavior if scraper_behavior else ScraperBehavior()
-        if not host and not toc_main_url:
-            logger.error('You need to set "host" or "toc_main_url".')
-            sys.exit(1)
+    metadata: Metadata = field(default_factory=Metadata)
+    scraper_behavior: ScraperBehavior = field(default_factory=ScraperBehavior)
+
+    file_manager: FileManager = field(default=None,
+                                      repr=False,
+                                      compare=False,
+                                      metadata=config(exclude=_always))
+    decoder: Decoder = field(default=None,
+                             repr=False,
+                             compare=False,
+                             metadata=config(exclude=_always))
+    config: ScraperConfig = field(default=None,
+                                  repr=False,
+                                  compare=False,
+                                  metadata=config(exclude=_always))
+
+    def __post_init__(self):
+        """
+        Validates the novel instance after initialization.
 
-
+        Raises:
+            ValidationError: If the title is empty or neither host nor toc_main_url is provided.
+        """
 
-        self.
-
-        self.
+        if not self.title:
+            raise ValidationError("title can't be empty")
+        if not (self.host or self.toc_main_url):
+            raise ValidationError('You must provide "host" or "toc_main_url"')
 
     def __str__(self):
         """
-
+        Returns a string representation of the novel with its main attributes.
+
+        Returns:
+            str: A formatted string containing the novel's main information.
         """
+
         toc_info = self.toc_main_url if self.toc_main_url else "TOC added manually"
         attributes = [
             f"Title: {self.title}",
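The three runtime-only fields above (`file_manager`, `decoder`, `config`) stay out of the serialized novel JSON via `dataclasses_json`'s per-field `config(exclude=...)` hook; judging by its use, `_always` from `.utils` is a predicate that always returns True. A self-contained sketch of the same pattern — the `_always` shown here is an assumed equivalent, not the package's actual definition:

```python
from dataclasses import dataclass, field
from dataclasses_json import dataclass_json, config


def _always(_value) -> bool:
    # Assumed shape of web_novel_scraper.utils._always: exclude unconditionally.
    return True


@dataclass_json
@dataclass
class Example:
    title: str
    # Runtime-only handle: hidden from repr/comparison and never serialized.
    cache: object = field(default=None, repr=False, compare=False,
                          metadata=config(exclude=_always))


print(Example('t', cache=object()).to_dict())  # {'title': 't'}
```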
@@ -172,99 +95,317 @@ class Novel:
         return (f"Novel Info: \n"
                 f"{attributes_str}")
 
-    @
-    def load(title: str, cfg: ScraperConfig, novel_base_dir:
+    @classmethod
+    def load(cls, title: str, cfg: ScraperConfig, novel_base_dir: Path = None) -> 'Novel':
+        """
+        Loads a novel from stored JSON data.
+
+        Args:
+            title (str): Title of the novel to load.
+            cfg (ScraperConfig): Scraper configuration.
+            novel_base_dir (Path, optional): Base directory for the novel data.
+
+        Returns:
+            Novel: A new Novel instance loaded from stored data.
+
+        Raises:
+            ValidationError: If the novel with the given title is not found.
+        """
+
         fm = FileManager(title, cfg.base_novels_dir, novel_base_dir, read_only=True)
         raw = fm.load_novel_json()
         if raw is None:
             logger.debug(f'Novel "{title}" was not found.')
-            raise
-        novel =
-        novel.config = cfg
+            raise ValidationError(f'Novel "{title}" was not found.')
+        novel = cls.from_json(raw)
         novel.set_config(cfg=cfg, novel_base_dir=novel_base_dir)
         return novel
 
+    @classmethod
+    def new(cls, title: str, cfg: ScraperConfig, host: str = None, toc_html: str = None,
+            toc_main_url: str = None) -> 'Novel':
+        """Creates a new Novel instance.
+
+        Args:
+            title: Title of the novel (required)
+            cfg: Scraper configuration (required)
+            host: Host URL for the novel content (optional)
+            toc_html: HTML content for the table of contents (optional)
+            toc_main_url: URL for the table of contents (optional)
+
+        Note:
+            - Either toc_html or toc_main_url must be provided
+            - If toc_main_url is provided, host will be extracted from it if not explicitly provided
+            - If toc_html is provided, host must be explicitly provided
+
+        Returns:
+            Novel: A new Novel instance
+
+        Raises:
+            ValidationError: If the title is empty, or if neither toc_html nor toc_main_url is provided
+        """
+        if not title:
+            raise ValidationError("Title cannot be empty")
+
+        if not (toc_html or toc_main_url):
+            raise ValidationError("Either toc_html or toc_main_url must be provided")
+
+        if toc_html and not host:
+            raise ValidationError("When providing toc_html, host must be explicitly provided")
+
+        novel = cls(title=title, host=host, toc_main_url=toc_main_url)
+        breakpoint()
+        # If toc_main_url is provided and the host isn't, extract host from URL
+        if toc_main_url and not host:
+            host = utils.obtain_host(toc_main_url)
+            novel.host = host
+
+        # If toc_html is provided, add it to the novel
+        if toc_html:
+            novel.add_toc_html(toc_html, host)
+
+        return novel
+
     # NOVEL PARAMETERS MANAGEMENT
 
     def set_config(self,
-                   cfg: ScraperConfig
-
-
-
-
-
-
-
-
-
-
+                   cfg: ScraperConfig,
+                   novel_base_dir: str | None = None) -> None:
+        """
+        Configures the novel with the provided scraper configuration and base directory.
+
+        Sets up the file manager and decoder for the novel based on the provided configuration.
+
+        Args:
+            cfg (ScraperConfig): The scraper configuration to use.
+            novel_base_dir (str | None, optional): Base directory for the novel files.
+                If None, it uses the default directory from configuration.
 
-
-
-
+        Raises:
+            FileManagerError: If there's an error when reading the config or decoding guide files.
+        """
+
+        try:
+            self.config = cfg
+            self.file_manager = FileManager(title=self.title,
+                                            base_novels_dir=self.config.base_novels_dir,
+                                            novel_base_dir=novel_base_dir)
+            self.decoder = Decoder(self.host, self.config.decode_guide_file)
+        except FileManagerError as e:
+            logger.error("Could not set configuration. File Manager Error", exc_info=e)
+            raise
+
+    def set_scraper_behavior(self, **kwargs) -> None:
+        """
+        Updates the scraper behavior configuration with the provided parameters.
 
-
+        Args:
+            **kwargs: Keyword arguments for updating scraper behavior settings.
+                Can include any valid ScraperBehavior attributes.
+        """
 
-
-        self.scraper_behavior.
+        filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None}
+        self.scraper_behavior = replace(self.scraper_behavior, **filtered_kwargs)
+        logger.info(f'Scraper behavior updated')
 
     def set_metadata(self, **kwargs) -> None:
-
+        """
+        Updates the novel's metadata with the provided parameters.
+
+        Args:
+            **kwargs: Keyword arguments for updating metadata.
+                Can include any valid Metadata attributes like author, language, etc.
+        """
+        filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None}
+        self.metadata = replace(self.metadata, **filtered_kwargs)
+        logger.info(f'Metadata updated')
+
+    def add_tag(self, tag: str) -> None:
+        """
+        Adds a new tag to the novel's metadata if it doesn't already exist.
+
+        Args:
+            tag (str): The tag to add to the novel's metadata.
+        """
 
-    def add_tag(self, tag: str) -> bool:
         if tag not in self.metadata.tags:
-            self.metadata
-
-
-
+            self.metadata = replace(
+                self.metadata, tags=(*self.metadata.tags, tag)
+            )
+            logger.info('Tag %s added to metadata', tag)
+        else:
+            logger.debug("Tag %s already present in %s", tag, self.title)
+
+    def remove_tag(self, tag: str) -> None:
+        """
+        Removes a tag from the novel's metadata if it exists.
+
+        Args:
+            tag (str): The tag to remove from the novel's metadata.
+        """
 
-    def remove_tag(self, tag: str) -> bool:
         if tag in self.metadata.tags:
-            self.metadata.
-
-
-
+            self.metadata = replace(self.metadata,
+                                    tags=tuple(t for t in self.metadata.tags if t != tag))
+            logger.info('Tag %s removed from metadata', tag)
+        else:
+            logger.debug("Tag %s not present in %s", tag, self.title)
 
     def set_cover_image(self, cover_image_path: str) -> None:
-
+        """
+        Sets or updates the novel's cover image.
+
+        Args:
+            cover_image_path (str): Path to the cover image file.
+
+        Raises:
+            FileManagerError: If there's an error when saving the cover image.
+        """
+
+        try:
+            self.file_manager.save_novel_cover(cover_image_path)
+            logger.info('Cover image updated')
+        except FileManagerError as e:
+            logger.error("Could not update cover. File Manager Error", exc_info=e)
+            raise
 
     def set_host(self, host: str) -> None:
+        """
+        Sets or updates the novel's host URL and modifies the decoder.
+
+        Args:
+            host (str): The host URL for the novel.
+
+        Raises:
+            DecodeError: If there's an error when setting up the decoder with the new host.
+        """
+
         self.host = host
-
+        try:
+            self.decoder.set_host(host)
+            logger.info(f'Host updated to "{self.host}"')
+        except ValidationError as e:
+            logger.error("Could not set host. Decode Error", exc_info=e)
+            raise
+
+    def save_novel(self) -> None:
+        """
+        Saves the current state of the novel to disk.
+
+        Persists all novel data including metadata, chapters, and configuration
+        to the novel's JSON file.
+
+        Raises:
+            FileManagerError: If there's an error when saving the novel data.
+        """
 
-
-
+        try:
+            self.file_manager.save_novel_json(self.to_dict())
+            logger.info(f'Novel data saved to disk on file "{self.file_manager.novel_json_file}".')
+        except FileManagerError as e:
+            logger.error("Could not save novel. File Manager Error", exc_info=e)
+            raise
 
     # TABLE OF CONTENTS MANAGEMENT
 
-    def set_toc_main_url(self, toc_main_url: str,
+    def set_toc_main_url(self, toc_main_url: str, update_host: bool = True) -> None:
+        """
+        Sets the main URL for the table of contents and optionally updates the host.
+
+        Deletes any existing TOC files as they will be refreshed from the new URL.
+        If update_host is True, extracts and updates the host from the new URL.
+
+        Args:
+            toc_main_url: Main URL for the table of contents
+            update_host: Whether to update the host based on the URL (default: True)
+
+        Raises:
+            ValidationError: If host extraction fails
+            FileManagerError: If TOC deletion fails
+        """
+
         self.toc_main_url = toc_main_url
-        self.
-
-        self.
-
-
-
+        logger.info(f'Main URL updated to "{self.toc_main_url}", TOCs already requested will be deleted.')
+        try:
+            self.file_manager.delete_toc()
+        except FileManagerError as e:
+            logger.error("Could not delete TOCs. File Manager Error", exc_info=e)
+            raise
+
+        if update_host:
+            new_host = utils.obtain_host(self.toc_main_url)
+            logger.debug(f'Update Host flag present, new host is "{new_host}".')
+            self.set_host(new_host)
 
     def add_toc_html(self, html: str, host: str = None) -> None:
+        """
+        Adds HTML content as a table of contents fragment.
+
+        This method is mutually exclusive with using toc_main_url - if a main URL exists,
+        it will be cleared. Host must be provided either directly or from a previous configuration.
+
+        Args:
+            html: HTML content to add as TOC fragment
+            host: Optional host to set for this content
+
+        Raises:
+            ValidationError: If no host is provided when required
+            FileManagerError: If saving TOC content fails
+        """
+
         if self.toc_main_url:
+            logger.debug(f'TOC main URL is exclusive with manual TOC files, TOC main URL will be deleted.')
             self.delete_toc()
             self.toc_main_url = None
 
         if host:
-            self.host
-
+            self.set_host(host)
+        else:
+            if self.host is None:
+                logger.error(f'When using TOC files instead of URLs, host must be provided.')
+                raise ValidationError('Host must be provided when using TOC files instead of URLs.')
         self.file_manager.add_toc(html)
-
+        logger.info('New TOC file added to disk.')
 
     def delete_toc(self):
+        """
+        Deletes all table of contents files and resets chapter data.
+
+        Clears:
+            - All TOC files from disk
+            - Chapter list
+            - Chapter URL list
+
+        Raises:
+            FileManagerError: If deletion of TOC files fails
+        """
+
         self.file_manager.delete_toc()
         self.chapters = []
         self.chapters_url_list = []
+        logger.info('TOC files deleted from disk.')
+
+    def sync_toc(self, reload_files: bool = True) -> None:
+        """
+        Synchronizes the table of contents with stored/remote content.
+
+        Process:
+            1. Checks if TOC content exists (stored or retrievable)
+            2. Optionally reloads TOC files from remote if needed
+            3. Extracts chapter URLs from TOC content
+            4. Creates/updates chapters based on URLs
+
+        Args:
+            reload_files: Whether to force reload of TOC files from remote (default: True)
+
+        Raises:
+            ScraperError: If no TOC content is available
+            FileManagerError: If file operations fail
+            DecodeError: If TOC parsing fails
+            NetworkError: If remote content retrieval fails
+            ValidationError: If chapter creation fails
+        """
 
-    def sync_toc(self, reload_files: bool = False) -> bool:
-        # Hard reload will request again the toc files from the toc_main_url
-        # Only works with toc_main_url
         all_tocs_content = self.file_manager.get_all_toc()
 
         # If there is no toc_main_url and no manually added toc, there is no way to sync toc
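A recurring idiom in the new methods above: `Metadata` and `ScraperBehavior` are updated with `dataclasses.replace()` instead of being mutated, and `None`-valued kwargs are filtered out first so that "not provided" never overwrites a stored value. A minimal sketch of that update pattern, using a simplified stand-in for the real `Metadata` in `web_novel_scraper/models.py`:

```python
from dataclasses import dataclass, replace


@dataclass(frozen=True)
class Metadata:  # simplified stand-in, not the real models.Metadata
    author: str | None = None
    language: str = 'en'
    tags: tuple = ()


meta = Metadata()
kwargs = {'author': 'Jane Doe', 'language': None}  # None means "not provided"
filtered = {k: v for k, v in kwargs.items() if v is not None}
meta = replace(meta, **filtered)  # author set, language left at 'en'

# add_tag / remove_tag style updates rebuild the tags tuple:
meta = replace(meta, tags=(*meta.tags, 'fantasy'))
meta = replace(meta, tags=tuple(t for t in meta.tags if t != 'fantasy'))
print(meta)  # Metadata(author='Jane Doe', language='en', tags=())
```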
@@ -272,59 +413,116 @@
         if toc_not_exists:
             logger.critical(
                 'There is no toc html and no toc url set, unable to get toc.')
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            raise ScraperError('There is no toc html and no toc url set, unable to get toc.')
+
+        # Will reload files if:
+        # Reload_files is True (requested by user) AND there is a toc_main_url present.
+        # OR
+        # There is a toc_main_url present, but no toc files are saved in the disk.
+        reload_files = ((reload_files or
+                         all_tocs_content is None) or
+                        self.toc_main_url is not None)
+        if reload_files:
+            logger.debug('Reloading TOC files.')
+            try:
+                self._request_toc_files()
+            except FileManagerError as e:
+                logger.error("Could not request TOC files. File Manager Error", exc_info=e)
+                raise
+            except DecodeError as e:
+                logger.error("Could not request TOC files. Decoder Error", exc_info=e)
+                raise
+            except NetworkError as e:
+                logger.error("Could not request TOC files. Network Error", exc_info=e)
+                raise
+
+        try:
+            self._load_or_request_chapter_urls_from_toc()
+        except DecodeError as e:
+            logger.error("Could not get chapter urls from TOC files. Decoder Error", exc_info=e)
+            raise
+        except FileManagerError as e:
+            logger.error("Could not get chapter urls from TOC files. File Manager Error", exc_info=e)
+            raise
+
+        try:
+            self._create_chapters_from_toc()
+        except ValidationError as e:
+            logger.error("Could not create chapters from TOC files. Validation Error", exc_info=e)
+            raise
+        logger.info('TOC synced with files, Chapters created from Table of Contents.')
+
+    def show_toc(self) -> Optional[str]:
+        """
+        Generates a human-readable representation of the Table Of Contents.
 
-
-
-
-        chapters_url_from_toc_content = self.decoder.get_chapter_urls(toc_content)
-        if chapters_url_from_toc_content is None:
-            logger.error('Chapters url not found on toc_content')
-            return False
-        # First we save a list of lists in case we need to invert the order
-        self.chapters_url_list.append(chapters_url_from_toc_content)
-
-        invert = self.decoder.is_index_inverted()
-        self.chapters_url_list = [
-            chapter
-            for chapters_url in (self.chapters_url_list[::-1] if invert else self.chapters_url_list)
-            for chapter in chapters_url
-        ]
-        add_host_to_chapter = self.scraper_behavior.auto_add_host or self.decoder.add_host_to_chapter()
-        if add_host_to_chapter:
-            self.chapters_url_list = [
-                f'https://{self.host}{chapter_url}' for chapter_url in self.chapters_url_list]
-        self.chapters_url_list = utils.delete_duplicates(
-            self.chapters_url_list)
-        self.save_novel()
-        self._create_chapters_from_toc()
-        return True
+        Returns:
+            Optional[str]: Formatted string showing chapter numbers and URLs, None if no chapters_urls found
+        """
 
-    def show_toc(self):
         if not self.chapters_url_list:
-
+            logger.warning('No chapters in TOC')
+            return None
         toc_str = 'Table Of Contents:'
         for i, chapter_url in enumerate(self.chapters_url_list):
-            toc_str += f'\nChapter {i+1}: {chapter_url}'
+            toc_str += f'\nChapter {i + 1}: {chapter_url}'
         return toc_str
 
     # CHAPTERS MANAGEMENT
 
+    def get_chapter(self, chapter_index: Optional[int] = None, chapter_url: Optional[str] = None) -> Optional[Chapter]:
+        """
+        Retrieves a chapter either by its index in the chapter list or by its URL.
+
+        Args:
+            chapter_index (Optional[int]): The index of the chapter in the chapter list
+            chapter_url (Optional[str]): The URL of the chapter to retrieve
+
+        Returns:
+            Optional[Chapter]: The requested chapter if found, None otherwise
+
+        Raises:
+            ValidationError: If neither index nor url is provided, or if both are provided
+            IndexError: If the provided index is out of range
+        """
+        if not utils.check_exclusive_params(chapter_index, chapter_url):
+            raise ValidationError("Exactly one of 'chapter_index' or 'chapter_url' must be provided")
+
+        if chapter_url is not None:
+            chapter_index = self._find_chapter_index_by_url(chapter_url)
+
+        if chapter_index is not None:
+            if chapter_index < 0:
+                raise ValueError("Index must be positive")
+            try:
+                return self.chapters[chapter_index]
+            except IndexError:
+                logger.warning(f"No chapter found at index {chapter_index}")
+                return None
+        logger.warning(f"No chapter found with url {chapter_url}")
+        return None
+
     def show_chapters(self) -> str:
+        """
+        Generates a text representation of all novel chapters.
+
+        Returns:
+            str: Formatted string containing the list of chapters with their information:
+                - Chapter number
+                - Title (if available)
+                - URL
+                - HTML filename (if available)
+
+        Note:
+            Output format is:
+            Chapters List:
+            Chapter 1:
+              Title: [title or message]
+              URL: [url]
+              Filename: [filename or message]
+            ...
+        """
+
         chapter_list = "Chapters List:\n"
         for i, chapter in enumerate(self.chapters):
            chapter_list += f"Chapter {i + 1}:\n"
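`get_chapter` above delegates its either/or argument check to `utils.check_exclusive_params`; the helper's body is not part of this diff, but a plausible equivalent consistent with how it is called would be:

```python
def check_exclusive_params(*params) -> bool:
    # Assumed equivalent of utils.check_exclusive_params:
    # True when exactly one of the arguments is not None.
    return sum(p is not None for p in params) == 1


assert check_exclusive_params(3, None)                          # index only
assert check_exclusive_params(None, 'https://example.com/1')    # url only
assert not check_exclusive_params(None, None)                   # neither
assert not check_exclusive_params(3, 'https://example.com/1')   # both
```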
@@ -333,105 +531,166 @@ class Novel:
         chapter_list += f"  Filename: {chapter.chapter_html_filename if chapter.chapter_html_filename else 'File not yet requested'}\n"
         return chapter_list
 
-    def scrap_chapter(self,
-
-                      chapter
-        if not utils.check_exclusive_params(chapter_url, chapter_idx):
-            raise ValueError("chapter_url and chapter_id, only one needs to be set")
+    def scrap_chapter(self, chapter: Chapter, reload_file: bool = False) -> Chapter:
+        """
+        Processes and decodes a specific chapter of the novel.
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        This method handles the complete scraping process for an individual chapter,
+        including HTML loading or requesting and content decoding.
+
+        Args:
+            chapter (Chapter): Chapter object to process
+            reload_file (bool, optional): If True, forces a new download of the chapter
+                even if it already exists locally. Defaults to False.
+
+        Returns:
+            Chapter: The updated Chapter object with decoded content
+
+        Raises:
+            ValidationError: If there are issues with the values of the provided Chapter object
+            DecodeError: If there are issues during content decoding
+            NetworkError: If there are issues during HTML request
+            FileManagerError: If there are issues during file operations
+        """
+
+        logger.debug('Scraping Chapter...')
+        if chapter.chapter_url is None:
+            logger.error('Chapter trying to be scrapped does not have a URL')
+            raise ValidationError('Chapter trying to be scrapped does not have a URL')
+
+        logger.debug(f'Using chapter url: {chapter.chapter_url}')
+
+        if reload_file:
+            logger.debug('Reload file Flag present, HTML will be requested...')
+
+        try:
+            chapter = self._load_or_request_chapter(chapter,
+                                                    reload_file=reload_file)
+        except ValidationError as e:
+            logger.error(f'Could get chapter for URL "{chapter.chapter_url}" HTML content. Validation Error',
+                         exc_info=e)
+            raise
+        except FileManagerError as e:
+            logger.error(f'Could get chapter for URL "{chapter.chapter_url}" HTML content. File Manager Error',
+                         exc_info=e)
+            raise
+        except NetworkError as e:
+            logger.error(f'Could get chapter for URL "{chapter.chapter_url}" HTML content. Network Error', exc_info=e)
+            raise
+
+        if not chapter.chapter_html:
+            logger.error(f'Could not get HTML content for chapter with URL "{chapter.chapter_url}"')
+            raise ScraperError(f'Could not get HTML content for chapter with URL "{chapter.chapter_url}"')
 
         # We get the chapter title and content
         # We pass an index so we can autogenerate a Title
-
-
-
+        save_title_to_content = (self.scraper_behavior.save_title_to_content or
+                                 self.decoder.save_title_to_content())
+        try:
+            chapter = self._decode_chapter(chapter=chapter,
+                                           save_title_to_content=save_title_to_content)
+        except DecodeError as e:
+            logger.error(f'Could not decode HTML title and content for chapter with URL "{chapter.chapter_url}"',
+                         exc_info=e)
+            raise
+        except ValidationError as e:
+            logger.error(f'Could not decode HTML title and content for chapter with URL "{chapter.chapter_url}"',
+                         exc_info=e)
+            raise
+
+        logger.info(f'Chapter scrapped from link: {chapter.chapter_url}')
         return chapter
 
-    def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def request_all_chapters(self,
+                             sync_toc: bool = True,
+                             reload_files: bool = False,
+                             clean_chapters: bool = False) -> None:
+        """
+        Requests and processes all chapters of the novel.
+
+        This method performs scraping of all available chapters in the novel,
+        handling the loading and decoding of each one.
+
+        Args:
+            sync_toc (bool, optional): If True, syncs the table of contents
+            reload_files (bool, optional): If True, forces a new download of all
+                chapters, even if they already exist locally. Defaults to False.
+            clean_chapters (bool, optional): If True, cleans the HTML content of the files
+
+        Raises:
+            FileManagerError: If there are issues during file operations
+            DecodeError: If there are issues during content decoding
+            ValidationError: If there are issues during content decoding
+
+        Note:
+            - Process is performed sequentially for each chapter
+            - Errors in individual chapters don't stop the complete process
+            - Progress is logged through the logging system
+        """
 
-
+        logger.debug('Requesting all chapters...')
         if sync_toc:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            logger.debug('Sync TOC flag present, syncing TOC...')
+            try:
+                self.sync_toc(reload_files=False)
+            except ScraperError:
+                logger.warning('Error when trying to sync TOC, continuing without syncing...')
+
+        if len(self.chapters_url_list) == 0:
+            logger.warning('No chapters in TOC, returning without requesting any...')
+            return None
+
+        # We request the HTML files of all the chapters
+        # The chapter will be requested again if:
+        # 1. Reload files flag is True (Requested by user)
+        chapters_obtained = 0
+        total_chapters = len(self.chapters)
+        for i in range(len(self.chapters)):
+            logger.info(f'Requesting chapter {i + 1} of {total_chapters}')
+            try:
+                self.chapters[i] = self._load_or_request_chapter(chapter=self.chapters[i],
+                                                                 reload_file=reload_files)
+            except FileManagerError:
+                logger.warning(f'Error requesting chapter {i + 1} with url {self.chapters[i].chapter_url}, Skipping...')
+                continue
+            except ValidationError:
+                logger.warning(f'Error validating chapter {i + 1} with url {self.chapters[i].chapter_url}, Skipping...')
+                continue
+
+            if not self.chapters[i].chapter_html:
+                logger.warning(f'Error requesting chapter {i + 1} with url {self.chapters[i].chapter_url}')
+                continue
+
+            if clean_chapters:
+                self._clean_chapter(self.chapters[i].chapter_html_filename)
+            self.save_novel()
+            chapters_obtained += 1
+        logger.info(f'Successfully requested {chapters_obtained} of {total_chapters} chapters.')
+        return None
 
-    # EPUB CREATION
+    # EPUB CREATION
 
     def save_novel_to_epub(self,
                            sync_toc: bool = False,
                            start_chapter: int = 1,
                            end_chapter: int = None,
                            chapters_by_book: int = 100) -> None:
+        logger.debug('Saving novel to epub...')
         if sync_toc:
-
+            logger.debug('Sync TOC flag present, syncing TOC...')
+            try:
+                self.sync_toc(reload_files=False)
+            except ScraperError:
+                logger.warning('Error when trying to sync TOC, continuing without syncing...')
+
+        if start_chapter < 1:
+            logger.error('Start chapter is invalid.')
+            raise ValidationError('Start chapter is invalid.')
 
         if start_chapter > len(self.chapters):
-            logger.
-
+            logger.error(f'The start chapter is bigger than the number of chapters saved ({len(self.chapters)})')
+            raise ValidationError(
+                f'The start chapter is bigger than the number of chapters saved ({len(self.chapters)})')
 
         if not end_chapter:
             end_chapter = len(self.chapters)
@@ -443,22 +702,19 @@
         idx = 1
         start = start_chapter
         while start <= end_chapter:
-            end = min(start + chapters_by_book - 1,
+            end = min(start + chapters_by_book - 1,
+                      end_chapter)
             result = self._save_chapters_to_epub(start_chapter=start,
                                                  end_chapter=end,
                                                  collection_idx=idx)
             if not result:
                 logger.critical(f'Error with saving novel to epub, with start chapter: '
                                 f'{start_chapter} and end chapter: {end_chapter}')
-                return False
             start = start + chapters_by_book
             idx = idx + 1
-        return True
-
 
     ## UTILS
 
-
     def clean_files(self, clean_chapters: bool = True, clean_toc: bool = True, hard_clean: bool = False) -> None:
         hard_clean = hard_clean or self.scraper_behavior.hard_clean
         if clean_chapters:
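The `while` loop above splits the requested chapter range into fixed-size volumes with `end = min(start + chapters_by_book - 1, end_chapter)`. Worked through for 250 chapters at the default `chapters_by_book=100`, that yields volumes 1–100, 101–200, and 201–250:

```python
def volume_ranges(start_chapter: int, end_chapter: int, chapters_by_book: int = 100):
    # Reproduces the batching arithmetic from save_novel_to_epub above.
    start = start_chapter
    while start <= end_chapter:
        end = min(start + chapters_by_book - 1, end_chapter)
        yield start, end
        start += chapters_by_book


print(list(volume_ranges(1, 250)))  # [(1, 100), (101, 200), (201, 250)]
```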
@@ -470,8 +726,7 @@
             self._clean_toc(hard_clean)
 
     def show_novel_dir(self) -> str:
-        return self.file_manager.novel_base_dir
-
+        return str(self.file_manager.novel_base_dir)
 
     ## PRIVATE HELPERS
 
@@ -492,9 +747,25 @@
         tocs_content = self.file_manager.get_all_toc()
         for i, toc in enumerate(tocs_content):
             toc = self.decoder.clean_html(toc, hard_clean=hard_clean)
-            self.file_manager.update_toc(
+            self.file_manager.update_toc(idx=i,
+                                         html=toc)
 
     def _request_html_content(self, url: str) -> Optional[str]:
+        """
+        Performs an HTTP request to retrieve HTML content from a URL.
+
+        Args:
+            url (str): The URL of the webpage to request
+
+        Returns:
+            Optional[str]: The HTML content of the webpage if the request is successful,
+                None otherwise
+
+        Note:
+            This method uses the decoder configuration and scraper behavior
+            to handle HTTP requests, including retries and timeouts.
+        """
+
         request_config = self.decoder.request_config
         force_flaresolver = request_config.get('force_flaresolver') or self.scraper_behavior.force_flaresolver
         html_content = get_html_content(url,
@@ -504,135 +775,331 @@
                                         force_flaresolver=force_flaresolver)
         return html_content
 
-    def
-
-
+    def _load_or_request_chapter(self,
+                                 chapter: Chapter,
+                                 reload_file: bool = False) -> Chapter:
+        """
+        Loads or requests a chapter's HTML content from a local file or a URL.
+
+        This method first attempts to load the chapter content from a local file.
+        If not possible or if reload is requested, it fetches the content from the web.
+
+        Args:
+            chapter (Chapter): Chapter object containing chapter information.
+            reload_file (bool, optional): If True, forces a new web request
+                regardless of local file existence. Defaults to False.
+
+        Returns:
+            Chapter: The Chapter object updated with HTML content.
+
+        Raises:
+            FileManagerError: If there's an error loading or saving the chapter file.
+            ValidationError: If there's a validation error when requesting the chapter.
+            NetworkError: If there's a network error when requesting the chapter.
+
+        Note:
+            - If the file doesn't exist locally, a web request will be made.
+            - If the file exists but is empty, a web request will be made.
+            - File saving errors are logged as warnings but don't stop execution.
+        """
 
-        # Generate filename if needed
+        # Generate a filename if needed
         if not chapter.chapter_html_filename:
+            logger.debug('Generating a filename for the chapter')
             chapter.chapter_html_filename = utils.generate_file_name_from_url(
                 chapter.chapter_url)
 
-        #
-
-
-
-
+        # The HTML will be requested again if:
+        # 1. "Reload file" flag is True (requested by user)
+        # 2. Chapter file does not exist
+        # 3. The Chapter file does exist, but there is no content
+        reload_file = reload_file or not self.file_manager.chapter_file_exists(chapter.chapter_html_filename)
+        # Try loading from the disk first
+        if not reload_file:
+            try:
+                logger.debug(f'Loading chapter HTML from file: "{chapter.chapter_html_filename}"')
+                chapter.chapter_html = self.file_manager.load_chapter_html(chapter.chapter_html_filename)
+            except FileManagerError as e:
+                logger.error(f'Error when trying to load chapter {chapter.chapter_title} from file', exc_info=e)
+                raise
+        if chapter.chapter_html is not None:
             return chapter
 
         # Fetch fresh content
-
+        try:
+            logger.debug(f'Requesting chapter HTML from URL: "{chapter.chapter_url}"')
+            chapter.chapter_html = self._request_html_content(chapter.chapter_url)
+        except ValidationError:
+            logger.error(
+                f'Error when trying to request chapter {chapter.chapter_title} from url: {chapter.chapter_url}')
+            raise
+        except NetworkError:
+            logger.error(
+                f'Error when trying to request chapter {chapter.chapter_title} from url: {chapter.chapter_url}')
+            raise
+
+        # If the requests failed, we will let the higher methods decide if they throw an error.
         if not chapter.chapter_html:
             logger.error(f'No content found on link {chapter.chapter_url}')
             return chapter
 
         # Save content
-
-
+        try:
+            logger.info(f'Saving chapter HTML to file: "{chapter.chapter_html_filename}"')
+            self.file_manager.save_chapter_html(chapter.chapter_html_filename,
+                                                chapter.chapter_html)
+        except FileManagerError as e:
+            # We can pass this error and try again later
+            logger.warning(f'Error when trying to save chapter {chapter.chapter_title} to file', exc_info=e)
+
         return chapter
 
-    def
-
-
-                     reload: bool = False):
-        if not reload:
-            content = self.file_manager.get_toc(toc_filename)
-            if content:
-                return content
+    def _request_toc_files(self):
+        """
+        Requests and stores all table of contents (TOC) files from the novel's website.
 
-
-
+        This method handles both paginated and non-paginated TOCs:
+        - For non-paginated TOCs: Downloads and stores a single TOC file
+        - For paginated TOCs: Iteratively downloads all TOC pages until no next page is found
 
-
-        content = self._request_html_content(url)
-        if not content:
-            logger.warning(f'No content found on link {url}')
-            sys.exit(1)
+        The method first clears any existing TOC files before downloading new ones.
 
-
-
+        Raises:
+            NetworkError: If there's an error during the HTTP request
+            ValidationError: If no content is found at the TOC URL
+            DecodeError: If there's an error parsing the next page URL
+
+        Note:
+            This is an internal method that uses the decoder configuration to determine
+            pagination behavior and to parse TOC content.
+        """
+
+        def _get_toc(toc_url: str, get_next_page: bool) -> str | None:
+            # Some TOCs next page links have incomplete URLS (e.g., /page/2)
+            if utils.check_incomplete_url(toc_url):
+                toc_url = self.toc_main_url + toc_url
+                logger.debug(f'Toc link is incomplete, trying with toc link: "{toc_url}"')
+
+            # Fetch fresh content
+            logger.debug(f'Requesting TOC from link: "{toc_url}"')
+            try:
+                toc_content = self._request_html_content(toc_url)
+            except NetworkError as E:
+                logger.error(f'Error with network, error: {E}')
+                raise
+
+            if not toc_content:
+                logger.error(f'No content found on link "{toc_url}"')
+                raise ValidationError(f'No content found on link "{toc_url}"')
+
+            logger.debug('Saving new TOC file to disk.')
+            self.file_manager.add_toc(toc_content)
+
+            if get_next_page:
+                try:
+                    logger.debug(f'Parsing next page from link: {toc_url}')
+                    next_page = self.decoder.get_toc_next_page_url(toc_content)
+                except DecodeError:
+                    raise
+                return next_page
+            return None
 
-
-
-
+        self.file_manager.delete_toc()
+        has_pagination = self.decoder.has_pagination()
+
+        if not has_pagination:
+            logger.debug('TOC does not have pagination, requesting only one file.')
+            _get_toc(self.toc_main_url, get_next_page=False)
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            logger.debug('TOC has pagination, requesting all files.')
+            next_page_url = self.toc_main_url
+            while next_page_url:
+                next_page_url = _get_toc(next_page_url, get_next_page=True)
+
+    def _load_or_request_chapter_urls_from_toc(self) -> None:
+        """
+        Extracts and processes chapter URLs from the table of contents.
+
+        Raises:
+            DecodeError: If fails to decode chapter URLs from TOC content
+        """
+        # Get configuration
+        is_inverted = self.decoder.is_index_inverted()
+        add_host_to_chapter = self.scraper_behavior.auto_add_host or self.decoder.add_host_to_chapter()
+
+        # Get all TOC content at once
+        try:
+            all_tocs = self.file_manager.get_all_toc()
+        except FileManagerError:
+            logger.error('Error when trying to load TOC files from disk.')
+            raise
+
+        # Extract URLs from all TOC fragments
+        self.chapters_url_list = []
+        for toc_content in all_tocs:
+            try:
+                urls = self.decoder.get_chapter_urls(toc_content)
+                self.chapters_url_list.extend(urls)  # More efficient than creating intermediate lists
+            except DecodeError as e:
+                logger.error('Failed to decode chapter URLs from TOC content', exc_info=e)
+                raise
+
+        # Handle inversion if needed
+        if is_inverted:
+            logger.debug('Inverting chapter URLs order')
+            self.chapters_url_list.reverse()  # In-place reversal is more efficient
+
+        # Add host if needed
+        if add_host_to_chapter:
+            logger.debug('Adding host to chapter URLs')
+            self.chapters_url_list = [f'https://{self.host}{url}' for url in self.chapters_url_list]
+
+        # Remove duplicates while preserving order
+        # self.chapters_url_list = utils.delete_duplicates(self.chapters_url_list)
+
+        logger.info(f'Successfully extracted {len(self.chapters_url_list)} unique chapter URLs')
+
+    def _create_chapters_from_toc(self):
+        """
+        Synchronizes existing chapters with the table of contents (TOC) URL list.
+
+        This method performs the following operations:
+        1. Removes chapters whose URLs are no longer in the TOC
+        2. Adds new chapters for URLs found in the TOC
+        3. Reorders chapters according to the TOC sequence
+
+        Raises:
+            ValidationError: If there's an error when creating a new chapter
+
+        Note:
+            This is an internal method used to maintain consistency
+            between chapters and the table of contents.
+        """
 
-
+        existing_urls = {chapter.chapter_url for chapter in self.chapters}
+        toc_urls_set = set(self.chapters_url_list)
+
+        # Find chapters to remove and new chapters to add
+        urls_to_remove = existing_urls - toc_urls_set
+        urls_to_add = toc_urls_set - existing_urls
+
+        if urls_to_remove:
+            logger.info(f'Removing {len(urls_to_remove)} chapters not found in TOC')
+            self.chapters = [ch for ch in self.chapters if ch.chapter_url not in urls_to_remove]
+
+        if urls_to_add:
+            logger.info(f'Adding {len(urls_to_add)} new chapters from TOC')
+            for url in self.chapters_url_list:
+                if url in urls_to_add:
+                    try:
+                        new_chapter = Chapter(chapter_url=url)
+                        self.chapters.append(new_chapter)
+                    except ValidationError as e:
+                        logger.error(f'Failed to create chapter for URL {url}: {e}')
+                        raise
+
+        # Reorder according to TOC
+        logger.debug('Reordering chapters according to TOC')
         self.chapters.sort(
             key=lambda x: self.chapters_url_list.index(x.chapter_url))
 
-
-        for chapter in self.chapters:
-            if chapter_url == chapter.chapter_url:
-                return chapter
-        return None
+        logger.info(f'Chapter synchronization complete. Total chapters: {len(self.chapters)}')
 
-    def
-        for index, chapter in enumerate(self.chapters):
-            if chapter.chapter_url == chapter_url:
-                return index
-        return None
+    def _add_or_update_chapter_data(self, chapter: Chapter, save_in_file: bool = True) -> None:
 
-
-
-
+        # Check if the chapter exists
+        chapter_idx = self._find_chapter_index_by_url(chapter.chapter_url)
+        if chapter_idx is None:
+            # If no existing chapter, we append it
+            self.chapters.append(chapter)
+        else:
+            if chapter.chapter_title:
+                self.chapters[chapter_idx].chapter_title = chapter.chapter_title
+            if chapter.chapter_html_filename:
+                self.chapters[chapter_idx].chapter_html_filename = chapter.chapter_html_filename
 
-
-
-        increment = 100
-        aux = 1
-        for chapter_url in self.chapters_url_list:
-            aux += 1
-            chapter_idx = self._find_chapter_index_by_link(chapter_url)
-            if not chapter_idx:
-                chapter = Chapter(chapter_url=chapter_url)
-                self._add_or_update_chapter_data(
-                    chapter=chapter, save_in_file=False)
-            if aux == increment:
-                self.save_novel()
-                aux = 1
-        self._order_chapters_by_link_list()
-        self.save_novel()
+        if save_in_file:
+            self.save_novel()
 
-    def
-
-
-
-
+    def _find_chapter_index_by_url(self, chapter_url: str) -> Optional[int]:
+        """
+        Find the chapter index by its URL in the chapter list.
+
+        Args:
+            chapter_url: URL of the chapter to find
+
+        Returns:
+            Optional[int]: Index of the chapter if found, None otherwise
+
+        Note:
+            Uses next() for efficient iteration - stops as soon as a match is found
+        """
+        try:
+            return next(i for i, ch in enumerate(self.chapters)
+                        if ch.chapter_url == chapter_url)
+        except StopIteration:
+            return None
+
+    def _decode_chapter(self,
+                        chapter: Chapter,
+                        save_title_to_content: bool = False) -> Chapter:
+        """
+        Decodes a chapter's HTML content to extract title and content.
+
+        This method processes the HTML content of a chapter to extract its title and content.
+        If no title is found, it auto-generates one using the chapter's index in the URL list.
+
+        Args:
+            chapter (Chapter): Chapter object containing the HTML content to decode.
+            save_title_to_content (bool, optional): Whether to include the title in the
+                chapter content. Defaults to False.
+
+        Returns:
+            Chapter: The updated Chapter object with decoded title and content.
 
-
-
-
+        Raises:
+            ScraperError: If the chapter's HTML content is None.
+            DecodeError: If there's an error decoding the chapter's title or content.
+
+        Note:
+            - If no title is found, it will be auto-generated as "{novel_title} Chapter {index}".
+            - The chapter's HTML must be loaded before calling this method.
+        """
+
+        logger.debug(f'Decoding chapter with URL {chapter.chapter_url}...')
+        if chapter.chapter_html is None:
+            logger.error(f'Chapter HTML not found for chapter with URL "{chapter.chapter_url}"')
+            raise ScraperError(f'Chapter HTML not found for chapter with URL "{chapter.chapter_url}"')
 
         logger.debug('Obtaining chapter title...')
-
-
-
-
-
-        logger.debug(f'Chapter title: "{chapter_title}"')
+        try:
+            chapter_title = self.decoder.get_chapter_title(chapter.chapter_html)
+        except DecodeError as e:
+            logger.error(f'Failed to decode chapter title from HTML content: {e}')
+            raise
 
-
-
-
-
-
-
+        if chapter_title is None:
+            logger.debug('No chapter title found, trying to autogenerate one...')
+            try:
+                chapter_idx = self.chapters_url_list.index(chapter.chapter_url)
+            except ValueError:
+                chapter_idx = ""
+
+            chapter_title = f'{self.title} Chapter {chapter_idx}'
 
+        chapter.chapter_title = chapter_title
+        logger.info(f'Chapter title: "{chapter_title}"')
+
+        logger.debug('Obtaining chapter content...')
+        try:
+            chapter.chapter_content = self.decoder.get_chapter_content(chapter.chapter_html,
+                                                                       save_title_to_content,
+                                                                       chapter.chapter_title)
+        except DecodeError:
+            logger.error(f'Failed to decode chapter content for chapter with URL "{chapter.chapter_url}"')
+            raise
+
+        logger.debug('Chapter title and content successfully decoded from HTML')
         return chapter
 
     def _create_epub_book(self, book_title: str = None, calibre_collection: dict = None) -> epub.EpubBook:
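`_request_toc_files` above is a classic follow-the-next-link crawl: the nested `_get_toc` saves each TOC page and returns the next-page URL (or `None`), and the outer `while` loop runs until exhaustion. The same control flow in isolation, with hypothetical `fetch` / `find_next_url` callables standing in for `_request_html_content` and `decoder.get_toc_next_page_url`:

```python
from typing import Callable, Optional


def crawl_paginated_toc(first_url: str,
                        fetch: Callable[[str], str],
                        find_next_url: Callable[[str], Optional[str]]) -> list[str]:
    # Collect every TOC page by following "next page" links until none remain.
    pages: list[str] = []
    next_url: Optional[str] = first_url
    while next_url:
        html = fetch(next_url)           # stand-in for _request_html_content
        pages.append(html)
        next_url = find_next_url(html)   # stand-in for decoder.get_toc_next_page_url
    return pages
```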
@@ -661,7 +1128,7 @@ class Novel:
         # date_metadata += f'/{self.metadata.end_date}'
         if self.metadata.end_date:
             book.add_metadata('OPF', 'meta', self.metadata.end_date, {
-
+                'name': 'end_date', 'content': self.metadata.end_date})
         if date_metadata:
             logger.debug(f'Using date_metadata {date_metadata}')
             book.add_metadata('DC', 'date', date_metadata)
@@ -669,12 +1136,13 @@
         # Collections with calibre
         if calibre_collection:
             book.add_metadata('OPF', 'meta', '', {
-
+                'name': 'calibre:series', 'content': calibre_collection["title"]})
             book.add_metadata('OPF', 'meta', '', {
-
+                'name': 'calibre:series_index', 'content': calibre_collection["idx"]})
 
         cover_image_content = self.file_manager.load_novel_cover()
         if cover_image_content:
+            breakpoint()
             book.set_cover('cover.jpg', cover_image_content)
             book.spine += ['cover']
 
@@ -682,11 +1150,10 @@
         return book
 
     def _add_chapter_to_epub_book(self, chapter: Chapter, book: epub.EpubBook):
-        chapter = self.scrap_chapter(
-            chapter_url=chapter.chapter_url)
+        chapter = self.scrap_chapter(chapter)
         if chapter is None:
             logger.warning('Error reading chapter')
-            return
+            return None
         self._add_or_update_chapter_data(
             chapter=chapter, save_in_file=False)
         file_name = utils.generate_epub_file_name_from_title(
@@ -708,10 +1175,9 @@
                                start_chapter: int,
                                end_chapter: int = None,
                                collection_idx: int = None):
-
         if start_chapter > len(self.chapters):
             logger.error('start_chapter out of range')
-            return
+            return None
         # If end_chapter is not set, we set it to idx_start + chapters_num - 1
         if not end_chapter:
             end_chapter = len(self.chapters)
@@ -725,7 +1191,7 @@
         # We create the epub book
         book_title = f'{self.title} Chapters {start_chapter} - {end_chapter}'
         calibre_collection = None
-        # If collection_idx is set, we create a
+        # If collection_idx is set, we create a Calibre collection
        if collection_idx:
             calibre_collection = {'title': self.title,
                                   'idx': str(collection_idx)}
@@ -735,11 +1201,16 @@
             book = self._add_chapter_to_epub_book(chapter=chapter,
                                                   book=book)
             if book is None:
-                logger.critical(
+                logger.critical(
+                    f'Error saving epub {book_title}, could not decode chapter {chapter} using host {self.host}')
                 return False
 
         book.add_item(epub.EpubNcx())
         book.add_item(epub.EpubNav())
-
+        try:
+            self.file_manager.save_book(book, f'{book_title}.epub')
+        except FileManagerError:
+            logger.error(f'Error saving epub {book_title}')
+            raise
         self.save_novel()
         return True