web-novel-scraper 2.0.2__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ from typing import Optional, Dict
6
6
  import unicodedata
7
7
 
8
8
  from .logger_manager import create_logger
9
- from .utils import _normalize_dirname, FileOps, now_iso, FileManagerError
9
+ from .utils import _normalize_dirname, FileOps, now_iso, FileManagerError, ValidationError
10
10
 
11
11
  NOVEL_JSON_FILENAME = 'main.json'
12
12
  NOVEL_COVER_FILENAME = 'cover.jpg'
@@ -15,6 +15,21 @@ logger = create_logger('FILE MANAGER')
15
15
 
16
16
 
17
17
  class FileManager:
18
+ """
19
+ File manager for handling novel-related file operations.
20
+
21
+ Manages all file operations related to novels including chapters, table of contents,
22
+ cover images, and metadata.
23
+
24
+ Attributes:
25
+ novel_base_dir (Path): Base directory for the novel
26
+ novel_data_dir (Path): Directory for novel data
27
+ novel_chapters_dir (Path): Directory for chapters
28
+ novel_toc_dir (Path): Directory for table of contents
29
+ novel_json_file (Path): Main JSON file
30
+ novel_cover_file (Path): Cover image file
31
+ """
32
+
18
33
  novel_base_dir: Path
19
34
  novel_data_dir: Path
20
35
  novel_chapters_dir: Path
@@ -25,141 +40,309 @@ class FileManager:
25
40
 
26
41
  def __init__(self,
27
42
  title: str,
28
- base_novels_dir: str,
29
- novel_base_dir: str = None,
43
+ base_novels_dir: Path,
44
+ novel_base_dir: Path = None,
30
45
  read_only: bool = False):
31
- logger.debug(f'Initializing FileManager for novel: {title}')
32
- self.novel_base_dir = self._get_novel_base_dir(title, base_novels_dir, novel_base_dir)
33
- logger.debug(f'Novel base dir: {self.novel_base_dir}')
34
- self.novel_data_dir = self.novel_base_dir / 'data'
35
- self.novel_chapters_dir = self.novel_data_dir / 'chapters'
36
- self.novel_toc_dir = self.novel_data_dir / "toc"
37
- self.novel_json_file = self.novel_data_dir / NOVEL_JSON_FILENAME
38
- self.novel_cover_file = self.novel_data_dir / NOVEL_COVER_FILENAME
39
-
40
- if not read_only:
41
- FileOps.ensure_dir(self.novel_base_dir)
42
- if novel_base_dir is None:
43
- self._store_novel_base_dir(title, self.novel_base_dir, base_novels_dir)
44
- FileOps.ensure_dir(self.novel_data_dir)
45
- FileOps.ensure_dir(self.novel_chapters_dir)
46
- FileOps.ensure_dir(self.novel_toc_dir)
46
+ """
47
+ Initialize the file manager.
48
+
49
+ Args:
50
+ title: Novel title
51
+ base_novels_dir: Base directory for all novels
52
+ novel_base_dir: Specific novel directory (optional)
53
+ read_only: If True, doesn't create directories
54
+
55
+ Raises:
56
+ FileManagerError: If there are errors creating required directories
57
+ """
58
+
59
+ try:
60
+ logger.debug(f'Initializing FileManager for novel: {title}')
61
+ self.novel_base_dir = self._get_novel_base_dir(title, base_novels_dir, novel_base_dir)
62
+ self.novel_data_dir = self.novel_base_dir / 'data'
63
+ self.novel_chapters_dir = self.novel_data_dir / 'chapters'
64
+ self.novel_toc_dir = self.novel_data_dir / "toc"
65
+ self.novel_json_file = self.novel_data_dir / NOVEL_JSON_FILENAME
66
+ self.novel_cover_file = self.novel_data_dir / NOVEL_COVER_FILENAME
67
+
68
+ if not read_only:
69
+ FileOps.ensure_dir(self.novel_base_dir)
70
+ if novel_base_dir is None:
71
+ self._store_novel_base_dir(title, self.novel_base_dir, base_novels_dir)
72
+ FileOps.ensure_dir(self.novel_data_dir)
73
+ FileOps.ensure_dir(self.novel_chapters_dir)
74
+ FileOps.ensure_dir(self.novel_toc_dir)
75
+ except Exception as e:
76
+ raise FileManagerError(f"Error initializing FileManager: {str(e)}") from e
47
77
 
48
78
  def save_chapter_html(self, chapter_filename: str, content: str) -> None:
49
- full_path = self.novel_chapters_dir / chapter_filename
50
- logger.debug(f'Saving chapter to {full_path}')
51
- content = unicodedata.normalize('NFKC', content)
52
- FileOps.save_text(full_path, content)
79
+ """
80
+ Save chapter HTML content to file.
81
+
82
+ Args:
83
+ chapter_filename: Name of the chapter file
84
+ content: HTML content of the chapter
85
+
86
+ Raises:
87
+ FileManagerError: If there are errors when saving the file
88
+ """
89
+
90
+ try:
91
+ full_path = self.novel_chapters_dir / chapter_filename
92
+ logger.debug(f'Saving chapter to {full_path}')
93
+ content = unicodedata.normalize('NFKC', content)
94
+ FileOps.save_text(full_path, content)
95
+ except Exception as e:
96
+ raise FileManagerError(f"Error saving chapter {chapter_filename}: {str(e)}") from e
53
97
 
54
98
  def chapter_file_exists(self, chapter_filename: str) -> bool:
55
99
  full_path = self.novel_chapters_dir / chapter_filename
56
100
  return full_path.exists()
57
101
 
58
102
  def load_chapter_html(self, chapter_filename: str) -> Optional[str]:
59
- full_path = self.novel_chapters_dir / chapter_filename
60
- logger.debug(f'Loading chapter from {full_path}')
61
- chapter_content = FileOps.read_text(full_path)
62
- if not chapter_content:
63
- logger.debug(f'Chapter content not found: {chapter_filename}')
64
- return chapter_content
103
+ """
104
+ Load chapter HTML content from a file.
105
+
106
+ Args:
107
+ chapter_filename: Name of the chapter file
108
+
109
+ Returns:
110
+ str | None: Chapter content or None if the file doesn't exist
111
+
112
+ Raises:
113
+ FileManagerError: If there are errors reading the file
114
+ """
115
+
116
+ try:
117
+ full_path = self.novel_chapters_dir / chapter_filename
118
+ logger.debug(f'Loading chapter from {full_path}')
119
+ chapter_content = FileOps.read_text(full_path)
120
+ if not chapter_content:
121
+ logger.debug(f'Chapter content not found: {chapter_filename}')
122
+ return chapter_content
123
+ except Exception as e:
124
+ raise FileManagerError(f"Error loading chapter {chapter_filename}: {str(e)}") from e
65
125
 
66
126
  def delete_chapter_html(self, chapter_filename: str) -> None:
67
- full_path = self.novel_chapters_dir / chapter_filename
68
- logger.debug(f'Attempting to delete chapter: {chapter_filename}')
69
- FileOps.delete(full_path)
127
+ """
128
+ Delete a chapter's HTML file.
129
+
130
+ Args:
131
+ chapter_filename: Name of the chapter file to delete
132
+
133
+ Raises:
134
+ FileManagerError: If there are errors deleting the file
135
+ """
136
+ try:
137
+ full_path = self.novel_chapters_dir / chapter_filename
138
+ logger.debug(f'Attempting to delete chapter: {chapter_filename}')
139
+ FileOps.delete(full_path)
140
+ except Exception as e:
141
+ raise FileManagerError(f"Error deleting chapter {chapter_filename}: {str(e)}") from e
70
142
 
71
143
  def save_novel_json(self, novel_data: dict) -> None:
72
- logger.debug(f'Saving novel data to {self.novel_json_file}')
73
- FileOps.save_json(self.novel_json_file, novel_data)
144
+ """
145
+ Save novel data in JSON format.
146
+
147
+ Args:
148
+ novel_data: Dictionary containing novel data
149
+
150
+ Raises:
151
+ FileManagerError: If there are errors when saving the JSON file
152
+ """
153
+
154
+ try:
155
+ logger.debug(f'Saving novel data to {self.novel_json_file}')
156
+ FileOps.save_json(self.novel_json_file, novel_data)
157
+ except Exception as e:
158
+ raise FileManagerError(f"Error saving novel JSON: {str(e)}") from e
74
159
 
75
160
  def load_novel_json(self) -> Optional[str]:
76
- logger.debug(f'Loading novel data from {self.novel_json_file}')
77
- novel_json = FileOps.read_text(self.novel_json_file)
78
- if novel_json is None:
79
- logger.debug('Could not read novel JSON file')
80
- return novel_json
161
+ """
162
+ Load novel data from the JSON file.
163
+
164
+ Returns:
165
+ str | None: Novel JSON content or None if the file doesn't exist
166
+
167
+ Raises:
168
+ FileManagerError: If there are errors reading the JSON file
169
+ """
170
+ try:
171
+ logger.debug(f'Loading novel data from {self.novel_json_file}')
172
+ novel_json = FileOps.read_text(self.novel_json_file)
173
+ if novel_json is None:
174
+ logger.debug('Could not read novel JSON file')
175
+ return novel_json
176
+ except Exception as e:
177
+ raise FileManagerError(f"Error loading novel JSON: {str(e)}") from e
81
178
 
82
179
  def save_novel_cover(self, source_cover_path: str) -> None:
83
- source_cover_path = Path(source_cover_path)
84
- logger.debug(f'Attempting to save cover from {source_cover_path}')
85
- if not source_cover_path.exists():
86
- logger.critical(f'No cover found on {source_cover_path}')
87
- raise ValueError(f'No cover found on {source_cover_path}')
88
- FileOps.copy(source_cover_path, self.novel_cover_file)
180
+ """
181
+ Save the novel's cover image from a source path.
182
+
183
+ Args:
184
+ source_cover_path: Path to source cover image
185
+
186
+ Raises:
187
+ ValidationError: If the source cover file doesn't exist
188
+ FileManagerError: If there are errors copying the file
189
+ """
190
+ try:
191
+ source_cover_path = Path(source_cover_path)
192
+ logger.debug(f'Attempting to save cover from {source_cover_path}')
193
+ if not source_cover_path.exists():
194
+ logger.critical(f'No cover found on {source_cover_path}')
195
+ raise ValidationError(f'No cover found on {source_cover_path}')
196
+ FileOps.copy(source_cover_path, self.novel_cover_file)
197
+ except ValidationError:
198
+ raise
199
+ except Exception as e:
200
+ raise FileManagerError(f"Error saving novel cover: {str(e)}") from e
89
201
 
90
202
  def load_novel_cover(self) -> Optional[bytes]:
91
- if self.novel_cover_file is None:
92
- logger.debug(f'No cover found')
93
- return None
94
- logger.debug(f'Loading cover from {self.novel_cover_file}')
95
- cover = FileOps.read_binary(self.novel_cover_file)
96
- if cover is None:
97
- logger.debug(f'Could not read cover from {self.novel_cover_file}')
98
- return cover
203
+ """
204
+ Load novel cover image.
205
+
206
+ Returns:
207
+ bytes | None: Cover image binary data or None if the file doesn't exist
208
+
209
+ Raises:
210
+ FileManagerError: If there are errors reading the file
211
+ """
212
+ try:
213
+ if self.novel_cover_file is None:
214
+ logger.debug('No cover found')
215
+ return None
216
+ logger.debug(f'Loading cover from {self.novel_cover_file}')
217
+ cover = FileOps.read_binary(self.novel_cover_file)
218
+ if cover is None:
219
+ logger.debug(f'Could not read cover from {self.novel_cover_file}')
220
+ return cover
221
+ except Exception as e:
222
+ raise FileManagerError(f"Error loading novel cover: {str(e)}") from e
99
223
 
100
224
  ## TOC API
101
225
 
102
226
  def add_toc(self, html: str) -> int:
103
- """Add a new TOC fragment, return its index."""
104
- idx = self._next_toc_idx()
105
- toc_path = self.novel_toc_dir / f"toc_{idx}.html"
106
- FileOps.save_text(toc_path, html)
227
+ """
228
+ Add a new table of contents fragment.
107
229
 
108
- toc_index = self._load_toc_index()
109
- toc_index["entries"].append({"file": toc_path.name, "updated": now_iso()})
110
- self._store_toc_index(toc_index)
230
+ Args:
231
+ html: HTML content of the TOC fragment
111
232
 
112
- logger.debug(f"Added TOC #{idx} → {toc_path}")
113
- return idx
233
+ Returns:
234
+ int: Index of the added TOC fragment
235
+
236
+ Raises:
237
+ FileManagerError: If there are errors when saving the TOC fragment
238
+ """
239
+ try:
240
+ idx = self._next_toc_idx()
241
+ toc_path = self.novel_toc_dir / f"toc_{idx}.html"
242
+ FileOps.save_text(toc_path, html)
243
+
244
+ toc_index = self._load_toc_index()
245
+ toc_index["entries"].append({"file": toc_path.name, "updated": now_iso()})
246
+ self._store_toc_index(toc_index)
247
+
248
+ logger.debug(f"Added TOC #{idx} → {toc_path}")
249
+ return idx
250
+ except Exception as e:
251
+ raise FileManagerError(f"Error adding TOC fragment: {str(e)}") from e
114
252
 
115
253
  def update_toc(self, idx: int, html: str) -> None:
116
- toc_path = self.novel_toc_dir / f"toc_{idx}.html"
117
- if not toc_path.exists():
118
- raise FileManagerError(f"TOC #{idx} not found")
254
+ """
255
+ Update an existing table of contents fragment.
256
+
257
+ Args:
258
+ idx: Index of the TOC fragment to update
259
+ html: New HTML content
119
260
 
120
- FileOps.save_text(toc_path, html)
261
+ Raises:
262
+ FileManagerError: If TOC fragment doesn't exist or there are errors updating it
263
+ """
264
+ try:
265
+ toc_path = self.novel_toc_dir / f"toc_{idx}.html"
266
+ if not toc_path.exists():
267
+ raise FileManagerError(f"TOC #{idx} not found")
121
268
 
122
- toc_index = self._load_toc_index()
123
- for entry in toc_index["entries"]:
124
- if entry["file"] == toc_path.name:
125
- entry["updated"] = now_iso()
126
- break
127
- self._store_toc_index(toc_index)
128
- logger.debug(f"Updated TOC #{idx}")
269
+ FileOps.save_text(toc_path, html)
270
+
271
+ toc_index = self._load_toc_index()
272
+ for entry in toc_index["entries"]:
273
+ if entry["file"] == toc_path.name:
274
+ entry["updated"] = now_iso()
275
+ break
276
+ self._store_toc_index(toc_index)
277
+ logger.debug(f"Updated TOC #{idx}")
278
+ except Exception as e:
279
+ raise FileManagerError(f"Error updating TOC fragment: {str(e)}") from e
129
280
 
130
281
  def delete_toc(self, idx: Optional[int] = None) -> None:
131
- """Delete a single TOC by index or all if *idx* is None."""
132
- toc_index = self._load_toc_index()
282
+ """
283
+ Delete the table of contents fragment(s).
133
284
 
134
- def _delete(path: Path) -> None:
135
- FileOps.delete(path)
136
- logger.debug(f"Deleted {path}")
285
+ Args:
286
+ idx: Index of a specific TOC fragment to delete. If None, deletes all TOC fragments.
137
287
 
138
- if idx is None: # delete all
139
- for entry in toc_index["entries"]:
140
- _delete(self.novel_toc_dir / entry["file"])
141
- toc_index["entries"] = []
142
- else:
143
- toc_path = self.novel_toc_dir / f"toc_{idx}.html"
144
- _delete(toc_path)
145
- toc_index["entries"] = [
146
- e for e in toc_index["entries"] if e["file"] != toc_path.name
147
- ]
148
- self._store_toc_index(toc_index)
288
+ Raises:
289
+ FileManagerError: If there are errors deleting TOC files or updating the index
290
+ """
291
+ try:
292
+ toc_index = self._load_toc_index()
293
+
294
+ def _delete(path: Path) -> None:
295
+ """Helper function to delete a file and log the action."""
296
+ try:
297
+ FileOps.delete(path)
298
+ logger.debug(f"Deleted {path}")
299
+ except Exception as e:
300
+ raise FileManagerError(f"Failed to delete TOC file {path}: {str(e)}")
301
+
302
+ if idx is None:
303
+ logger.debug("Deleting all TOC fragments")
304
+ for entry in toc_index["entries"]:
305
+ _delete(self.novel_toc_dir / entry["file"])
306
+ toc_index["entries"] = []
307
+ else:
308
+ logger.debug(f"Deleting TOC fragment #{idx}")
309
+ toc_path = self.novel_toc_dir / f"toc_{idx}.html"
310
+ _delete(toc_path)
311
+ toc_index["entries"] = [
312
+ e for e in toc_index["entries"] if e["file"] != toc_path.name
313
+ ]
314
+
315
+ self._store_toc_index(toc_index)
316
+ logger.info(f"Successfully deleted TOC {'fragments' if idx is None else f'fragment #{idx}'}")
317
+
318
+ except Exception as e:
319
+ raise FileManagerError(
320
+ f"Error deleting TOC {'fragments' if idx is None else f'fragment #{idx}'}: {str(e)}") from e
149
321
 
150
322
  def get_toc(self, idx: int) -> Optional[str]:
151
323
  """Return TOC HTML content or None."""
152
324
  return FileOps.read_text(self.novel_toc_dir / f"toc_{idx}.html")
153
325
 
154
326
  def get_all_toc(self) -> list[str]:
155
- """Return all TOC fragments in order."""
156
- toc_index = self._load_toc_index()
157
- contents: list[str] = []
158
- for entry in toc_index["entries"]:
159
- html = FileOps.read_text(self.novel_toc_dir / entry["file"])
160
- if html is not None:
161
- contents.append(html)
162
- return contents
327
+ """
328
+ Get all table of contents fragments in order.
329
+
330
+ Returns:
331
+ list[str]: List of TOC HTML contents
332
+
333
+ Raises:
334
+ FileManagerError: If there are errors reading TOC files
335
+ """
336
+ try:
337
+ toc_index = self._load_toc_index()
338
+ contents: list[str] = []
339
+ for entry in toc_index["entries"]:
340
+ html = FileOps.read_text(self.novel_toc_dir / entry["file"])
341
+ if html is not None:
342
+ contents.append(html)
343
+ return contents
344
+ except Exception as e:
345
+ raise FileManagerError(f"Error retrieving TOC fragments: {str(e)}") from e
163
346
 
164
347
  def save_book(self, book: epub.EpubBook, filename: str) -> bool:
165
348
  book_path = self.novel_base_dir / filename
@@ -177,7 +360,7 @@ class FileManager:
177
360
  return False
178
361
  except Exception as e:
179
362
  logger.critical(f'Unexpected error saving book to {book_path}: {e}')
180
- return False
363
+ raise
181
364
 
182
365
  def _load_toc_index(self) -> dict:
183
366
  """Return the toc.json structure (creates a blank one if missing)."""
@@ -201,28 +384,27 @@ class FileManager:
201
384
  @staticmethod
202
385
  def _get_novel_base_dir(
203
386
  title: str,
204
- base_novels_dir: str,
205
- novel_base_dir: str | None = None
387
+ base_novels_dir: Path,
388
+ novel_base_dir: Path | None = None
206
389
  ) -> Path:
207
390
  """
208
391
  Resolve the base directory for *title* without creating any directories.
209
392
 
210
393
  Priority:
211
- 1. Explicit *novel_base_dir* argument.
394
+ 1. Explicit *novel_base_dir* argument.
212
395
  2. Stored value in <base_novels_dir>/meta.json.
213
- 3. New path derived from normalized title, recorded back to meta.json.
396
+ 3. New path derived from a normalized title, recorded back to meta.json.
214
397
  """
215
- base_dir_path = Path(base_novels_dir)
216
- if not base_dir_path.exists():
217
- logger.info(f'{base_dir_path} does not exist. Creating new base directory.')
218
- FileOps.ensure_dir(base_dir_path)
398
+ if not base_novels_dir.exists():
399
+ logger.info(f'{base_novels_dir} does not exist. Creating new base directory.')
400
+ FileOps.ensure_dir(base_novels_dir)
219
401
 
220
402
  # — 1. If the caller supplied a path, return it
221
403
  if novel_base_dir:
222
404
  return Path(novel_base_dir)
223
405
 
224
406
  # — 2. Try to read meta.json
225
- meta_path = base_dir_path / "meta.json"
407
+ meta_path = base_novels_dir / "meta.json"
226
408
  if meta_path.exists():
227
409
  try:
228
410
  meta: Dict[str, Dict[str, str]] = FileOps.read_json(meta_path)
@@ -234,18 +416,18 @@ class FileManager:
234
416
  # — 3. Fallback, generate a new directory name
235
417
  clean_title = _normalize_dirname(title)
236
418
 
237
- return base_dir_path / clean_title
419
+ return base_novels_dir / clean_title
238
420
 
239
421
  @staticmethod
240
422
  def _store_novel_base_dir(
241
423
  title: str,
242
424
  resolved_path: Path,
243
- base_novels_dir: str,
425
+ base_novels_dir: Path,
244
426
  ) -> None:
245
427
  """
246
428
  Persist <title, resolved_path> in <base_novels_dir>/meta.json.
247
429
  """
248
- meta_path = Path(base_novels_dir) / "meta.json"
430
+ meta_path = base_novels_dir / "meta.json"
249
431
  try:
250
432
  # Load existing metadata (ignore errors, start fresh on corruption)
251
433
  meta: Dict[str, Dict[str, str]] = {}
@@ -0,0 +1,76 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field, asdict
4
+ from dataclasses_json import dataclass_json, config
5
+ from typing import Optional, Tuple
6
+ from urllib.parse import urlparse
7
+ import pprint
8
+
9
+ from .utils import _always, ValidationError
10
+
11
+
12
+ def _pretty(obj, *, skip: set[str] | None = None) -> str:
13
+ """Pretty-print a dataclass as a dict, omitting any keys listed in *skip*."""
14
+ d = asdict(obj)
15
+ if skip:
16
+ for key in skip:
17
+ d.pop(key, None)
18
+ return pprint.pformat(d, sort_dicts=False, compact=True)
19
+
20
+
21
+ @dataclass_json
22
+ @dataclass(slots=True, frozen=True)
23
+ class Metadata:
24
+ author: Optional[str] = None
25
+ start_date: Optional[str] = None
26
+ end_date: Optional[str] = None
27
+ language: str = "en"
28
+ description: Optional[str] = None
29
+ tags: Tuple[str, ...] = field(default_factory=tuple)
30
+
31
+ def __str__(self) -> str:
32
+ return "Metadata:\n" + _pretty(self)
33
+
34
+
35
+ @dataclass_json
36
+ @dataclass(slots=True, frozen=True)
37
+ class ScraperBehavior:
38
+ # Some novels already have the title in the content.
39
+ save_title_to_content: bool = False
40
+ # Some novels have the toc link without the host
41
+ auto_add_host: bool = False
42
+ # Some hosts return 403 when scraping; this will force the use of FlareSolver
43
+ # to save time
44
+ force_flaresolver: bool = False
45
+ # When you clean the HTML files, you can use hard clean by default
46
+ hard_clean: bool = False
47
+
48
+ def __str__(self) -> str:
49
+ return "ScraperBehavior:\n" + _pretty(self)
50
+
51
+
52
+ @dataclass_json()
53
+ @dataclass
54
+ class Chapter:
55
+ chapter_url: str
56
+ chapter_html: Optional[str] = field(
57
+ default=None,
58
+ repr=False,
59
+ compare=False,
60
+ metadata=config(exclude=_always)
61
+ )
62
+ chapter_content: Optional[str] = field(
63
+ default=None,
64
+ repr=False,
65
+ compare=False,
66
+ metadata=config(exclude=_always)
67
+ )
68
+ chapter_html_filename: Optional[str] = None
69
+ chapter_title: Optional[str] = field(default=None, compare=False)
70
+
71
+ def __post_init__(self):
72
+ if not urlparse(self.chapter_url).scheme:
73
+ raise ValidationError(f"Invalid URL: {self.chapter_url}")
74
+
75
+ def __str__(self) -> str:
76
+ return "Chapter:\n" + _pretty(self, skip={"chapter_html"})