web-novel-scraper 1.1.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,201 +1,174 @@
1
- import os
2
1
  import json
3
- import sys
4
2
 
5
- import platformdirs
6
3
  from pathlib import Path
7
- import shutil
8
- from dotenv import load_dotenv
9
4
  from ebooklib import epub
5
+ from typing import Optional, Dict
10
6
  import unicodedata
11
7
 
12
- from . import logger_manager
8
+ from .logger_manager import create_logger
9
+ from .utils import _normalize_dirname, FileOps, now_iso, FileManagerError
13
10
 
14
- load_dotenv()
11
+ NOVEL_JSON_FILENAME = 'main.json'
12
+ NOVEL_COVER_FILENAME = 'cover.jpg'
15
13
 
16
- app_author = "ImagineBrkr"
17
- app_name = "web-novel-scraper"
14
+ logger = create_logger('FILE MANAGER')
18
15
 
19
16
 
20
- CURRENT_DIR = Path(__file__).resolve().parent
21
-
22
- SCRAPER_BASE_CONFIG_DIR = os.getenv(
23
- 'SCRAPER_BASE_CONFIG_DIR', platformdirs.user_config_dir(app_name, app_author))
24
- SCRAPER_BASE_DATA_DIR = os.getenv(
25
- 'SCRAPER_BASE_DATA_DIR', platformdirs.user_data_dir(app_name, app_author))
26
-
27
- logger = logger_manager.create_logger('FILE MANAGER')
28
-
29
17
  class FileManager:
30
18
  novel_base_dir: Path
31
19
  novel_data_dir: Path
32
- novel_config_dir: Path
33
20
  novel_chapters_dir: Path
21
+ novel_toc_dir: Path
34
22
 
35
- novel_json_filepath: Path
36
- novel_cover_filepath: Path
37
-
38
- novel_json_filename: str = "main.json"
39
- novel_cover_filename: str = "cover.jpg"
40
- toc_preffix: str = "toc"
23
+ novel_json_file: Path
24
+ novel_cover_file: Path = None
41
25
 
42
26
  def __init__(self,
43
- novel_title: str,
27
+ title: str,
28
+ base_novels_dir: str,
44
29
  novel_base_dir: str = None,
45
- novel_config_dir: str = None,
46
30
  read_only: bool = False):
47
- logger.debug(f'Initializing FileManager for novel: {novel_title}, read_only: {read_only}')
48
- novel_base_dir = novel_base_dir if novel_base_dir else \
49
- f'{SCRAPER_BASE_DATA_DIR}/{novel_title}'
50
- novel_config_dir = novel_config_dir if novel_config_dir else \
51
- f'{SCRAPER_BASE_CONFIG_DIR}/{novel_title}'
52
-
53
- logger.debug(f'Using base dir: {novel_base_dir}, config dir: {novel_config_dir}')
54
-
55
- if read_only:
56
- self.novel_base_dir = _check_path(novel_base_dir)
57
- self.novel_data_dir = _check_path(f'{novel_base_dir}/data')
58
- self.novel_chapters_dir = _check_path(f'{self.novel_data_dir}/chapters')
59
- self.novel_config_dir = _check_path(str(novel_config_dir))
60
- logger.info(f'Initialized read-only FileManager for {novel_title}')
61
- else:
62
- try:
63
- self.novel_base_dir = _create_path_if_not_exists(novel_base_dir)
64
- self.novel_data_dir = _create_path_if_not_exists(
65
- f'{novel_base_dir}/data')
66
- self.novel_chapters_dir = _create_path_if_not_exists(
67
- f'{self.novel_data_dir}/chapters')
68
- self.novel_config_dir = _create_path_if_not_exists(novel_config_dir)
69
- logger.info(f'Created directory structure for novel: {novel_title}')
70
- except Exception as e:
71
- logger.critical(f'Failed to create directory structure: {e}')
72
- raise
73
-
74
- self.novel_json_filepath = self.novel_data_dir / self.novel_json_filename
75
- self.novel_cover_filepath = self.novel_data_dir / self.novel_cover_filename
76
- logger.debug(f'Set json path: {self.novel_json_filepath}, cover path: {self.novel_cover_filepath}')
77
-
78
- def save_chapter_html(self, filename: str, content: str):
79
- full_path = self.novel_chapters_dir / filename
31
+ logger.debug(f'Initializing FileManager for novel: {title}')
32
+ self.novel_base_dir = self._get_novel_base_dir(title, base_novels_dir, novel_base_dir)
33
+ logger.debug(f'Novel base dir: {self.novel_base_dir}')
34
+ self.novel_data_dir = self.novel_base_dir / 'data'
35
+ self.novel_chapters_dir = self.novel_data_dir / 'chapters'
36
+ self.novel_toc_dir = self.novel_data_dir / "toc"
37
+ self.novel_json_file = self.novel_data_dir / NOVEL_JSON_FILENAME
38
+ self.novel_cover_file = self.novel_data_dir / NOVEL_COVER_FILENAME
39
+
40
+ if not read_only:
41
+ FileOps.ensure_dir(self.novel_base_dir)
42
+ if novel_base_dir is None:
43
+ self._store_novel_base_dir(title, self.novel_base_dir, base_novels_dir)
44
+ FileOps.ensure_dir(self.novel_data_dir)
45
+ FileOps.ensure_dir(self.novel_chapters_dir)
46
+ FileOps.ensure_dir(self.novel_toc_dir)
47
+
48
+ def save_chapter_html(self, chapter_filename: str, content: str) -> None:
49
+ full_path = self.novel_chapters_dir / chapter_filename
80
50
  logger.debug(f'Saving chapter to {full_path}')
81
51
  content = unicodedata.normalize('NFKC', content)
82
- char_replacements = {
83
- "â": "'", # Reemplazar â con apóstrofe
84
- "\u2018": "'", # Comillda simple izquierda Unicode
85
- "\u2019": "'", # Comilla simple derecha Unicode
86
- "\u201C": '"', # Comilla doble izquierda Unicode
87
- "\u201D": '"', # Comilla doble derecha Unicode
88
- }
89
- for old_char, new_char in char_replacements.items():
90
- content = content.replace(old_char, new_char)
91
- _save_content_to_file(full_path, content)
92
-
93
- def load_chapter_html(self, filename: str):
94
- full_path = self.novel_chapters_dir / filename
95
- logger.debug(f'Loading chapter from {full_path}')
96
- if full_path.exists():
97
- return _read_content_from_file(full_path)
98
- logger.warning(f'Chapter file not found: {filename}')
99
- return None
100
-
101
- def delete_chapter_html(self, filename: str):
102
- full_path = self.novel_chapters_dir / filename
103
- logger.debug(f'Attempting to delete chapter: {filename}')
104
- if full_path.exists():
105
- _delete_file(full_path)
106
- else:
107
- logger.warning(f'Chapter file not found for deletion: {filename}')
52
+ FileOps.save_text(full_path, content)
108
53
 
109
- def save_novel_json(self, novel_data: dict):
110
- logger.debug(f'Saving novel data to {self.novel_json_filepath}')
111
- _save_content_to_file(self.novel_json_filepath, novel_data, is_json=True)
54
+ def chapter_file_exists(self, chapter_filename: str) -> bool:
55
+ full_path = self.novel_chapters_dir / chapter_filename
56
+ return full_path.exists()
112
57
 
113
- def load_novel_json(self):
114
- logger.debug(f'Loading novel data from {self.novel_json_filepath}')
115
- if self.novel_json_filepath.exists():
116
- return _read_content_from_file(self.novel_json_filepath)
117
- logger.warning('Novel JSON file not found')
118
-
119
- def save_novel_cover(self, source_cover_path: str):
58
+ def load_chapter_html(self, chapter_filename: str) -> Optional[str]:
59
+ full_path = self.novel_chapters_dir / chapter_filename
60
+ logger.debug(f'Loading chapter from {full_path}')
61
+ chapter_content = FileOps.read_text(full_path)
62
+ if not chapter_content:
63
+ logger.debug(f'Chapter content not found: {chapter_filename}')
64
+ return chapter_content
65
+
66
+ def delete_chapter_html(self, chapter_filename: str) -> None:
67
+ full_path = self.novel_chapters_dir / chapter_filename
68
+ logger.debug(f'Attempting to delete chapter: {chapter_filename}')
69
+ FileOps.delete(full_path)
70
+
71
+ def save_novel_json(self, novel_data: dict) -> None:
72
+ logger.debug(f'Saving novel data to {self.novel_json_file}')
73
+ FileOps.save_json(self.novel_json_file, novel_data)
74
+
75
+ def load_novel_json(self) -> Optional[str]:
76
+ logger.debug(f'Loading novel data from {self.novel_json_file}')
77
+ novel_json = FileOps.read_text(self.novel_json_file)
78
+ if novel_json is None:
79
+ logger.debug('Could not read novel JSON file')
80
+ return novel_json
81
+
82
+ def save_novel_cover(self, source_cover_path: str) -> None:
120
83
  source_cover_path = Path(source_cover_path)
121
84
  logger.debug(f'Attempting to save cover from {source_cover_path}')
122
- if source_cover_path.exists():
123
- return _copy_file(source_cover_path, self.novel_cover_filepath)
124
- logger.error(f'Source cover path {source_cover_path} not found')
125
- return False
126
-
127
- def load_novel_cover(self):
128
- logger.debug(f'Loading cover from {self.novel_cover_filepath}')
129
- if self.novel_cover_filepath.exists():
130
- return _read_content_from_file(self.novel_cover_filepath, bytes=True)
131
- logger.warning('Cover file not found')
132
-
133
- def delete_toc(self):
134
- logger.debug('Starting TOC deletion process')
135
- toc_pos = 0
136
- toc_exists = True
137
- deleted_count = 0
138
- while toc_exists:
139
- toc_filename = f"{self.toc_preffix}_{toc_pos}.html"
140
- toc_path = self.novel_data_dir / toc_filename
141
- toc_exists = toc_path.exists()
142
- if toc_exists:
143
- _delete_file(toc_path)
144
- deleted_count += 1
145
- toc_pos += 1
146
- logger.info(f'Deleted {deleted_count} TOC files')
147
-
148
- def add_toc(self, content: str):
149
- logger.debug('Adding new TOC entry')
150
- toc_pos = 0
151
- toc_exists = True
152
- while toc_exists:
153
- toc_filename = f"{self.toc_preffix}_{toc_pos}.html"
154
- toc_path = self.novel_data_dir / toc_filename
155
- toc_exists = toc_path.exists()
156
- if toc_exists:
157
- toc_pos += 1
158
- _save_content_to_file(toc_path, content)
159
- logger.info(f'Added TOC entry at position {toc_pos}')
160
-
161
- def update_toc(self, content: str, toc_idx: int):
162
- toc_filename = f"{self.toc_preffix}_{toc_idx}.html"
163
- toc_path = self.novel_data_dir / toc_filename
164
- logger.debug(f'Updating TOC at index {toc_idx}')
165
- if toc_path.exists():
166
- _save_content_to_file(toc_path, content)
85
+ if not source_cover_path.exists():
86
+ logger.critical(f'No cover found on {source_cover_path}')
87
+ raise ValueError(f'No cover found on {source_cover_path}')
88
+ FileOps.copy(source_cover_path, self.novel_cover_file)
89
+
90
+ def load_novel_cover(self) -> Optional[bytes]:
91
+ if self.novel_cover_file is None:
92
+ logger.debug(f'No cover found')
93
+ return None
94
+ logger.debug(f'Loading cover from {self.novel_cover_file}')
95
+ cover = FileOps.read_binary(self.novel_cover_file)
96
+ if cover is None:
97
+ logger.debug(f'Could not read cover from {self.novel_cover_file}')
98
+ return cover
99
+
100
+ ## TOC API
101
+
102
+ def add_toc(self, html: str) -> int:
103
+ """Add a new TOC fragment, return its index."""
104
+ idx = self._next_toc_idx()
105
+ toc_path = self.novel_toc_dir / f"toc_{idx}.html"
106
+ FileOps.save_text(toc_path, html)
107
+
108
+ toc_index = self._load_toc_index()
109
+ toc_index["entries"].append({"file": toc_path.name, "updated": now_iso()})
110
+ self._store_toc_index(toc_index)
111
+
112
+ logger.debug(f"Added TOC #{idx} → {toc_path}")
113
+ return idx
114
+
115
+ def update_toc(self, idx: int, html: str) -> None:
116
+ toc_path = self.novel_toc_dir / f"toc_{idx}.html"
117
+ if not toc_path.exists():
118
+ raise FileManagerError(f"TOC #{idx} not found")
119
+
120
+ FileOps.save_text(toc_path, html)
121
+
122
+ toc_index = self._load_toc_index()
123
+ for entry in toc_index["entries"]:
124
+ if entry["file"] == toc_path.name:
125
+ entry["updated"] = now_iso()
126
+ break
127
+ self._store_toc_index(toc_index)
128
+ logger.debug(f"Updated TOC #{idx}")
129
+
130
+ def delete_toc(self, idx: Optional[int] = None) -> None:
131
+ """Delete a single TOC by index or all if *idx* is None."""
132
+ toc_index = self._load_toc_index()
133
+
134
+ def _delete(path: Path) -> None:
135
+ FileOps.delete(path)
136
+ logger.debug(f"Deleted {path}")
137
+
138
+ if idx is None: # delete all
139
+ for entry in toc_index["entries"]:
140
+ _delete(self.novel_toc_dir / entry["file"])
141
+ toc_index["entries"] = []
167
142
  else:
168
- logger.error(f'TOC file not found: {toc_path}')
169
-
170
- def get_toc(self, pos_idx: int):
171
- toc_filename = f"{self.toc_preffix}_{pos_idx}.html"
172
- toc_path = self.novel_data_dir / toc_filename
173
- logger.debug(f'Loading TOC at index {pos_idx}')
174
- if toc_path.exists():
175
- return _read_content_from_file(toc_path)
176
- logger.debug(f'No TOC found at index {pos_idx}')
177
-
178
- def get_all_toc(self):
179
- logger.debug('Loading all TOC entries')
180
- pos = 0
181
- tocs = []
182
- while True:
183
- toc_content = self.get_toc(pos)
184
- if toc_content:
185
- tocs.append(toc_content)
186
- pos += 1
187
- else:
188
- logger.info(f'Found {len(tocs)} TOC entries')
189
- return tocs
143
+ toc_path = self.novel_toc_dir / f"toc_{idx}.html"
144
+ _delete(toc_path)
145
+ toc_index["entries"] = [
146
+ e for e in toc_index["entries"] if e["file"] != toc_path.name
147
+ ]
148
+ self._store_toc_index(toc_index)
149
+
150
+ def get_toc(self, idx: int) -> Optional[str]:
151
+ """Return TOC HTML content or None."""
152
+ return FileOps.read_text(self.novel_toc_dir / f"toc_{idx}.html")
153
+
154
+ def get_all_toc(self) -> list[str]:
155
+ """Return all TOC fragments in order."""
156
+ toc_index = self._load_toc_index()
157
+ contents: list[str] = []
158
+ for entry in toc_index["entries"]:
159
+ html = FileOps.read_text(self.novel_toc_dir / entry["file"])
160
+ if html is not None:
161
+ contents.append(html)
162
+ return contents
190
163
 
191
164
  def save_book(self, book: epub.EpubBook, filename: str) -> bool:
192
165
  book_path = self.novel_base_dir / filename
193
166
  logger.debug(f'Attempting to save book to {book_path}')
194
- try:
167
+ try:
195
168
  epub.write_epub(str(book_path), book)
196
169
  logger.info(f'Book saved successfully to {book_path}')
197
170
  return True
198
-
171
+
199
172
  except PermissionError as e:
200
173
  logger.error(f'Permission denied when saving book to {book_path}: {e}')
201
174
  return False
@@ -206,95 +179,91 @@ class FileManager:
206
179
  logger.critical(f'Unexpected error saving book to {book_path}: {e}')
207
180
  return False
208
181
 
209
- def _check_path(dir_path: str) -> Path:
210
- try:
211
- dir_path = Path(dir_path)
212
- return dir_path
213
- except TypeError as e:
214
- logger.error(f"Invalid path type: {e}")
215
- raise
216
- except Exception as e:
217
- logger.error(f"Unexpected error converting path: {e}", exc_info=True)
218
- raise
219
-
220
- def _create_path_if_not_exists(dir_path: str) -> Path:
221
- try:
222
- dir_path = _check_path(dir_path)
223
- dir_path.mkdir(parents=True, exist_ok=True)
224
- return dir_path
225
- except OSError as e:
226
- logger.error(f"Error with directory creation: {e}")
227
- # Change this to raise for debugging
228
- sys.exit(1)
229
- except Exception as e:
230
- logger.error(f"Unexpected error: {e}", exc_info=True)
231
- raise
232
-
233
-
234
- def _save_content_to_file(filepath: Path, content: str | dict, is_json: bool = False) -> None:
235
- try:
236
- if is_json:
237
- with open(filepath, 'w', encoding='utf-8') as file:
238
- json.dump(content, file, indent=2, ensure_ascii=False)
239
- else:
240
- with open(filepath, 'w', encoding='UTF-8') as file:
241
- file.write(content)
242
- logger.info(f'File saved successfully: {filepath}')
243
- except (OSError, IOError) as e:
244
- logger.error(f'Error saving file "{filepath}": {e}')
245
- except Exception as e:
246
- logger.error(f'Unexpected error saving file "{filepath}": {e}', exc_info=True)
247
-
248
-
249
- def _read_content_from_file(filepath: Path, bytes: bool = False) -> str:
250
- try:
251
- # Read the file
252
- read_mode = 'rb' if bytes else 'r'
253
- encoding = None if bytes else 'utf-8'
254
- with open(filepath, read_mode, encoding=encoding) as file:
255
- content = file.read()
256
- logger.info(f'File read successfully: {filepath}')
257
- return content
258
- except FileNotFoundError as e:
259
- # Log if the file doesn't exist
260
- logger.error(f'File not found: "{filepath}": {e}')
261
- except (OSError, IOError) as e:
262
- logger.error(f'Error reading file "{filepath}": {e}')
263
- except Exception as e:
264
- # Log for unexpected errors
265
- logger.error(f'Unexpected error reading file "{filepath}": {e}', exc_info=True)
266
-
267
-
268
- def _delete_file(filepath: Path) -> None:
269
- try:
270
- # Delete the file
271
- filepath.unlink() # Remove the file
272
- logger.info(f'File deleted successfully: {filepath}')
273
- except FileNotFoundError as e:
274
- # Log if the file doesn't exist
275
- logger.error(f'File not found for deletion: "{filepath}": {e}')
276
- except (OSError, IOError) as e:
277
- # Log errors related to file system operations
278
- logger.error(f'Error deleting file "{filepath}": {e}')
279
- except Exception as e:
280
- # Log any unexpected errors
281
- logger.error(f'Unexpected error deleting file "{filepath}": {e}', exc_info=True)
282
-
283
-
284
- def _copy_file(source: Path, destination: Path) -> bool:
285
- try:
286
- # Copy the file
287
- shutil.copy(source, destination)
288
- logger.info(f'File copied successfully from {source} to {destination}')
289
- return True
290
-
291
- except FileNotFoundError:
292
- logger.error(f'Source file not found: {source}')
293
- except PermissionError as e:
294
- logger.error(f'Permission denied when copying file: {e}')
295
- except shutil.SameFileError:
296
- logger.warning(f'Source and destination are the same file: {source}')
297
- except Exception as e:
298
- logger.error(f'Unexpected error copying file from {source} to {destination}: {e}',
299
- exc_info=True)
300
- return False
182
+ def _load_toc_index(self) -> dict:
183
+ """Return the toc.json structure (creates a blank one if missing)."""
184
+ idx = FileOps.read_json(self.novel_toc_dir / "toc.json") or {
185
+ "updated": now_iso(),
186
+ "entries": [],
187
+ }
188
+ return idx
189
+
190
+ def _store_toc_index(self, idx: dict) -> None:
191
+ """Persist toc.json with a fresh root timestamp."""
192
+ idx["updated"] = now_iso()
193
+ FileOps.save_json(self.novel_toc_dir / "toc.json", idx)
194
+
195
+ def _next_toc_idx(self) -> int:
196
+ existing = (
197
+ int(p.stem.split("_")[1]) for p in self.novel_toc_dir.glob("toc_*.html")
198
+ )
199
+ return max(existing, default=-1) + 1
200
+
201
+ @staticmethod
202
+ def _get_novel_base_dir(
203
+ title: str,
204
+ base_novels_dir: str,
205
+ novel_base_dir: str | None = None
206
+ ) -> Path:
207
+ """
208
+ Resolve the base directory for *title* without creating any directories.
209
+
210
+ Priority:
211
+ 1. Explicit *novel_base_dir* argument.
212
+ 2. Stored value in <base_novels_dir>/meta.json.
213
+ 3. New path derived from normalized title, recorded back to meta.json.
214
+ """
215
+ base_dir_path = Path(base_novels_dir)
216
+ if not base_dir_path.exists():
217
+ raise FileManagerError(f"Base novels directory does not exist: {base_dir_path}")
218
+
219
+ # — 1. If the caller supplied a path, return it
220
+ if novel_base_dir:
221
+ return Path(novel_base_dir)
222
+
223
+ # — 2. Try to read meta.json
224
+ meta_path = base_dir_path / "meta.json"
225
+ if meta_path.exists():
226
+ try:
227
+ meta: Dict[str, Dict[str, str]] = FileOps.read_json(meta_path)
228
+ if title in meta and meta[title].get("novel_base_dir"):
229
+ return Path(meta[title]["novel_base_dir"])
230
+ except Exception as exc: # malformed JSON → ignore
231
+ logger.warning(f"Failed to read {meta_path}: {exc}")
232
+
233
+ # — 3. Fallback, generate a new directory name
234
+ clean_title = _normalize_dirname(title)
235
+
236
+ return base_dir_path / clean_title
237
+
238
+ @staticmethod
239
+ def _store_novel_base_dir(
240
+ title: str,
241
+ resolved_path: Path,
242
+ base_novels_dir: str,
243
+ ) -> None:
244
+ """
245
+ Persist <title, resolved_path> in <base_novels_dir>/meta.json.
246
+ """
247
+ meta_path = Path(base_novels_dir) / "meta.json"
248
+ try:
249
+ # Load existing metadata (ignore errors, start fresh on corruption)
250
+ meta: Dict[str, Dict[str, str]] = {}
251
+ if meta_path.exists():
252
+ try:
253
+ meta = FileOps.read_json(meta_path)
254
+ except Exception as exc:
255
+ logger.warning(f"meta.json corrupted, regenerating: {exc}")
256
+
257
+ # Skip write if up to date
258
+ current = meta.get(title, {}).get("novel_base_dir")
259
+ if current == str(resolved_path):
260
+ logger.debug(f"meta.json already has correct path for '{title}'; no update needed.")
261
+ return
262
+
263
+ # Update and persist
264
+ meta.setdefault(title, {})["novel_base_dir"] = str(resolved_path)
265
+ meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
266
+ logger.info(f"Recorded/updated novel dir in {meta_path}: {resolved_path}")
267
+
268
+ except Exception as exc:
269
+ logger.warning(f"Unable to update {meta_path}: {exc}")