web-novel-scraper 1.1.1__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- web_novel_scraper/__main__.py +116 -94
- web_novel_scraper/config_manager.py +84 -0
- web_novel_scraper/decode.py +30 -44
- web_novel_scraper/decode_guide/decode_guide.json +47 -0
- web_novel_scraper/file_manager.py +226 -257
- web_novel_scraper/novel_scraper.py +64 -41
- web_novel_scraper/request_manager.py +2 -2
- web_novel_scraper/utils.py +132 -2
- web_novel_scraper/version.py +1 -1
- {web_novel_scraper-1.1.1.dist-info → web_novel_scraper-2.0.0.dist-info}/METADATA +1 -1
- web_novel_scraper-2.0.0.dist-info/RECORD +19 -0
- web_novel_scraper-1.1.1.dist-info/RECORD +0 -18
- {web_novel_scraper-1.1.1.dist-info → web_novel_scraper-2.0.0.dist-info}/WHEEL +0 -0
- {web_novel_scraper-1.1.1.dist-info → web_novel_scraper-2.0.0.dist-info}/entry_points.txt +0 -0
@@ -1,201 +1,174 @@
|
|
1
|
-
import os
|
2
1
|
import json
|
3
|
-
import sys
|
4
2
|
|
5
|
-
import platformdirs
|
6
3
|
from pathlib import Path
|
7
|
-
import shutil
|
8
|
-
from dotenv import load_dotenv
|
9
4
|
from ebooklib import epub
|
5
|
+
from typing import Optional, Dict
|
10
6
|
import unicodedata
|
11
7
|
|
12
|
-
from . import
|
8
|
+
from .logger_manager import create_logger
|
9
|
+
from .utils import _normalize_dirname, FileOps, now_iso, FileManagerError
|
13
10
|
|
14
|
-
|
11
|
+
NOVEL_JSON_FILENAME = 'main.json'
|
12
|
+
NOVEL_COVER_FILENAME = 'cover.jpg'
|
15
13
|
|
16
|
-
|
17
|
-
app_name = "web-novel-scraper"
|
14
|
+
logger = create_logger('FILE MANAGER')
|
18
15
|
|
19
16
|
|
20
|
-
CURRENT_DIR = Path(__file__).resolve().parent
|
21
|
-
|
22
|
-
SCRAPER_BASE_CONFIG_DIR = os.getenv(
|
23
|
-
'SCRAPER_BASE_CONFIG_DIR', platformdirs.user_config_dir(app_name, app_author))
|
24
|
-
SCRAPER_BASE_DATA_DIR = os.getenv(
|
25
|
-
'SCRAPER_BASE_DATA_DIR', platformdirs.user_data_dir(app_name, app_author))
|
26
|
-
|
27
|
-
logger = logger_manager.create_logger('FILE MANAGER')
|
28
|
-
|
29
17
|
class FileManager:
|
30
18
|
novel_base_dir: Path
|
31
19
|
novel_data_dir: Path
|
32
|
-
novel_config_dir: Path
|
33
20
|
novel_chapters_dir: Path
|
21
|
+
novel_toc_dir: Path
|
34
22
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
novel_json_filename: str = "main.json"
|
39
|
-
novel_cover_filename: str = "cover.jpg"
|
40
|
-
toc_preffix: str = "toc"
|
23
|
+
novel_json_file: Path
|
24
|
+
novel_cover_file: Path = None
|
41
25
|
|
42
26
|
def __init__(self,
             title: str,
             base_novels_dir: str,
             novel_base_dir: Optional[str] = None,
             read_only: bool = False) -> None:
    """Resolve the on-disk layout for a novel and optionally create it.

    Layout, relative to the resolved novel base directory:
    ``data/`` holds the novel JSON and cover, ``data/chapters/`` the
    chapter files and ``data/toc/`` the TOC fragments.

    Args:
        title: Novel title, used to look up or derive the base directory.
        base_novels_dir: Directory under which novels are stored.
        novel_base_dir: Explicit base directory for this novel; when falsy
            the directory is resolved by ``_get_novel_base_dir``.
        read_only: When True, no directory is created and nothing is
            recorded; only the path attributes are set.
    """
    logger.debug(f'Initializing FileManager for novel: {title}')
    self.novel_base_dir = self._get_novel_base_dir(title, base_novels_dir, novel_base_dir)
    logger.debug(f'Novel base dir: {self.novel_base_dir}')

    data_dir = self.novel_base_dir / 'data'
    self.novel_data_dir = data_dir
    self.novel_chapters_dir = data_dir / 'chapters'
    self.novel_toc_dir = data_dir / "toc"
    self.novel_json_file = data_dir / NOVEL_JSON_FILENAME
    self.novel_cover_file = data_dir / NOVEL_COVER_FILENAME

    if read_only:
        return

    FileOps.ensure_dir(self.novel_base_dir)
    # The resolver treats every falsy value (None, '') as "not supplied",
    # so use the same test here; otherwise a path derived for '' would be
    # created but never recorded.
    if not novel_base_dir:
        self._store_novel_base_dir(title, self.novel_base_dir, base_novels_dir)
    FileOps.ensure_dir(self.novel_data_dir)
    FileOps.ensure_dir(self.novel_chapters_dir)
    FileOps.ensure_dir(self.novel_toc_dir)
|
47
|
+
|
48
|
+
def save_chapter_html(self, chapter_filename: str, content: str) -> None:
    """Write chapter HTML into the chapters directory.

    The text is normalized to Unicode NFKC form before it is handed to
    ``FileOps.save_text``.

    Args:
        chapter_filename: File name relative to ``novel_chapters_dir``.
        content: Chapter HTML to store.
    """
    target = self.novel_chapters_dir / chapter_filename
    logger.debug(f'Saving chapter to {target}')
    normalized = unicodedata.normalize('NFKC', content)
    FileOps.save_text(target, normalized)
|
108
53
|
|
109
|
-
def
|
110
|
-
|
111
|
-
|
54
|
+
def chapter_file_exists(self, chapter_filename: str) -> bool:
    """Return True when *chapter_filename* exists in the chapters directory."""
    return (self.novel_chapters_dir / chapter_filename).exists()
|
112
57
|
|
113
|
-
def
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
58
|
+
def load_chapter_html(self, chapter_filename: str) -> Optional[str]:
    """Read a stored chapter and return its text.

    Returns whatever ``FileOps.read_text`` yields for the chapter path.
    A falsy result (presumably None for an unreadable file, but also an
    empty string) is logged at debug level and still returned unchanged.
    """
    source = self.novel_chapters_dir / chapter_filename
    logger.debug(f'Loading chapter from {source}')
    text = FileOps.read_text(source)
    if text:
        return text
    logger.debug(f'Chapter content not found: {chapter_filename}')
    return text
|
65
|
+
|
66
|
+
def delete_chapter_html(self, chapter_filename: str) -> None:
    """Delete a chapter file from the chapters directory via ``FileOps.delete``."""
    logger.debug(f'Attempting to delete chapter: {chapter_filename}')
    doomed = self.novel_chapters_dir / chapter_filename
    FileOps.delete(doomed)
|
70
|
+
|
71
|
+
def save_novel_json(self, novel_data: dict) -> None:
    """Persist *novel_data* to the novel JSON file (``novel_json_file``)."""
    destination = self.novel_json_file
    logger.debug(f'Saving novel data to {destination}')
    FileOps.save_json(destination, novel_data)
|
74
|
+
|
75
|
+
def load_novel_json(self) -> Optional[str]:
    """Return the novel JSON file's content as text.

    The file is read with ``FileOps.read_text`` and is not parsed here.
    A None result is logged at debug level and returned as is.
    """
    source = self.novel_json_file
    logger.debug(f'Loading novel data from {source}')
    raw = FileOps.read_text(source)
    if raw is not None:
        return raw
    logger.debug('Could not read novel JSON file')
    return raw
|
81
|
+
|
82
|
+
def save_novel_cover(self, source_cover_path: str) -> None:
    """Copy an existing image file to the novel's cover location.

    Args:
        source_cover_path: Path of the cover image to copy.

    Raises:
        ValueError: If *source_cover_path* does not exist.
    """
    cover_src = Path(source_cover_path)
    logger.debug(f'Attempting to save cover from {cover_src}')
    if cover_src.exists():
        FileOps.copy(cover_src, self.novel_cover_file)
        return
    logger.critical(f'No cover found on {cover_src}')
    raise ValueError(f'No cover found on {cover_src}')
|
89
|
+
|
90
|
+
def load_novel_cover(self) -> Optional[bytes]:
    """Return the cover image bytes, or None when unavailable.

    Returns None immediately when ``novel_cover_file`` is unset; otherwise
    returns the result of ``FileOps.read_binary``, logging at debug level
    when that result is None.
    """
    cover_path = self.novel_cover_file
    if cover_path is None:
        logger.debug(f'No cover found')
        return None

    logger.debug(f'Loading cover from {cover_path}')
    data = FileOps.read_binary(cover_path)
    if data is None:
        logger.debug(f'Could not read cover from {cover_path}')
    return data
|
99
|
+
|
100
|
+
## TOC API
|
101
|
+
|
102
|
+
def add_toc(self, html: str) -> int:
    """Store *html* as a new TOC fragment and return the index assigned to it.

    The fragment is written to ``toc_<index>.html`` in the TOC directory
    and an entry with the file name and a timestamp is appended to the
    TOC index.
    """
    new_idx = self._next_toc_idx()
    toc_path = self.novel_toc_dir / f"toc_{new_idx}.html"
    FileOps.save_text(toc_path, html)

    index = self._load_toc_index()
    entries = index["entries"]
    entries.append({"file": toc_path.name, "updated": now_iso()})
    self._store_toc_index(index)

    logger.debug(f"Added TOC #{new_idx} → {toc_path}")
    return new_idx
|
114
|
+
|
115
|
+
def update_toc(self, idx: int, html: str) -> None:
    """Overwrite TOC fragment *idx* with *html* and refresh its index entry.

    Raises:
        FileManagerError: If ``toc_<idx>.html`` does not exist.
    """
    toc_path = self.novel_toc_dir / f"toc_{idx}.html"
    if not toc_path.exists():
        raise FileManagerError(f"TOC #{idx} not found")

    FileOps.save_text(toc_path, html)

    index = self._load_toc_index()
    # Only the first entry that names this file gets a new timestamp.
    hit = next((e for e in index["entries"] if e["file"] == toc_path.name), None)
    if hit is not None:
        hit["updated"] = now_iso()
    self._store_toc_index(index)
    logger.debug(f"Updated TOC #{idx}")
|
129
|
+
|
130
|
+
def delete_toc(self, idx: Optional[int] = None) -> None:
    """Remove one TOC fragment, or every indexed fragment when *idx* is None.

    The fragment files are deleted first, then the matching entries are
    dropped from the TOC index and the index is stored again.
    """
    index = self._load_toc_index()

    if idx is not None:
        doomed = self.novel_toc_dir / f"toc_{idx}.html"
        FileOps.delete(doomed)
        logger.debug(f"Deleted {doomed}")
        index["entries"] = [
            entry for entry in index["entries"] if entry["file"] != doomed.name
        ]
    else:
        # Delete every file the index knows about, then empty the index.
        for entry in index["entries"]:
            doomed = self.novel_toc_dir / entry["file"]
            FileOps.delete(doomed)
            logger.debug(f"Deleted {doomed}")
        index["entries"] = []

    self._store_toc_index(index)
|
149
|
+
|
150
|
+
def get_toc(self, idx: int) -> Optional[str]:
    """Return the HTML of TOC fragment *idx* as read by ``FileOps.read_text``."""
    toc_path = self.novel_toc_dir / f"toc_{idx}.html"
    return FileOps.read_text(toc_path)
|
153
|
+
|
154
|
+
def get_all_toc(self) -> list[str]:
    """Return the HTML of every indexed TOC fragment, in index order.

    Fragments for which ``FileOps.read_text`` returns None are skipped.
    """
    index = self._load_toc_index()
    fragments = (
        FileOps.read_text(self.novel_toc_dir / entry["file"])
        for entry in index["entries"]
    )
    return [html for html in fragments if html is not None]
|
190
163
|
|
191
164
|
def save_book(self, book: epub.EpubBook, filename: str) -> bool:
|
192
165
|
book_path = self.novel_base_dir / filename
|
193
166
|
logger.debug(f'Attempting to save book to {book_path}')
|
194
|
-
try:
|
167
|
+
try:
|
195
168
|
epub.write_epub(str(book_path), book)
|
196
169
|
logger.info(f'Book saved successfully to {book_path}')
|
197
170
|
return True
|
198
|
-
|
171
|
+
|
199
172
|
except PermissionError as e:
|
200
173
|
logger.error(f'Permission denied when saving book to {book_path}: {e}')
|
201
174
|
return False
|
@@ -206,95 +179,91 @@ class FileManager:
|
|
206
179
|
logger.critical(f'Unexpected error saving book to {book_path}: {e}')
|
207
180
|
return False
|
208
181
|
|
209
|
-
def
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
except Exception as e:
|
298
|
-
logger.error(f'Unexpected error copying file from {source} to {destination}: {e}',
|
299
|
-
exc_info=True)
|
300
|
-
return False
|
182
|
+
def _load_toc_index(self) -> dict:
    """Return the contents of ``toc.json`` from the TOC directory.

    When ``FileOps.read_json`` yields a falsy value, a fresh structure with
    an ``updated`` timestamp and an empty ``entries`` list is returned
    instead; nothing is written to disk here.
    """
    stored = FileOps.read_json(self.novel_toc_dir / "toc.json")
    if stored:
        return stored
    return {
        "updated": now_iso(),
        "entries": [],
    }
|
189
|
+
|
190
|
+
def _store_toc_index(self, idx: dict) -> None:
    """Stamp *idx* with the current time and write it to ``toc.json``.

    The ``updated`` key of the passed dict is overwritten in place.
    """
    index_path = self.novel_toc_dir / "toc.json"
    idx["updated"] = now_iso()
    FileOps.save_json(index_path, idx)
|
194
|
+
|
195
|
+
def _next_toc_idx(self) -> int:
    """Return the next free TOC index.

    Scans the TOC directory for ``toc_<number>.html`` files and returns the
    highest number found plus one, or 0 when there is none.  Files that
    match the glob but do not carry a purely numeric suffix (for example
    ``toc_backup.html``) are ignored instead of raising ``ValueError``.
    """
    highest = -1
    for path in self.novel_toc_dir.glob("toc_*.html"):
        suffix = path.stem.removeprefix("toc_")
        # Only names of the exact form toc_<digits>.html count as fragments.
        if suffix.isdecimal():
            highest = max(highest, int(suffix))
    return highest + 1
|
200
|
+
|
201
|
+
@staticmethod
def _get_novel_base_dir(
        title: str,
        base_novels_dir: str,
        novel_base_dir: str | None = None
) -> Path:
    """Work out the base directory for *title*; nothing is created or written.

    Resolution order:
      1. A truthy *novel_base_dir* argument is returned as a ``Path``.
      2. The ``novel_base_dir`` value stored for *title* in
         ``<base_novels_dir>/meta.json``, when present and truthy.
      3. ``<base_novels_dir>/<normalized title>``.

    Raises:
        FileManagerError: If *base_novels_dir* does not exist. This check
            runs first, even when *novel_base_dir* is given.
    """
    root = Path(base_novels_dir)
    if not root.exists():
        raise FileManagerError(f"Base novels directory does not exist: {root}")

    # 1. Explicit path from the caller.
    if novel_base_dir:
        return Path(novel_base_dir)

    # 2. Previously recorded location.
    meta_path = root / "meta.json"
    if meta_path.exists():
        try:
            meta = FileOps.read_json(meta_path)
            recorded = meta[title].get("novel_base_dir") if title in meta else None
            if recorded:
                return Path(recorded)
        except Exception as exc:  # unreadable or unexpected content: fall through
            logger.warning(f"Failed to read {meta_path}: {exc}")

    # 3. Derive a directory name from the title.
    return root / _normalize_dirname(title)
|
237
|
+
|
238
|
+
@staticmethod
def _store_novel_base_dir(
        title: str,
        resolved_path: Path,
        base_novels_dir: str,
) -> None:
    """Record ``title -> resolved_path`` in ``<base_novels_dir>/meta.json``.

    Best effort: any failure is logged as a warning and never raised.
    Existing metadata that cannot be loaded as a JSON object is discarded
    and the file is regenerated, so a broken meta.json no longer blocks
    the update.  The file is left untouched when it already holds the
    same path for *title*.
    """
    meta_path = Path(base_novels_dir) / "meta.json"
    try:
        meta: Dict[str, Dict[str, str]] = {}
        if meta_path.exists():
            try:
                loaded = FileOps.read_json(meta_path)
            except Exception as exc:
                logger.warning(f"meta.json corrupted, regenerating: {exc}")
            else:
                # read_json may hand back None or a non-object value;
                # start from an empty mapping in that case.
                if isinstance(loaded, dict):
                    meta = loaded
                else:
                    logger.warning(f"meta.json corrupted, regenerating: unexpected content in {meta_path}")

        entry = meta.get(title)
        if not isinstance(entry, dict):
            entry = {}
            meta[title] = entry

        # Skip the write when the stored path is already correct.
        if entry.get("novel_base_dir") == str(resolved_path):
            logger.debug(f"meta.json already has correct path for '{title}'; no update needed.")
            return

        entry["novel_base_dir"] = str(resolved_path)
        meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
        logger.info(f"Recorded/updated novel dir in {meta_path}: {resolved_path}")

    except Exception as exc:
        logger.warning(f"Unable to update {meta_path}: {exc}")
|