web-novel-scraper 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,213 @@
1
+ [
2
+ {
3
+ "host": "default",
4
+ "has_pagination": false,
5
+ "title": {
6
+ "element": "h4",
7
+ "id": null,
8
+ "class": null,
9
+ "selector": null,
10
+ "attributes": null,
11
+ "array": true
12
+ },
13
+ "content": {
14
+ "element": "p",
15
+ "id": null,
16
+ "class": null,
17
+ "selector": null,
18
+ "attributes": null,
19
+ "array": true
20
+ },
21
+ "index": {
22
+ "element": "p",
23
+ "id": null,
24
+ "class": null,
25
+ "selector": null,
26
+ "attributes": null,
27
+ "array": true
28
+ },
29
+ "next_page": {
30
+ "element": "p",
31
+ "id": null,
32
+ "class": null,
33
+ "selector": null,
34
+ "attributes": null,
35
+ "array": true
36
+ }
37
+ },
38
+ {
39
+ "host": "novellive.net",
40
+ "has_pagination": true,
41
+ "title": {
42
+ "element": "h4",
43
+ "id": null,
44
+ "class": null,
45
+ "selector": null,
46
+ "attributes": null,
47
+ "array": false
48
+ },
49
+ "content": {
50
+ "element": "p",
51
+ "id": null,
52
+ "class": null,
53
+ "selector": null,
54
+ "attributes": null,
55
+ "array": true
56
+ },
57
+ "index": {
58
+ "element": null,
59
+ "id": null,
60
+ "class": null,
61
+ "selector": "div.m-newest2 ul li a",
62
+ "attributes": null,
63
+ "array": true
64
+ },
65
+ "next_page": {
66
+ "element": null,
67
+ "id": null,
68
+ "class": null,
69
+ "selector": "div.page > a.index-container-btn[href*='novellive']:nth-last-of-type(2)",
70
+ "attributes": null,
71
+ "array": true
72
+ }
73
+ },
74
+ {
75
+ "host": "royalroad.com",
76
+ "has_pagination": false,
77
+ "title": {
78
+ "element": null,
79
+ "id": null,
80
+ "class": null,
81
+ "selector": "h1.break-word",
82
+ "attributes": null,
83
+ "array": false,
84
+ "extract": {
85
+ "type": "text",
86
+ "key": "text"
87
+ }
88
+ },
89
+ "content": {
90
+ "element": null,
91
+ "id": null,
92
+ "class": null,
93
+ "selector": "p:not(div.author-note p):not(div.page-prefooter p):not(div.profile-info p) XOR div.chapter-content",
94
+ "attributes": null,
95
+ "array": true
96
+ },
97
+ "index": {
98
+ "element": null,
99
+ "id": null,
100
+ "class": null,
101
+ "selector": "tr.chapter-row td a",
102
+ "attributes": null,
103
+ "array": true
104
+ },
105
+ "next_page": {
106
+ "element": null,
107
+ "id": null,
108
+ "class": null,
109
+ "selector": null,
110
+ "attributes": null,
111
+ "array": true
112
+ }
113
+ },
114
+ {
115
+ "host": "novelbin.me",
116
+ "has_pagination": false,
117
+ "title": {
118
+ "element": "h2 a.chr-title",
119
+ "id": null,
120
+ "class": null,
121
+ "selector": null,
122
+ "attributes": null,
123
+ "array": false,
124
+ "extract": {
125
+ "type": "attr",
126
+ "key": "title"
127
+ }
128
+ },
129
+ "content": {
130
+ "element": "div#chr-content",
131
+ "id": null,
132
+ "class": null,
133
+ "selector": null,
134
+ "attributes": null,
135
+ "array": true
136
+ },
137
+ "index": {
138
+ "element": null,
139
+ "id": null,
140
+ "class": null,
141
+ "selector": "ul.list-chapter li a",
142
+ "attributes": null,
143
+ "array": true
144
+ },
145
+ "next_page": {
146
+ "element": null,
147
+ "id": null,
148
+ "class": null,
149
+ "selector": null,
150
+ "attributes": null,
151
+ "array": true
152
+ }
153
+ },
154
+ {
155
+ "host": "novelbin.com",
156
+ "has_pagination": false,
157
+ "title": {
158
+ "element": "h2 a.chr-title",
159
+ "id": null,
160
+ "class": null,
161
+ "selector": null,
162
+ "attributes": null,
163
+ "array": false,
164
+ "extract": {
165
+ "type": "attr",
166
+ "key": "title"
167
+ }
168
+ },
169
+ "content": {
170
+ "element": "div#chr-content",
171
+ "id": null,
172
+ "class": null,
173
+ "selector": null,
174
+ "attributes": null,
175
+ "array": true
176
+ },
177
+ "index": {
178
+ "element": null,
179
+ "id": null,
180
+ "class": null,
181
+ "selector": "ul.list-chapter li a",
182
+ "attributes": null,
183
+ "array": true
184
+ },
185
+ "next_page": {
186
+ "element": null,
187
+ "id": null,
188
+ "class": null,
189
+ "selector": null,
190
+ "attributes": null,
191
+ "array": true
192
+ }
193
+ },
194
+ {
195
+ "host": "hiraethtranslation.com",
196
+ "has_pagination": false,
197
+ "title": {
198
+ "element": "h1",
199
+ "extract": {
200
+ "type": "text"
201
+ }
202
+ },
203
+ "content": {
204
+ "element": "div.reading-content p",
205
+ "array": true
206
+ },
207
+ "index": {
208
+ "element": "ul.main li a",
209
+ "array": true,
210
+ "inverted": true
211
+ }
212
+ }
213
+ ]
@@ -0,0 +1,292 @@
1
+ import os
2
+ import json
3
+ import sys
4
+
5
+ import platformdirs
6
+ from pathlib import Path
7
+ import shutil
8
+ from dotenv import load_dotenv
9
+ from ebooklib import epub
10
+
11
+ from . import logger_manager
12
+
13
+ load_dotenv()
14
+
15
+ app_author = "ImagineBrkr"
16
+ app_name = "web-novel-scraper"
17
+
18
+
19
+ CURRENT_DIR = Path(__file__).resolve().parent
20
+
21
+ SCRAPER_BASE_CONFIG_DIR = os.getenv(
22
+ 'SCRAPER_BASE_CONFIG_DIR', platformdirs.user_config_dir(app_name, app_author))
23
+ SCRAPER_BASE_DATA_DIR = os.getenv(
24
+ 'SCRAPER_BASE_DATA_DIR', platformdirs.user_data_dir(app_name, app_author))
25
+
26
+ logger = logger_manager.create_logger('FILE MANAGER')
27
+
28
+ class FileManager:
29
+ novel_base_dir: Path
30
+ novel_data_dir: Path
31
+ novel_config_dir: Path
32
+ novel_chapters_dir: Path
33
+
34
+ novel_json_filepath: Path
35
+ novel_cover_filepath: Path
36
+
37
+ novel_json_filename: str = "main.json"
38
+ novel_cover_filename: str = "cover.jpg"
39
+ toc_preffix: str = "toc"
40
+
41
+ def __init__(self,
42
+ novel_title: str,
43
+ novel_base_dir: str = None,
44
+ novel_config_dir: str = None,
45
+ read_only: bool = False):
46
+ logger.debug(f'Initializing FileManager for novel: {novel_title}, read_only: {read_only}')
47
+ novel_base_dir = novel_base_dir if novel_base_dir else f'{
48
+ SCRAPER_BASE_DATA_DIR}/{novel_title}'
49
+ novel_config_dir = novel_config_dir if novel_config_dir else f'{
50
+ SCRAPER_BASE_CONFIG_DIR}/{novel_title}'
51
+
52
+ logger.debug(f'Using base dir: {novel_base_dir}, config dir: {novel_config_dir}')
53
+
54
+ if read_only:
55
+ self.novel_base_dir = _check_path(novel_base_dir)
56
+ self.novel_data_dir = _check_path(f'{novel_base_dir}/data')
57
+ self.novel_chapters_dir = _check_path(f'{self.novel_data_dir}/chapters')
58
+ self.novel_config_dir = _check_path(str(novel_config_dir))
59
+ logger.info(f'Initialized read-only FileManager for {novel_title}')
60
+ else:
61
+ try:
62
+ self.novel_base_dir = _create_path_if_not_exists(novel_base_dir)
63
+ self.novel_data_dir = _create_path_if_not_exists(
64
+ f'{novel_base_dir}/data')
65
+ self.novel_chapters_dir = _create_path_if_not_exists(
66
+ f'{self.novel_data_dir}/chapters')
67
+ self.novel_config_dir = _create_path_if_not_exists(novel_config_dir)
68
+ logger.info(f'Created directory structure for novel: {novel_title}')
69
+ except Exception as e:
70
+ logger.critical(f'Failed to create directory structure: {e}')
71
+ raise
72
+
73
+ self.novel_json_filepath = self.novel_data_dir / self.novel_json_filename
74
+ self.novel_cover_filepath = self.novel_data_dir / self.novel_cover_filename
75
+ logger.debug(f'Set json path: {self.novel_json_filepath}, cover path: {self.novel_cover_filepath}')
76
+
77
+ def save_chapter_html(self, filename: str, content: str):
78
+ full_path = self.novel_chapters_dir / filename
79
+ logger.debug(f'Saving chapter to {full_path}')
80
+ _save_content_to_file(full_path, content)
81
+
82
+ def load_chapter_html(self, filename: str):
83
+ full_path = self.novel_chapters_dir / filename
84
+ logger.debug(f'Loading chapter from {full_path}')
85
+ if full_path.exists():
86
+ return _read_content_from_file(full_path)
87
+ logger.warning(f'Chapter file not found: {filename}')
88
+ return None
89
+
90
+ def delete_chapter_html(self, filename: str):
91
+ full_path = self.novel_chapters_dir / filename
92
+ logger.debug(f'Attempting to delete chapter: {filename}')
93
+ if full_path.exists():
94
+ _delete_file(full_path)
95
+ else:
96
+ logger.warning(f'Chapter file not found for deletion: {filename}')
97
+
98
+ def save_novel_json(self, novel_data: dict):
99
+ logger.debug(f'Saving novel data to {self.novel_json_filepath}')
100
+ _save_content_to_file(self.novel_json_filepath, novel_data, is_json=True)
101
+
102
+ def load_novel_json(self):
103
+ logger.debug(f'Loading novel data from {self.novel_json_filepath}')
104
+ if self.novel_json_filepath.exists():
105
+ return _read_content_from_file(self.novel_json_filepath)
106
+ logger.warning('Novel JSON file not found')
107
+
108
+ def save_novel_cover(self, source_cover_path: str):
109
+ source_cover_path = Path(source_cover_path)
110
+ logger.debug(f'Attempting to save cover from {source_cover_path}')
111
+ if source_cover_path.exists():
112
+ return _copy_file(source_cover_path, self.novel_cover_filepath)
113
+ logger.error(f'Source cover path {source_cover_path} not found')
114
+ return False
115
+
116
+ def load_novel_cover(self):
117
+ logger.debug(f'Loading cover from {self.novel_cover_filepath}')
118
+ if self.novel_cover_filepath.exists():
119
+ return _read_content_from_file(self.novel_cover_filepath, bytes=True)
120
+ logger.warning('Cover file not found')
121
+
122
+ def delete_toc(self):
123
+ logger.debug('Starting TOC deletion process')
124
+ toc_pos = 0
125
+ toc_exists = True
126
+ deleted_count = 0
127
+ while toc_exists:
128
+ toc_filename = f"{self.toc_preffix}_{toc_pos}.html"
129
+ toc_path = self.novel_data_dir / toc_filename
130
+ toc_exists = toc_path.exists()
131
+ if toc_exists:
132
+ _delete_file(toc_path)
133
+ deleted_count += 1
134
+ toc_pos += 1
135
+ logger.info(f'Deleted {deleted_count} TOC files')
136
+
137
+ def add_toc(self, content: str):
138
+ logger.debug('Adding new TOC entry')
139
+ toc_pos = 0
140
+ toc_exists = True
141
+ while toc_exists:
142
+ toc_filename = f"{self.toc_preffix}_{toc_pos}.html"
143
+ toc_path = self.novel_data_dir / toc_filename
144
+ toc_exists = toc_path.exists()
145
+ if toc_exists:
146
+ toc_pos += 1
147
+ _save_content_to_file(toc_path, content)
148
+ logger.info(f'Added TOC entry at position {toc_pos}')
149
+
150
+ def update_toc(self, content: str, toc_idx: int):
151
+ toc_filename = f"{self.toc_preffix}_{toc_idx}.html"
152
+ toc_path = self.novel_data_dir / toc_filename
153
+ logger.debug(f'Updating TOC at index {toc_idx}')
154
+ if toc_path.exists():
155
+ _save_content_to_file(toc_path, content)
156
+ else:
157
+ logger.error(f'TOC file not found: {toc_path}')
158
+
159
+ def get_toc(self, pos_idx: int):
160
+ toc_filename = f"{self.toc_preffix}_{pos_idx}.html"
161
+ toc_path = self.novel_data_dir / toc_filename
162
+ logger.debug(f'Loading TOC at index {pos_idx}')
163
+ if toc_path.exists():
164
+ return _read_content_from_file(toc_path)
165
+ logger.debug(f'No TOC found at index {pos_idx}')
166
+
167
+ def get_all_toc(self):
168
+ logger.debug('Loading all TOC entries')
169
+ pos = 0
170
+ tocs = []
171
+ while True:
172
+ toc_content = self.get_toc(pos)
173
+ if toc_content:
174
+ tocs.append(toc_content)
175
+ pos += 1
176
+ else:
177
+ logger.info(f'Found {len(tocs)} TOC entries')
178
+ return tocs
179
+
180
+ def save_book(self, book: epub.EpubBook, filename: str) -> bool:
181
+ book_path = self.novel_base_dir / filename
182
+ logger.debug(f'Attempting to save book to {book_path}')
183
+ try:
184
+ epub.write_epub(str(book_path), book)
185
+ logger.info(f'Book saved successfully to {book_path}')
186
+ return True
187
+
188
+ except PermissionError as e:
189
+ logger.error(f'Permission denied when saving book to {book_path}: {e}')
190
+ return False
191
+ except OSError as e:
192
+ logger.error(f'OS error when saving book to {book_path}: {e}')
193
+ return False
194
+ except Exception as e:
195
+ logger.critical(f'Unexpected error saving book to {book_path}: {e}')
196
+ return False
197
+
198
+ def _check_path(dir_path: str) -> Path:
199
+ try:
200
+ dir_path = Path(dir_path)
201
+ return dir_path
202
+ except TypeError as e:
203
+ logger.error(f"Invalid path type: {e}")
204
+ raise
205
+ except Exception as e:
206
+ logger.error(f"Unexpected error converting path: {e}", exc_info=True)
207
+ raise
208
+
209
+ def _create_path_if_not_exists(dir_path: str) -> Path:
210
+ try:
211
+ dir_path = _check_path(dir_path)
212
+ dir_path.mkdir(parents=True, exist_ok=True)
213
+ return dir_path
214
+ except OSError as e:
215
+ logger.error(f"Error with directory creation: {e}")
216
+ # Change this to raise for debugging
217
+ sys.exit(1)
218
+ except Exception as e:
219
+ logger.error(f"Unexpected error: {e}", exc_info=True)
220
+ raise
221
+
222
+
223
+ def _save_content_to_file(filepath: Path, content: str | dict, is_json: bool = False) -> None:
224
+ try:
225
+ if is_json:
226
+ with open(filepath, 'w', encoding='utf-8') as file:
227
+ json.dump(content, file, indent=2, ensure_ascii=False)
228
+ else:
229
+ with open(filepath, 'w', encoding='UTF-8') as file:
230
+ file.write(content)
231
+ logger.info(f'File saved successfully: {filepath}')
232
+ except (OSError, IOError) as e:
233
+ logger.error(f'Error saving file "{filepath}": {e}')
234
+ except Exception as e:
235
+ logger.error(f'Unexpected error saving file "{
236
+ filepath}": {e}', exc_info=True)
237
+
238
+
239
+ def _read_content_from_file(filepath: Path, bytes: bool = False) -> str:
240
+ try:
241
+ # Read the file
242
+ read_mode = 'rb' if bytes else 'r'
243
+ encoding = None if bytes else 'utf-8'
244
+ with open(filepath, read_mode, encoding=encoding) as file:
245
+ content = file.read()
246
+ logger.info(f'File read successfully: {filepath}')
247
+ return content
248
+ except FileNotFoundError as e:
249
+ # Log if the file doesn't exist
250
+ logger.error(f'File not found: "{filepath}": {e}')
251
+ except (OSError, IOError) as e:
252
+ logger.error(f'Error reading file "{filepath}": {e}')
253
+ except Exception as e:
254
+ # Log for unexpected errors
255
+ logger.error(f'Unexpected error reading file "{
256
+ filepath}": {e}', exc_info=True)
257
+
258
+
259
+ def _delete_file(filepath: Path) -> None:
260
+ try:
261
+ # Delete the file
262
+ filepath.unlink() # Remove the file
263
+ logger.info(f'File deleted successfully: {filepath}')
264
+ except FileNotFoundError as e:
265
+ # Log if the file doesn't exist
266
+ logger.error(f'File not found for deletion: "{filepath}": {e}')
267
+ except (OSError, IOError) as e:
268
+ # Log errors related to file system operations
269
+ logger.error(f'Error deleting file "{filepath}": {e}')
270
+ except Exception as e:
271
+ # Log any unexpected errors
272
+ logger.error(f'Unexpected error deleting file "{
273
+ filepath}": {e}', exc_info=True)
274
+
275
+
276
+ def _copy_file(source: Path, destination: Path) -> bool:
277
+ try:
278
+ # Copy the file
279
+ shutil.copy(source, destination)
280
+ logger.info(f'File copied successfully from {source} to {destination}')
281
+ return True
282
+
283
+ except FileNotFoundError:
284
+ logger.error(f'Source file not found: {source}')
285
+ except PermissionError as e:
286
+ logger.error(f'Permission denied when copying file: {e}')
287
+ except shutil.SameFileError:
288
+ logger.warning(f'Source and destination are the same file: {source}')
289
+ except Exception as e:
290
+ logger.error(f'Unexpected error copying file from {source} to {destination}: {e}',
291
+ exc_info=True)
292
+ return False
@@ -0,0 +1,72 @@
1
+ import logging
2
+ import os
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
# Level names accepted in SCRAPER_LOGGING_LEVEL. "DEFAULT" sits one step above
# CRITICAL, so none of the standard levels reach it.
logging_levels = {
    "DEBUG": logging.DEBUG,
    "INFO": logging.INFO,
    "WARNING": logging.WARNING,
    "ERROR": logging.ERROR,
    "CRITICAL": logging.CRITICAL,
    "DEFAULT": logging.CRITICAL + 1
}
LOGGING_LEVEL = os.environ.get('SCRAPER_LOGGING_LEVEL', 'DEFAULT').upper()
# Optional log file path; when unset, records go to a stream handler instead.
LOGGING_FILE = os.environ.get('SCRAPER_LOGGING_FILE', None)

# Translate the level name to its numeric value; unknown names fall back to
# the "DEFAULT" level.
LOGGING_LEVEL = logging_levels.get(LOGGING_LEVEL, logging_levels['DEFAULT'])

# Name of the underlying logger used by create_logger().
process = "main"
23
+
24
+ class CustomFormatter(logging.Formatter):
25
+
26
+ grey = "\x1b[38;20m"
27
+ yellow = "\x1b[33;20m"
28
+ red = "\x1b[31;20m"
29
+ bold_red = "\x1b[31;1m"
30
+ green = "\x1b[32;20m"
31
+ reset = "\x1b[0m"
32
+ format_str = f"%(asctime)s - %(levelname)s {reset}- %(operation)s - %(message)s"
33
+
34
+ FORMATS = {
35
+ logging.DEBUG: grey + format_str + reset,
36
+ logging.INFO: green + format_str + reset,
37
+ logging.WARNING: yellow + format_str + reset,
38
+ logging.ERROR: red + format_str + reset,
39
+ logging.CRITICAL: bold_red + format_str + reset
40
+ }
41
+
42
+ def format(self, record):
43
+ if LOGGING_FILE:
44
+ log_fmt = self.format_str.replace(self.reset, '')
45
+ else:
46
+ log_fmt = self.FORMATS.get(record.levelno)
47
+ formatter = logging.Formatter(log_fmt)
48
+ return formatter.format(record)
49
+
50
+
51
+ def create_logger(operation):
52
+ logger = logging.getLogger(process)
53
+ logger.setLevel(LOGGING_LEVEL)
54
+
55
+ if not logger.handlers:
56
+ if LOGGING_FILE:
57
+ lh = logging.FileHandler(LOGGING_FILE, encoding='utf-8')
58
+ else:
59
+ lh = logging.StreamHandler()
60
+ lh.setLevel(LOGGING_LEVEL)
61
+ lh.setFormatter(CustomFormatter())
62
+ logger.addHandler(lh)
63
+
64
+ extra = {'operation': operation}
65
+ logger = logging.LoggerAdapter(logger, extra)
66
+
67
+ return logger
68
+
69
+
70
+ def set_process(new_process):
71
+ global process
72
+ process = new_process