web-novel-scraper 2.0.2__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- web_novel_scraper/__main__.py +123 -68
- web_novel_scraper/config_manager.py +12 -12
- web_novel_scraper/decode.py +225 -80
- web_novel_scraper/decode_guide/decode_guide.json +29 -0
- web_novel_scraper/file_manager.py +292 -110
- web_novel_scraper/models.py +76 -0
- web_novel_scraper/novel_scraper.py +895 -424
- web_novel_scraper/request_manager.py +50 -17
- web_novel_scraper/utils.py +22 -1
- web_novel_scraper/version.py +1 -1
- {web_novel_scraper-2.0.2.dist-info → web_novel_scraper-2.1.0.dist-info}/METADATA +1 -1
- web_novel_scraper-2.1.0.dist-info/RECORD +20 -0
- web_novel_scraper-2.0.2.dist-info/RECORD +0 -19
- {web_novel_scraper-2.0.2.dist-info → web_novel_scraper-2.1.0.dist-info}/WHEEL +0 -0
- {web_novel_scraper-2.0.2.dist-info → web_novel_scraper-2.1.0.dist-info}/entry_points.txt +0 -0
web_novel_scraper/decode.py
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
import json
|
2
2
|
from typing import Optional
|
3
3
|
|
4
|
+
from pathlib import Path
|
5
|
+
|
4
6
|
from . import logger_manager
|
5
7
|
from .custom_processor.custom_processor import ProcessorRegistry
|
6
|
-
from .utils import FileOps
|
8
|
+
from .utils import FileOps, DecodeError, ValidationError
|
7
9
|
|
8
10
|
from bs4 import BeautifulSoup
|
9
11
|
|
@@ -18,23 +20,48 @@ DEFAULT_REQUEST_CONFIG = {
|
|
18
20
|
"request_time_between_retries": 3
|
19
21
|
}
|
20
22
|
|
23
|
+
|
24
|
+
class HTMLParseError(DecodeError):
|
25
|
+
"""Raised when HTML parsing fails"""
|
26
|
+
|
27
|
+
|
28
|
+
class DecodeGuideError(DecodeError):
|
29
|
+
"""Raised when there are issues with decode guide configuration"""
|
30
|
+
|
31
|
+
|
32
|
+
class ContentExtractionError(DecodeError):
|
33
|
+
"""Raised when content extraction fails"""
|
34
|
+
|
35
|
+
|
21
36
|
class Decoder:
|
22
37
|
host: str
|
23
|
-
decode_guide_file:
|
38
|
+
decode_guide_file: Path
|
24
39
|
decode_guide: json
|
25
40
|
request_config: dict
|
26
41
|
|
27
|
-
def __init__(self, host: str, decode_guide_file:
|
42
|
+
def __init__(self, host: str, decode_guide_file: Path):
|
28
43
|
self.decode_guide_file = decode_guide_file
|
29
44
|
self.set_host(host)
|
30
45
|
|
31
46
|
def set_host(self, host: str) -> None:
|
32
47
|
self.host = host
|
33
|
-
|
48
|
+
try:
|
49
|
+
self._set_decode_guide()
|
50
|
+
except ValidationError:
|
51
|
+
raise
|
52
|
+
|
34
53
|
host_request_config = self.get_request_config()
|
35
54
|
self.request_config = DEFAULT_REQUEST_CONFIG | host_request_config
|
36
55
|
|
37
56
|
def get_request_config(self) -> dict:
|
57
|
+
"""
|
58
|
+
Retrieves the request configuration for the current host.
|
59
|
+
|
60
|
+
Returns:
|
61
|
+
dict: Request configuration parameters for the current host.
|
62
|
+
Returns DEFAULT_REQUEST_CONFIG if no custom configuration exists.
|
63
|
+
"""
|
64
|
+
|
38
65
|
request_config = self.decode_guide.get('request_config')
|
39
66
|
if request_config:
|
40
67
|
logger.debug(f'Host "{self.host}" has a custom request configuration on the Decode Guide file.')
|
@@ -43,110 +70,192 @@ class Decoder:
|
|
43
70
|
return DEFAULT_REQUEST_CONFIG
|
44
71
|
|
45
72
|
def is_index_inverted(self) -> bool:
|
73
|
+
"""
|
74
|
+
Checks if the index order should be inverted for the current host.
|
75
|
+
|
76
|
+
Returns:
|
77
|
+
bool: True if the index should be processed in reverse order, False otherwise.
|
78
|
+
"""
|
79
|
+
|
80
|
+
logger.debug('Checking if index should be inverted...')
|
46
81
|
return self.decode_guide.get('index', {}).get('inverted', False)
|
47
82
|
|
48
83
|
def save_title_to_content(self) -> bool:
|
49
|
-
|
84
|
+
"""
|
85
|
+
Checks if the title should be included in the content for the current host.
|
86
|
+
|
87
|
+
Returns:
|
88
|
+
bool: True if the title should be saved with the content, False otherwise.
|
89
|
+
"""
|
90
|
+
logger.debug('Checking if title should be saved to content...')
|
91
|
+
try:
|
92
|
+
return self.decode_guide.get('save_title_to_content', False)
|
93
|
+
except DecodeError:
|
94
|
+
raise
|
50
95
|
|
51
96
|
def add_host_to_chapter(self) -> bool:
|
97
|
+
"""
|
98
|
+
Checks if the host information should be added to chapter url.
|
99
|
+
|
100
|
+
Returns:
|
101
|
+
bool: True if host information should be included in chapter url, False otherwise.
|
102
|
+
"""
|
103
|
+
logger.debug('Checking if host should be added to chapter url...')
|
52
104
|
return self.decode_guide.get('add_host_to_chapter', False)
|
53
105
|
|
54
106
|
def get_chapter_urls(self, html: str) -> list[str]:
|
55
|
-
|
56
|
-
|
107
|
+
"""
|
108
|
+
Extracts chapter URLs from the table of contents HTML.
|
109
|
+
|
110
|
+
Args:
|
111
|
+
html (str): The HTML content of the table of contents
|
112
|
+
|
113
|
+
Returns:
|
114
|
+
list[str]: List of chapter URLs found in the HTML
|
57
115
|
|
58
|
-
|
59
|
-
|
60
|
-
|
116
|
+
Raises:
|
117
|
+
ContentExtractionError: If chapter URLs cannot be extracted.
|
118
|
+
HTMLParseError: If HTML parsing fails.
|
119
|
+
"""
|
120
|
+
try:
|
121
|
+
logger.debug('Obtaining chapter URLs...')
|
122
|
+
chapter_urls = self.decode_html(html, 'index')
|
123
|
+
|
124
|
+
if chapter_urls is None:
|
125
|
+
msg = f"Failed to obtain chapter URLs for {self.host}"
|
126
|
+
logger.error(msg)
|
127
|
+
raise ContentExtractionError(msg)
|
61
128
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
chapter_urls = [chapter_urls]
|
129
|
+
if isinstance(chapter_urls, str):
|
130
|
+
logger.warning('Expected List of URLs but got String, converting to single-item list')
|
131
|
+
chapter_urls = [chapter_urls]
|
66
132
|
|
67
|
-
|
133
|
+
return chapter_urls
|
134
|
+
except DecodeError:
|
135
|
+
raise
|
136
|
+
except Exception as e:
|
137
|
+
msg = f"Error extracting chapter URLs: {e}"
|
138
|
+
logger.error(msg)
|
139
|
+
raise ContentExtractionError(msg) from e
|
68
140
|
|
69
141
|
def get_toc_next_page_url(self, html: str) -> Optional[str]:
|
142
|
+
"""
|
143
|
+
Extracts the URL for the next page of the table of contents.
|
144
|
+
|
145
|
+
Args:
|
146
|
+
html (str): The HTML content of the current TOC page
|
147
|
+
|
148
|
+
Returns:
|
149
|
+
Optional[str]: URL of the next page if it exists, None otherwise
|
150
|
+
|
151
|
+
Raises:
|
152
|
+
HTMLParseError: If HTML parsing fails
|
153
|
+
ContentExtractionError: If URL extraction fails
|
154
|
+
"""
|
155
|
+
|
70
156
|
logger.debug('Obtaining toc next page URL...')
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
157
|
+
try:
|
158
|
+
toc_next_page_url = self.decode_html(html, 'next_page')
|
159
|
+
if toc_next_page_url is None:
|
160
|
+
logger.debug('No next page URL found, assuming last page...')
|
161
|
+
return None
|
162
|
+
return toc_next_page_url
|
163
|
+
except DecodeError:
|
164
|
+
raise
|
76
165
|
|
77
166
|
def get_chapter_title(self, html: str) -> Optional[str]:
|
78
|
-
|
79
|
-
|
80
|
-
if chapter_title is None:
|
81
|
-
logger.debug(f'No chapter_title found.')
|
82
|
-
return chapter_title
|
167
|
+
"""
|
168
|
+
Extracts the chapter title from HTML content.
|
83
169
|
|
84
|
-
|
85
|
-
|
86
|
-
full_chapter_content = ""
|
87
|
-
chapter_content = self.decode_html(html, 'content')
|
88
|
-
|
89
|
-
if chapter_content is None:
|
90
|
-
logger.critical('No content found on chapter')
|
91
|
-
raise ValueError('No content found on chapter')
|
92
|
-
|
93
|
-
if save_title_to_content:
|
94
|
-
logger.debug('Saving chapter title to content...')
|
95
|
-
full_chapter_content += f'<h4>{chapter_title}</h4>'
|
96
|
-
|
97
|
-
if isinstance(chapter_content, list):
|
98
|
-
logger.debug(f'{len(chapter_content)} paragraphs found in chapter')
|
99
|
-
logger.debug('Converting list of paragraphs to a single string')
|
100
|
-
for paragraph in chapter_content:
|
101
|
-
full_chapter_content += str(paragraph)
|
102
|
-
else:
|
103
|
-
logger.debug('Chapter content is not a list, no conversion made')
|
104
|
-
full_chapter_content += str(chapter_content)
|
105
|
-
return full_chapter_content
|
170
|
+
Args:
|
171
|
+
html (str): The HTML content of the chapter
|
106
172
|
|
107
|
-
|
108
|
-
|
109
|
-
logger.debug(f'Content type: {content_type}')
|
110
|
-
logger.debug(f'Decode guide: {self.decode_guide_file}')
|
111
|
-
logger.debug(f'Host: {self.host}')
|
112
|
-
if not content_type in self.decode_guide:
|
113
|
-
logger.critical(f'{content_type} key does not exists on decode guide {self.decode_guide_file}'
|
114
|
-
f'for host {self.host}')
|
115
|
-
raise ValueError(f'{content_type} key does not exists on decode guide {self.decode_guide_file}'
|
116
|
-
f'for host {self.host}')
|
173
|
+
Returns:
|
174
|
+
Optional[str]: The extracted title, or None if not found
|
117
175
|
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
return processor.process(html)
|
176
|
+
Raises:
|
177
|
+
HTMLParseError: If HTML parsing fails
|
178
|
+
"""
|
122
179
|
|
123
|
-
logger.debug('Starting HTML parsing...')
|
124
180
|
try:
|
125
|
-
|
181
|
+
logger.debug('Obtaining chapter title...')
|
182
|
+
chapter_title = self.decode_html(html, 'title')
|
183
|
+
|
184
|
+
if chapter_title is None:
|
185
|
+
logger.debug('No chapter title found')
|
186
|
+
return None
|
187
|
+
|
188
|
+
return str(chapter_title).strip()
|
189
|
+
except DecodeError as e:
|
190
|
+
logger.warning(f"Error when trying to extract chapter title: {e}")
|
191
|
+
return None
|
126
192
|
except Exception as e:
|
127
|
-
|
128
|
-
|
193
|
+
msg = f"Error extracting chapter title: {e}"
|
194
|
+
logger.error(msg)
|
195
|
+
raise HTMLParseError(msg) from e
|
129
196
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
logger.debug(f'{content_type} not found on html using {self.decode_guide_file} '
|
134
|
-
f'for host {self.host}')
|
197
|
+
def get_chapter_content(self, html: str, save_title_to_content: bool, chapter_title: str) -> str:
|
198
|
+
"""
|
199
|
+
Extracts and processes chapter content from HTML.
|
135
200
|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
201
|
+
Args:
|
202
|
+
html (str): The HTML content of the chapter
|
203
|
+
save_title_to_content (bool): Whether to include the title in the content
|
204
|
+
chapter_title (str): The chapter title to include if save_title_to_content is True
|
205
|
+
|
206
|
+
Returns:
|
207
|
+
str: The processed chapter content with HTML formatting
|
208
|
+
|
209
|
+
Raises:
|
210
|
+
ContentExtractionError: If content cannot be extracted,
|
211
|
+
HTMLParseError: If HTML parsing fails
|
212
|
+
"""
|
213
|
+
try:
|
214
|
+
logger.debug('Obtaining chapter content...')
|
215
|
+
full_chapter_content = ""
|
216
|
+
chapter_content = self.decode_html(html, 'content')
|
217
|
+
|
218
|
+
if chapter_content is None:
|
219
|
+
msg = 'No content found in chapter'
|
220
|
+
logger.error(msg)
|
221
|
+
raise ContentExtractionError(msg)
|
222
|
+
|
223
|
+
if save_title_to_content:
|
224
|
+
logger.debug('Adding chapter title to content...')
|
225
|
+
full_chapter_content += f'<h4>{chapter_title}</h4>'
|
226
|
+
|
227
|
+
if isinstance(chapter_content, list):
|
228
|
+
logger.debug(f'Processing {len(chapter_content)} content paragraphs')
|
229
|
+
full_chapter_content += ''.join(str(p) for p in chapter_content)
|
230
|
+
else:
|
231
|
+
logger.debug('Processing single content block')
|
232
|
+
full_chapter_content += str(chapter_content)
|
233
|
+
|
234
|
+
return full_chapter_content
|
235
|
+
except DecodeError:
|
236
|
+
raise
|
237
|
+
except Exception as e:
|
238
|
+
msg = f"Error extracting chapter content: {e}"
|
239
|
+
logger.error(msg)
|
240
|
+
raise ContentExtractionError(msg) from e
|
141
241
|
|
142
242
|
def has_pagination(self) -> bool:
|
143
|
-
|
243
|
+
"""
|
244
|
+
Checks if the current host's content uses pagination.
|
245
|
+
|
246
|
+
Returns:
|
247
|
+
bool: True if the host uses pagination, False otherwise.
|
248
|
+
"""
|
249
|
+
logger.debug('Checking if index has pagination...')
|
250
|
+
return self.decode_guide.get('has_pagination', False)
|
144
251
|
|
145
252
|
def clean_html(self, html: str, hard_clean: bool = False):
|
146
253
|
tags_for_soft_clean = ['script', 'style', 'link',
|
147
254
|
'form', 'meta', 'hr', 'noscript', 'button']
|
148
|
-
tags_for_hard_clean = ['header', 'footer', 'nav', 'aside', 'iframe', 'object', 'embed', 'svg', 'canvas', 'map',
|
149
|
-
'
|
255
|
+
tags_for_hard_clean = ['header', 'footer', 'nav', 'aside', 'iframe', 'object', 'embed', 'svg', 'canvas', 'map',
|
256
|
+
'area',
|
257
|
+
'audio', 'video', 'track', 'source', 'applet', 'frame', 'frameset', 'noframes',
|
258
|
+
'noembed', 'blink', 'marquee']
|
150
259
|
|
151
260
|
tags_for_custom_clean = []
|
152
261
|
if 'clean' in self.decode_guide:
|
@@ -162,12 +271,48 @@ class Decoder:
|
|
162
271
|
|
163
272
|
return "\n".join([line.strip() for line in str(soup).splitlines() if line.strip()])
|
164
273
|
|
274
|
+
def decode_html(self, html: str, content_type: str) -> str | list[str] | None:
|
275
|
+
logger.debug(f'Decoding HTML...')
|
276
|
+
logger.debug(f'Content type: {content_type}')
|
277
|
+
logger.debug(f'Decode guide: {self.decode_guide_file}')
|
278
|
+
logger.debug(f'Host: {self.host}')
|
279
|
+
if content_type not in self.decode_guide:
|
280
|
+
msg = f'No decode rules found for {content_type} in guide {self.decode_guide_file}'
|
281
|
+
logger.critical(msg)
|
282
|
+
raise DecodeGuideError(msg)
|
283
|
+
|
284
|
+
if ProcessorRegistry.has_processor(self.host, content_type):
|
285
|
+
logger.debug(f'Using custom processor for {self.host}')
|
286
|
+
return ProcessorRegistry.get_processor(self.host, content_type).process(html)
|
287
|
+
|
288
|
+
logger.debug('Parsing HTML...')
|
289
|
+
try:
|
290
|
+
soup = BeautifulSoup(html, 'html.parser')
|
291
|
+
except Exception as e:
|
292
|
+
logger.error(f'Error parsing HTML with BeautifulSoup: {e}')
|
293
|
+
raise HTMLParseError(f'Error parsing HTML with BeautifulSoup: {e}')
|
294
|
+
|
295
|
+
decoder = self.decode_guide.get(content_type)
|
296
|
+
if decoder is None:
|
297
|
+
logger.error(f'No decode rules found for {content_type} in guide {self.decode_guide_file}')
|
298
|
+
raise DecodeGuideError(f'No decode rules found for {content_type} in guide {self.decode_guide_file}')
|
299
|
+
elements = self._find_elements(soup, decoder)
|
300
|
+
if not elements:
|
301
|
+
logger.debug(f'No {content_type} found in HTML')
|
302
|
+
return None
|
303
|
+
|
304
|
+
# Investigate this conditional
|
305
|
+
if content_type == 'title' and isinstance(elements, list):
|
306
|
+
logger.debug('Joining multiple title elements')
|
307
|
+
return ' '.join(elements)
|
308
|
+
return elements
|
309
|
+
|
165
310
|
def _set_decode_guide(self) -> None:
|
166
311
|
decode_guide = FileOps.read_json(self.decode_guide_file)
|
167
312
|
self.decode_guide = self._get_element_by_key(decode_guide, 'host', self.host)
|
168
313
|
if self.decode_guide is None:
|
169
|
-
logger.
|
170
|
-
raise
|
314
|
+
logger.error(f'No decode guide found for host {self.host}')
|
315
|
+
raise ValidationError(f'No decode guide found for host {self.host}')
|
171
316
|
|
172
317
|
@staticmethod
|
173
318
|
def _find_elements(soup: BeautifulSoup, decoder: dict):
|
@@ -380,5 +380,34 @@
|
|
380
380
|
"key": "href"
|
381
381
|
}
|
382
382
|
}
|
383
|
+
},
|
384
|
+
{
|
385
|
+
"host": "foxaholic.com",
|
386
|
+
"has_pagination": false,
|
387
|
+
"request_config": {
|
388
|
+
"force_flaresolver": "true",
|
389
|
+
"request_timeout": 30
|
390
|
+
},
|
391
|
+
"save_title_to_content": true,
|
392
|
+
"title": {
|
393
|
+
"element": "li",
|
394
|
+
"class": "active",
|
395
|
+
"extract": {
|
396
|
+
"type": "text"
|
397
|
+
}
|
398
|
+
},
|
399
|
+
"content": {
|
400
|
+
"selector": "div.text-left p:not([class])",
|
401
|
+
"array": true
|
402
|
+
},
|
403
|
+
"index": {
|
404
|
+
"selector": "li.free-chap a",
|
405
|
+
"inverted": true,
|
406
|
+
"array": true,
|
407
|
+
"extract": {
|
408
|
+
"type": "attr",
|
409
|
+
"key": "href"
|
410
|
+
}
|
411
|
+
}
|
383
412
|
}
|
384
413
|
]
|