web-novel-scraper 2.0.2__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,11 @@
1
1
  import json
2
2
  from typing import Optional
3
3
 
4
+ from pathlib import Path
5
+
4
6
  from . import logger_manager
5
7
  from .custom_processor.custom_processor import ProcessorRegistry
6
- from .utils import FileOps
8
+ from .utils import FileOps, DecodeError, ValidationError
7
9
 
8
10
  from bs4 import BeautifulSoup
9
11
 
@@ -18,23 +20,48 @@ DEFAULT_REQUEST_CONFIG = {
18
20
  "request_time_between_retries": 3
19
21
  }
20
22
 
23
+
24
+ class HTMLParseError(DecodeError):
25
+ """Raised when HTML parsing fails"""
26
+
27
+
28
+ class DecodeGuideError(DecodeError):
29
+ """Raised when there are issues with decode guide configuration"""
30
+
31
+
32
+ class ContentExtractionError(DecodeError):
33
+ """Raised when content extraction fails"""
34
+
35
+
21
36
  class Decoder:
22
37
  host: str
23
- decode_guide_file: str
38
+ decode_guide_file: Path
24
39
  decode_guide: json
25
40
  request_config: dict
26
41
 
27
- def __init__(self, host: str, decode_guide_file: str):
42
+ def __init__(self, host: str, decode_guide_file: Path):
28
43
  self.decode_guide_file = decode_guide_file
29
44
  self.set_host(host)
30
45
 
31
46
  def set_host(self, host: str) -> None:
32
47
  self.host = host
33
- self._set_decode_guide()
48
+ try:
49
+ self._set_decode_guide()
50
+ except ValidationError:
51
+ raise
52
+
34
53
  host_request_config = self.get_request_config()
35
54
  self.request_config = DEFAULT_REQUEST_CONFIG | host_request_config
36
55
 
37
56
  def get_request_config(self) -> dict:
57
+ """
58
+ Retrieves the request configuration for the current host.
59
+
60
+ Returns:
61
+ dict: Request configuration parameters for the current host.
62
+ Returns DEFAULT_REQUEST_CONFIG if no custom configuration exists.
63
+ """
64
+
38
65
  request_config = self.decode_guide.get('request_config')
39
66
  if request_config:
40
67
  logger.debug(f'Host "{self.host}" has a custom request configuration on the Decode Guide file.')
@@ -43,110 +70,192 @@ class Decoder:
43
70
  return DEFAULT_REQUEST_CONFIG
44
71
 
45
72
  def is_index_inverted(self) -> bool:
73
+ """
74
+ Checks if the index order should be inverted for the current host.
75
+
76
+ Returns:
77
+ bool: True if the index should be processed in reverse order, False otherwise.
78
+ """
79
+
80
+ logger.debug('Checking if index should be inverted...')
46
81
  return self.decode_guide.get('index', {}).get('inverted', False)
47
82
 
48
83
  def save_title_to_content(self) -> bool:
49
- return self.decode_guide.get('save_title_to_content', False)
84
+ """
85
+ Checks if the title should be included in the content for the current host.
86
+
87
+ Returns:
88
+ bool: True if the title should be saved with the content, False otherwise.
89
+ """
90
+ logger.debug('Checking if title should be saved to content...')
91
+ try:
92
+ return self.decode_guide.get('save_title_to_content', False)
93
+ except DecodeError:
94
+ raise
50
95
 
51
96
  def add_host_to_chapter(self) -> bool:
97
+ """
98
+ Checks if the host information should be added to chapter url.
99
+
100
+ Returns:
101
+ bool: True if host information should be included in chapter url, False otherwise.
102
+ """
103
+ logger.debug('Checking if host should be added to chapter url...')
52
104
  return self.decode_guide.get('add_host_to_chapter', False)
53
105
 
54
106
  def get_chapter_urls(self, html: str) -> list[str]:
55
- logger.debug('Obtaining chapter URLs...')
56
- chapter_urls = self.decode_html(html, 'index')
107
+ """
108
+ Extracts chapter URLs from the table of contents HTML.
109
+
110
+ Args:
111
+ html (str): The HTML content of the table of contents
112
+
113
+ Returns:
114
+ list[str]: List of chapter URLs found in the HTML
57
115
 
58
- if chapter_urls is None:
59
- logger.critical(f"Failed to obtain chapter URLs for {self.host}")
60
- raise ValueError(f"Failed to obtain chapter URLs for {self.host}")
116
+ Raises:
117
+ ContentExtractionError: If chapter URLs cannot be extracted.
118
+ HTMLParseError: If HTML parsing fails.
119
+ """
120
+ try:
121
+ logger.debug('Obtaining chapter URLs...')
122
+ chapter_urls = self.decode_html(html, 'index')
123
+
124
+ if chapter_urls is None:
125
+ msg = f"Failed to obtain chapter URLs for {self.host}"
126
+ logger.error(msg)
127
+ raise ContentExtractionError(msg)
61
128
 
62
- if isinstance(chapter_urls, str):
63
- logger.warning('When obtaining chapter urls, obtained a String but expected a List')
64
- logger.warning('Check decode config')
65
- chapter_urls = [chapter_urls]
129
+ if isinstance(chapter_urls, str):
130
+ logger.warning('Expected List of URLs but got String, converting to single-item list')
131
+ chapter_urls = [chapter_urls]
66
132
 
67
- return chapter_urls
133
+ return chapter_urls
134
+ except DecodeError:
135
+ raise
136
+ except Exception as e:
137
+ msg = f"Error extracting chapter URLs: {e}"
138
+ logger.error(msg)
139
+ raise ContentExtractionError(msg) from e
68
140
 
69
141
  def get_toc_next_page_url(self, html: str) -> Optional[str]:
142
+ """
143
+ Extracts the URL for the next page of the table of contents.
144
+
145
+ Args:
146
+ html (str): The HTML content of the current TOC page
147
+
148
+ Returns:
149
+ Optional[str]: URL of the next page if it exists, None otherwise
150
+
151
+ Raises:
152
+ HTMLParseError: If HTML parsing fails
153
+ ContentExtractionError: If URL extraction fails
154
+ """
155
+
70
156
  logger.debug('Obtaining toc next page URL...')
71
- toc_next_page_url = self.decode_html(html, 'next_page')
72
- if toc_next_page_url is None:
73
- logger.debug('No next page URL found, assuming last page...')
74
- return None
75
- return toc_next_page_url
157
+ try:
158
+ toc_next_page_url = self.decode_html(html, 'next_page')
159
+ if toc_next_page_url is None:
160
+ logger.debug('No next page URL found, assuming last page...')
161
+ return None
162
+ return toc_next_page_url
163
+ except DecodeError:
164
+ raise
76
165
 
77
166
  def get_chapter_title(self, html: str) -> Optional[str]:
78
- logger.debug('Obtaining chapter title...')
79
- chapter_title = self.decode_html(html, 'title')
80
- if chapter_title is None:
81
- logger.debug(f'No chapter_title found.')
82
- return chapter_title
167
+ """
168
+ Extracts the chapter title from HTML content.
83
169
 
84
- def get_chapter_content(self, html: str, save_title_to_content: bool, chapter_title: str) -> str:
85
- logger.debug('Obtaining chapter content...')
86
- full_chapter_content = ""
87
- chapter_content = self.decode_html(html, 'content')
88
-
89
- if chapter_content is None:
90
- logger.critical('No content found on chapter')
91
- raise ValueError('No content found on chapter')
92
-
93
- if save_title_to_content:
94
- logger.debug('Saving chapter title to content...')
95
- full_chapter_content += f'<h4>{chapter_title}</h4>'
96
-
97
- if isinstance(chapter_content, list):
98
- logger.debug(f'{len(chapter_content)} paragraphs found in chapter')
99
- logger.debug('Converting list of paragraphs to a single string')
100
- for paragraph in chapter_content:
101
- full_chapter_content += str(paragraph)
102
- else:
103
- logger.debug('Chapter content is not a list, no conversion made')
104
- full_chapter_content += str(chapter_content)
105
- return full_chapter_content
170
+ Args:
171
+ html (str): The HTML content of the chapter
106
172
 
107
- def decode_html(self, html: str, content_type: str) -> str | list[str] | None:
108
- logger.debug(f'Decoding HTML...')
109
- logger.debug(f'Content type: {content_type}')
110
- logger.debug(f'Decode guide: {self.decode_guide_file}')
111
- logger.debug(f'Host: {self.host}')
112
- if not content_type in self.decode_guide:
113
- logger.critical(f'{content_type} key does not exists on decode guide {self.decode_guide_file}'
114
- f'for host {self.host}')
115
- raise ValueError(f'{content_type} key does not exists on decode guide {self.decode_guide_file}'
116
- f'for host {self.host}')
173
+ Returns:
174
+ Optional[str]: The extracted title with surrounding whitespace stripped, or None if not found or if a DecodeError occurs
117
175
 
118
- if ProcessorRegistry.has_processor(self.host, content_type):
119
- logger.debug(f'Host {self.host} will use a custom processor')
120
- processor = ProcessorRegistry.get_processor(self.host, content_type)
121
- return processor.process(html)
176
+ Raises:
177
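+ HTMLParseError: If an unexpected non-DecodeError exception occurs (DecodeError, including parse failures, is logged and None is returned)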
178
+ """
122
179
 
123
- logger.debug('Starting HTML parsing...')
124
180
  try:
125
- soup = BeautifulSoup(html, 'html.parser')
181
+ logger.debug('Obtaining chapter title...')
182
+ chapter_title = self.decode_html(html, 'title')
183
+
184
+ if chapter_title is None:
185
+ logger.debug('No chapter title found')
186
+ return None
187
+
188
+ return str(chapter_title).strip()
189
+ except DecodeError as e:
190
+ logger.warning(f"Error when trying to extract chapter title: {e}")
191
+ return None
126
192
  except Exception as e:
127
- logger.critical(f'Error parsing HTML with BeautifulSoup: {e}')
128
- raise ValueError(f'Error parsing HTML with BeautifulSoup: {e}')
193
+ msg = f"Error extracting chapter title: {e}"
194
+ logger.error(msg)
195
+ raise HTMLParseError(msg) from e
129
196
 
130
- decoder = self.decode_guide[content_type]
131
- elements = self._find_elements(soup, decoder)
132
- if not elements:
133
- logger.debug(f'{content_type} not found on html using {self.decode_guide_file} '
134
- f'for host {self.host}')
197
+ def get_chapter_content(self, html: str, save_title_to_content: bool, chapter_title: str) -> str:
198
+ """
199
+ Extracts and processes chapter content from HTML.
135
200
 
136
- # Investigate this conditional
137
- if content_type == 'title' and isinstance(elements, list):
138
- logger.debug('Joining titles...')
139
- return ' '.join(elements)
140
- return elements
201
+ Args:
202
+ html (str): The HTML content of the chapter
203
+ save_title_to_content (bool): Whether to include the title in the content
204
+ chapter_title (str): The chapter title to include if save_title_to_content is True
205
+
206
+ Returns:
207
+ str: The processed chapter content with HTML formatting
208
+
209
+ Raises:
210
+ ContentExtractionError: If content cannot be extracted
211
+ HTMLParseError: If HTML parsing fails
212
+ """
213
+ try:
214
+ logger.debug('Obtaining chapter content...')
215
+ full_chapter_content = ""
216
+ chapter_content = self.decode_html(html, 'content')
217
+
218
+ if chapter_content is None:
219
+ msg = 'No content found in chapter'
220
+ logger.error(msg)
221
+ raise ContentExtractionError(msg)
222
+
223
+ if save_title_to_content:
224
+ logger.debug('Adding chapter title to content...')
225
+ full_chapter_content += f'<h4>{chapter_title}</h4>'
226
+
227
+ if isinstance(chapter_content, list):
228
+ logger.debug(f'Processing {len(chapter_content)} content paragraphs')
229
+ full_chapter_content += ''.join(str(p) for p in chapter_content)
230
+ else:
231
+ logger.debug('Processing single content block')
232
+ full_chapter_content += str(chapter_content)
233
+
234
+ return full_chapter_content
235
+ except DecodeError:
236
+ raise
237
+ except Exception as e:
238
+ msg = f"Error extracting chapter content: {e}"
239
+ logger.error(msg)
240
+ raise ContentExtractionError(msg) from e
141
241
 
142
242
  def has_pagination(self) -> bool:
143
- return self.decode_guide['has_pagination']
243
+ """
244
+ Checks if the current host's content uses pagination.
245
+
246
+ Returns:
247
+ bool: True if the host uses pagination, False otherwise.
248
+ """
249
+ logger.debug('Checking if index has pagination...')
250
+ return self.decode_guide.get('has_pagination', False)
144
251
 
145
252
  def clean_html(self, html: str, hard_clean: bool = False):
146
253
  tags_for_soft_clean = ['script', 'style', 'link',
147
254
  'form', 'meta', 'hr', 'noscript', 'button']
148
- tags_for_hard_clean = ['header', 'footer', 'nav', 'aside', 'iframe', 'object', 'embed', 'svg', 'canvas', 'map', 'area',
149
- 'audio', 'video', 'track', 'source', 'applet', 'frame', 'frameset', 'noframes', 'noembed', 'blink', 'marquee']
255
+ tags_for_hard_clean = ['header', 'footer', 'nav', 'aside', 'iframe', 'object', 'embed', 'svg', 'canvas', 'map',
256
+ 'area',
257
+ 'audio', 'video', 'track', 'source', 'applet', 'frame', 'frameset', 'noframes',
258
+ 'noembed', 'blink', 'marquee']
150
259
 
151
260
  tags_for_custom_clean = []
152
261
  if 'clean' in self.decode_guide:
@@ -162,12 +271,48 @@ class Decoder:
162
271
 
163
272
  return "\n".join([line.strip() for line in str(soup).splitlines() if line.strip()])
164
273
 
274
+ def decode_html(self, html: str, content_type: str) -> str | list[str] | None:
275
+ logger.debug(f'Decoding HTML...')
276
+ logger.debug(f'Content type: {content_type}')
277
+ logger.debug(f'Decode guide: {self.decode_guide_file}')
278
+ logger.debug(f'Host: {self.host}')
279
+ if content_type not in self.decode_guide:
280
+ msg = f'No decode rules found for {content_type} in guide {self.decode_guide_file}'
281
+ logger.critical(msg)
282
+ raise DecodeGuideError(msg)
283
+
284
+ if ProcessorRegistry.has_processor(self.host, content_type):
285
+ logger.debug(f'Using custom processor for {self.host}')
286
+ return ProcessorRegistry.get_processor(self.host, content_type).process(html)
287
+
288
+ logger.debug('Parsing HTML...')
289
+ try:
290
+ soup = BeautifulSoup(html, 'html.parser')
291
+ except Exception as e:
292
+ logger.error(f'Error parsing HTML with BeautifulSoup: {e}')
293
+ raise HTMLParseError(f'Error parsing HTML with BeautifulSoup: {e}')
294
+
295
+ decoder = self.decode_guide.get(content_type)
296
+ if decoder is None:
297
+ logger.error(f'No decode rules found for {content_type} in guide {self.decode_guide_file}')
298
+ raise DecodeGuideError(f'No decode rules found for {content_type} in guide {self.decode_guide_file}')
299
+ elements = self._find_elements(soup, decoder)
300
+ if not elements:
301
+ logger.debug(f'No {content_type} found in HTML')
302
+ return None
303
+
304
+ # Investigate this conditional
305
+ if content_type == 'title' and isinstance(elements, list):
306
+ logger.debug('Joining multiple title elements')
307
+ return ' '.join(elements)
308
+ return elements
309
+
165
310
  def _set_decode_guide(self) -> None:
166
311
  decode_guide = FileOps.read_json(self.decode_guide_file)
167
312
  self.decode_guide = self._get_element_by_key(decode_guide, 'host', self.host)
168
313
  if self.decode_guide is None:
169
- logger.critical(f'No decode guide found for host {self.host}')
170
- raise ValueError(f'No decode guide found for host {self.host}')
314
+ logger.error(f'No decode guide found for host {self.host}')
315
+ raise ValidationError(f'No decode guide found for host {self.host}')
171
316
 
172
317
  @staticmethod
173
318
  def _find_elements(soup: BeautifulSoup, decoder: dict):
@@ -380,5 +380,34 @@
380
380
  "key": "href"
381
381
  }
382
382
  }
383
+ },
384
+ {
385
+ "host": "foxaholic.com",
386
+ "has_pagination": false,
387
+ "request_config": {
388
+ "force_flaresolver": "true",
389
+ "request_timeout": 30
390
+ },
391
+ "save_title_to_content": true,
392
+ "title": {
393
+ "element": "li",
394
+ "class": "active",
395
+ "extract": {
396
+ "type": "text"
397
+ }
398
+ },
399
+ "content": {
400
+ "selector": "div.text-left p:not([class])",
401
+ "array": true
402
+ },
403
+ "index": {
404
+ "selector": "li.free-chap a",
405
+ "inverted": true,
406
+ "array": true,
407
+ "extract": {
408
+ "type": "attr",
409
+ "key": "href"
410
+ }
411
+ }
383
412
  }
384
413
  ]