web-novel-scraper 1.1.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,46 +1,55 @@
1
- import os
2
1
  import json
3
- from pathlib import Path
4
2
  from typing import Optional
5
3
 
6
4
  from . import logger_manager
7
5
  from .custom_processor.custom_processor import ProcessorRegistry
6
+ from .utils import FileOps
8
7
 
9
8
  from bs4 import BeautifulSoup
10
9
 
11
10
  logger = logger_manager.create_logger('DECODE HTML')
12
11
 
13
- CURRENT_DIR = Path(__file__).resolve().parent
14
-
15
- DECODE_GUIDE_FILE = os.getenv('DECODE_GUIDE_FILE', f'{CURRENT_DIR}/decode_guide/decode_guide.json')
16
-
17
12
  XOR_SEPARATOR = "XOR"
18
13
 
19
- try:
20
- with open(DECODE_GUIDE_FILE, 'r', encoding='UTF-8') as f:
21
- DECODE_GUIDE = json.load(f)
22
- except FileNotFoundError:
23
- logger.error(f"File {DECODE_GUIDE_FILE} not found.")
24
- raise
25
- except PermissionError:
26
- logger.error(f"Permission error {DECODE_GUIDE_FILE}.")
27
- raise
28
- except json.JSONDecodeError:
29
- logger.error(f"Json Decode error {DECODE_GUIDE_FILE}.")
30
- raise
31
- except Exception as e:
32
- logger.error(f"Error {DECODE_GUIDE_FILE}: {e}")
33
- raise
34
-
14
+ DEFAULT_REQUEST_CONFIG = {
15
+ "force_flaresolver": False,
16
+ "request_retries": 3,
17
+ "request_timeout": 20,
18
+ "request_time_between_retries": 3
19
+ }
35
20
 
36
21
  class Decoder:
37
22
  host: str
23
+ decode_guide_file: str
38
24
  decode_guide: json
25
+ request_config: dict
39
26
 
40
- def __init__(self, host: str):
27
+ def __init__(self, host: str, decode_guide_file: str):
28
+ self.decode_guide_file = decode_guide_file
29
+ self.set_host(host)
30
+
31
+ def set_host(self, host: str) -> None:
41
32
  self.host = host
42
- self.decode_guide = self._get_element_by_key(
43
- DECODE_GUIDE, 'host', host)
33
+ self._set_decode_guide()
34
+ host_request_config = self.get_request_config()
35
+ self.request_config = DEFAULT_REQUEST_CONFIG | host_request_config
36
+
37
+ def get_request_config(self) -> dict:
38
+ request_config = self.decode_guide.get('request_config')
39
+ if request_config:
40
+ logger.debug(f'Host "{self.host}" has a custom request configuration on the Decode Guide file.')
41
+ return request_config
42
+
43
+ return DEFAULT_REQUEST_CONFIG
44
+
45
+ def is_index_inverted(self) -> bool:
46
+ return self.decode_guide.get('index', {}).get('inverted', False)
47
+
48
+ def save_title_to_content(self) -> bool:
49
+ return self.decode_guide.get('save_title_to_content', False)
50
+
51
+ def add_host_to_chapter(self) -> bool:
52
+ return self.decode_guide.get('add_host_to_chapter', False)
44
53
 
45
54
  def get_chapter_urls(self, html: str) -> list[str]:
46
55
  logger.debug('Obtaining chapter URLs...')
@@ -98,12 +107,12 @@ class Decoder:
98
107
  def decode_html(self, html: str, content_type: str) -> str | list[str] | None:
99
108
  logger.debug(f'Decoding HTML...')
100
109
  logger.debug(f'Content type: {content_type}')
101
- logger.debug(f'Decode guide: {DECODE_GUIDE_FILE}')
110
+ logger.debug(f'Decode guide: {self.decode_guide_file}')
102
111
  logger.debug(f'Host: {self.host}')
103
112
  if not content_type in self.decode_guide:
104
- logger.critical(f'{content_type} key does not exists on decode guide {DECODE_GUIDE_FILE}'
113
+ logger.critical(f'{content_type} key does not exists on decode guide {self.decode_guide_file}'
105
114
  f'for host {self.host}')
106
- raise ValueError(f'{content_type} key does not exists on decode guide {DECODE_GUIDE_FILE}'
115
+ raise ValueError(f'{content_type} key does not exists on decode guide {self.decode_guide_file}'
107
116
  f'for host {self.host}')
108
117
 
109
118
  if ProcessorRegistry.has_processor(self.host, content_type):
@@ -121,7 +130,7 @@ class Decoder:
121
130
  decoder = self.decode_guide[content_type]
122
131
  elements = self._find_elements(soup, decoder)
123
132
  if not elements:
124
- logger.warning(f'{content_type} not found on html using {DECODE_GUIDE_FILE} '
133
+ logger.debug(f'{content_type} not found on html using {self.decode_guide_file} '
125
134
  f'for host {self.host}')
126
135
 
127
136
  # Investigate this conditional
@@ -130,11 +139,7 @@ class Decoder:
130
139
  return ' '.join(elements)
131
140
  return elements
132
141
 
133
- def has_pagination(self, host: str = None):
134
- if host:
135
- decode_guide = self._get_element_by_key(DECODE_GUIDE, 'host', host)
136
- return decode_guide['has_pagination']
137
-
142
+ def has_pagination(self) -> bool:
138
143
  return self.decode_guide['has_pagination']
139
144
 
140
145
  def clean_html(self, html: str, hard_clean: bool = False):
@@ -157,6 +162,13 @@ class Decoder:
157
162
 
158
163
  return "\n".join([line.strip() for line in str(soup).splitlines() if line.strip()])
159
164
 
165
+ def _set_decode_guide(self) -> None:
166
+ decode_guide = FileOps.read_json(self.decode_guide_file)
167
+ self.decode_guide = self._get_element_by_key(decode_guide, 'host', self.host)
168
+ if self.decode_guide is None:
169
+ logger.critical(f'No decode guide found for host {self.host}')
170
+ raise ValueError(f'No decode guide found for host {self.host}')
171
+
160
172
  @staticmethod
161
173
  def _find_elements(soup: BeautifulSoup, decoder: dict):
162
174
  logger.debug('Finding elements...')
@@ -227,7 +239,7 @@ class Decoder:
227
239
  elements = [element.string for element in elements]
228
240
 
229
241
  if not elements:
230
- logger.error('No elements found, returning "None"')
242
+ logger.debug('No elements found, returning "None"')
231
243
  return None
232
244
 
233
245
  inverted = decoder.get('inverted')
@@ -245,9 +257,8 @@ class Decoder:
245
257
  return elements[0]
246
258
 
247
259
  @staticmethod
248
- def _get_element_by_key(json_data, key, value):
260
+ def _get_element_by_key(json_data, key: str, value: str) -> Optional[dict]:
249
261
  for item in json_data:
250
262
  if item[key] == value:
251
263
  return item
252
- logger.warning('Host not found, using default decoder.')
253
- return json_data[0]
264
+ return None
@@ -160,6 +160,9 @@
160
160
  },
161
161
  {
162
162
  "host": "novelbin.com",
163
+ "request_config": {
164
+ "force_flaresolver": "true"
165
+ },
163
166
  "has_pagination": false,
164
167
  "title": {
165
168
  "element": "h2 a.chr-title",
@@ -295,5 +298,87 @@
295
298
  "key": "href"
296
299
  }
297
300
  }
301
+ },
302
+ {
303
+ "host": "scribblehub.com",
304
+ "request_config": {
305
+ "force_flaresolver": "true",
306
+ "request_timeout": 60
307
+ },
308
+ "has_pagination": true,
309
+ "title": {
310
+ "selector": "div.chapter-title",
311
+ "extract": {
312
+ "type": "text"
313
+ }
314
+ },
315
+ "content": {
316
+ "selector": "div.chp_raw p",
317
+ "array": true
318
+ },
319
+ "index": {
320
+ "selector": "div.toc ol li a",
321
+ "array": true,
322
+ "inverted": true,
323
+ "extract": {
324
+ "type": "attr",
325
+ "key": "href"
326
+ }
327
+ },
328
+ "next_page": {
329
+ "selector": "div ul.simple-pagination li a.next",
330
+ "array": false,
331
+ "extract": {
332
+ "type": "attr",
333
+ "key": "href"
334
+ }
335
+ }
336
+ },
337
+ {
338
+ "host": "novelcool.com",
339
+ "has_pagination": false,
340
+ "title": {
341
+ "element": "h2",
342
+ "extract": {
343
+ "type": "text"
344
+ }
345
+ },
346
+ "content": {
347
+ "selector": "p:not(.chapter-end-mark)",
348
+ "array": true
349
+ },
350
+ "index": {
351
+ "selector": "div.chp-item a",
352
+ "array": true,
353
+ "inverted": true,
354
+ "extract": {
355
+ "type": "attr",
356
+ "key": "href"
357
+ }
358
+ }
359
+ },
360
+ {
361
+ "host": "freewebnovel.com",
362
+ "has_pagination": false,
363
+ "add_host_to_chapter": "true",
364
+ "title": {
365
+ "element": "span",
366
+ "class": "chapter",
367
+ "extract": {
368
+ "type": "text"
369
+ }
370
+ },
371
+ "content": {
372
+ "selector": "p:not([class])",
373
+ "array": true
374
+ },
375
+ "index": {
376
+ "selector": "ul#idData a",
377
+ "array": true,
378
+ "extract": {
379
+ "type": "attr",
380
+ "key": "href"
381
+ }
382
+ }
298
383
  }
299
384
  ]