web-novel-scraper 1.1.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,19 +1,14 @@
1
- import os
2
1
  import json
3
- from pathlib import Path
4
2
  from typing import Optional
5
3
 
6
4
  from . import logger_manager
7
5
  from .custom_processor.custom_processor import ProcessorRegistry
6
+ from .utils import FileOps
8
7
 
9
8
  from bs4 import BeautifulSoup
10
9
 
11
10
  logger = logger_manager.create_logger('DECODE HTML')
12
11
 
13
- CURRENT_DIR = Path(__file__).resolve().parent
14
-
15
- DECODE_GUIDE_FILE = os.getenv('DECODE_GUIDE_FILE', f'{CURRENT_DIR}/decode_guide/decode_guide.json')
16
-
17
12
  XOR_SEPARATOR = "XOR"
18
13
 
19
14
  DEFAULT_REQUEST_CONFIG = {
@@ -23,32 +18,19 @@ DEFAULT_REQUEST_CONFIG = {
23
18
  "request_time_between_retries": 3
24
19
  }
25
20
 
26
- try:
27
- with open(DECODE_GUIDE_FILE, 'r', encoding='UTF-8') as f:
28
- DECODE_GUIDE = json.load(f)
29
- except FileNotFoundError:
30
- logger.error(f"File {DECODE_GUIDE_FILE} not found.")
31
- raise
32
- except PermissionError:
33
- logger.error(f"Permission error {DECODE_GUIDE_FILE}.")
34
- raise
35
- except json.JSONDecodeError:
36
- logger.error(f"Json Decode error {DECODE_GUIDE_FILE}.")
37
- raise
38
- except Exception as e:
39
- logger.error(f"Error {DECODE_GUIDE_FILE}: {e}")
40
- raise
41
-
42
-
43
21
  class Decoder:
44
22
  host: str
23
+ decode_guide_file: str
45
24
  decode_guide: json
46
25
  request_config: dict
47
26
 
48
- def __init__(self, host: str):
27
+ def __init__(self, host: str, decode_guide_file: str):
28
+ self.decode_guide_file = decode_guide_file
29
+ self.set_host(host)
30
+
31
+ def set_host(self, host: str) -> None:
49
32
  self.host = host
50
- self.decode_guide = self._get_element_by_key(
51
- DECODE_GUIDE, 'host', host)
33
+ self._set_decode_guide()
52
34
  host_request_config = self.get_request_config()
53
35
  self.request_config = DEFAULT_REQUEST_CONFIG | host_request_config
54
36
 
@@ -60,13 +42,14 @@ class Decoder:
60
42
 
61
43
  return DEFAULT_REQUEST_CONFIG
62
44
 
63
- def is_index_inverted(self, host:str = None) -> bool:
64
- if host:
65
- decode_guide = self._get_element_by_key(DECODE_GUIDE, 'host', host)
66
- else:
67
- decode_guide = self.decode_guide
45
+ def is_index_inverted(self) -> bool:
46
+ return self.decode_guide.get('index', {}).get('inverted', False)
47
+
48
+ def save_title_to_content(self) -> bool:
49
+ return self.decode_guide.get('save_title_to_content', False)
68
50
 
69
- return decode_guide.get('index', {}).get('inverted', False)
51
+ def add_host_to_chapter(self) -> bool:
52
+ return self.decode_guide.get('add_host_to_chapter', False)
70
53
 
71
54
  def get_chapter_urls(self, html: str) -> list[str]:
72
55
  logger.debug('Obtaining chapter URLs...')
@@ -124,12 +107,12 @@ class Decoder:
124
107
  def decode_html(self, html: str, content_type: str) -> str | list[str] | None:
125
108
  logger.debug(f'Decoding HTML...')
126
109
  logger.debug(f'Content type: {content_type}')
127
- logger.debug(f'Decode guide: {DECODE_GUIDE_FILE}')
110
+ logger.debug(f'Decode guide: {self.decode_guide_file}')
128
111
  logger.debug(f'Host: {self.host}')
129
112
  if not content_type in self.decode_guide:
130
- logger.critical(f'{content_type} key does not exists on decode guide {DECODE_GUIDE_FILE}'
113
+ logger.critical(f'{content_type} key does not exists on decode guide {self.decode_guide_file}'
131
114
  f'for host {self.host}')
132
- raise ValueError(f'{content_type} key does not exists on decode guide {DECODE_GUIDE_FILE}'
115
+ raise ValueError(f'{content_type} key does not exists on decode guide {self.decode_guide_file}'
133
116
  f'for host {self.host}')
134
117
 
135
118
  if ProcessorRegistry.has_processor(self.host, content_type):
@@ -147,7 +130,7 @@ class Decoder:
147
130
  decoder = self.decode_guide[content_type]
148
131
  elements = self._find_elements(soup, decoder)
149
132
  if not elements:
150
- logger.warning(f'{content_type} not found on html using {DECODE_GUIDE_FILE} '
133
+ logger.debug(f'{content_type} not found on html using {self.decode_guide_file} '
151
134
  f'for host {self.host}')
152
135
 
153
136
  # Investigate this conditional
@@ -156,11 +139,7 @@ class Decoder:
156
139
  return ' '.join(elements)
157
140
  return elements
158
141
 
159
- def has_pagination(self, host: str = None) -> bool:
160
- if host:
161
- decode_guide = self._get_element_by_key(DECODE_GUIDE, 'host', host)
162
- return decode_guide['has_pagination']
163
-
142
+ def has_pagination(self) -> bool:
164
143
  return self.decode_guide['has_pagination']
165
144
 
166
145
  def clean_html(self, html: str, hard_clean: bool = False):
@@ -183,6 +162,13 @@ class Decoder:
183
162
 
184
163
  return "\n".join([line.strip() for line in str(soup).splitlines() if line.strip()])
185
164
 
165
+ def _set_decode_guide(self) -> None:
166
+ decode_guide = FileOps.read_json(self.decode_guide_file)
167
+ self.decode_guide = self._get_element_by_key(decode_guide, 'host', self.host)
168
+ if self.decode_guide is None:
169
+ logger.critical(f'No decode guide found for host {self.host}')
170
+ raise ValueError(f'No decode guide found for host {self.host}')
171
+
186
172
  @staticmethod
187
173
  def _find_elements(soup: BeautifulSoup, decoder: dict):
188
174
  logger.debug('Finding elements...')
@@ -253,7 +239,7 @@ class Decoder:
253
239
  elements = [element.string for element in elements]
254
240
 
255
241
  if not elements:
256
- logger.error('No elements found, returning "None"')
242
+ logger.debug('No elements found, returning "None"')
257
243
  return None
258
244
 
259
245
  inverted = decoder.get('inverted')
@@ -271,8 +257,8 @@ class Decoder:
271
257
  return elements[0]
272
258
 
273
259
  @staticmethod
274
- def _get_element_by_key(json_data, key: str, value: str):
260
+ def _get_element_by_key(json_data, key: str, value: str) -> Optional[dict]:
275
261
  for item in json_data:
276
262
  if item[key] == value:
277
263
  return item
278
- return json_data[0]
264
+ return None
@@ -333,5 +333,52 @@
333
333
  "key": "href"
334
334
  }
335
335
  }
336
+ },
337
+ {
338
+ "host": "novelcool.com",
339
+ "has_pagination": false,
340
+ "title": {
341
+ "element": "h2",
342
+ "extract": {
343
+ "type": "text"
344
+ }
345
+ },
346
+ "content": {
347
+ "selector": "p:not(.chapter-end-mark)",
348
+ "array": true
349
+ },
350
+ "index": {
351
+ "selector": "div.chp-item a",
352
+ "array": true,
353
+ "inverted": true,
354
+ "extract": {
355
+ "type": "attr",
356
+ "key": "href"
357
+ }
358
+ }
359
+ },
360
+ {
361
+ "host": "freewebnovel.com",
362
+ "has_pagination": false,
363
+ "add_host_to_chapter": "true",
364
+ "title": {
365
+ "element": "span",
366
+ "class": "chapter",
367
+ "extract": {
368
+ "type": "text"
369
+ }
370
+ },
371
+ "content": {
372
+ "selector": "p:not([class])",
373
+ "array": true
374
+ },
375
+ "index": {
376
+ "selector": "ul#idData a",
377
+ "array": true,
378
+ "extract": {
379
+ "type": "attr",
380
+ "key": "href"
381
+ }
382
+ }
336
383
  }
337
384
  ]