web-novel-scraper 1.1.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- web_novel_scraper/__main__.py +116 -94
- web_novel_scraper/config_manager.py +84 -0
- web_novel_scraper/decode.py +49 -38
- web_novel_scraper/decode_guide/decode_guide.json +85 -0
- web_novel_scraper/file_manager.py +226 -257
- web_novel_scraper/novel_scraper.py +90 -46
- web_novel_scraper/request_manager.py +70 -57
- web_novel_scraper/utils.py +139 -2
- web_novel_scraper/version.py +1 -1
- {web_novel_scraper-1.1.0.dist-info → web_novel_scraper-2.0.0.dist-info}/METADATA +1 -1
- web_novel_scraper-2.0.0.dist-info/RECORD +19 -0
- web_novel_scraper-1.1.0.dist-info/RECORD +0 -18
- {web_novel_scraper-1.1.0.dist-info → web_novel_scraper-2.0.0.dist-info}/WHEEL +0 -0
- {web_novel_scraper-1.1.0.dist-info → web_novel_scraper-2.0.0.dist-info}/entry_points.txt +0 -0
web_novel_scraper/decode.py
CHANGED
@@ -1,46 +1,55 @@
|
|
1
|
-
import os
|
2
1
|
import json
|
3
|
-
from pathlib import Path
|
4
2
|
from typing import Optional
|
5
3
|
|
6
4
|
from . import logger_manager
|
7
5
|
from .custom_processor.custom_processor import ProcessorRegistry
|
6
|
+
from .utils import FileOps
|
8
7
|
|
9
8
|
from bs4 import BeautifulSoup
|
10
9
|
|
11
10
|
logger = logger_manager.create_logger('DECODE HTML')
|
12
11
|
|
13
|
-
CURRENT_DIR = Path(__file__).resolve().parent
|
14
|
-
|
15
|
-
DECODE_GUIDE_FILE = os.getenv('DECODE_GUIDE_FILE', f'{CURRENT_DIR}/decode_guide/decode_guide.json')
|
16
|
-
|
17
12
|
XOR_SEPARATOR = "XOR"
|
18
13
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
except PermissionError:
|
26
|
-
logger.error(f"Permission error {DECODE_GUIDE_FILE}.")
|
27
|
-
raise
|
28
|
-
except json.JSONDecodeError:
|
29
|
-
logger.error(f"Json Decode error {DECODE_GUIDE_FILE}.")
|
30
|
-
raise
|
31
|
-
except Exception as e:
|
32
|
-
logger.error(f"Error {DECODE_GUIDE_FILE}: {e}")
|
33
|
-
raise
|
34
|
-
|
14
|
+
# Baseline request settings. Decoder.set_host merges the host's own
# "request_config" entry from the decode guide over these values.
DEFAULT_REQUEST_CONFIG = dict(
    force_flaresolver=False,
    request_retries=3,
    request_timeout=20,
    request_time_between_retries=3,
)
|
35
20
|
|
36
21
|
class Decoder:
|
37
22
|
host: str
|
23
|
+
decode_guide_file: str
|
38
24
|
decode_guide: json
|
25
|
+
request_config: dict
|
39
26
|
|
40
|
-
def __init__(self, host: str):
|
27
|
+
def __init__(self, host: str, decode_guide_file: str):
    """Create a decoder for ``host`` backed by ``decode_guide_file``.

    Args:
        host: Host name whose entry is looked up in the decode guide.
        decode_guide_file: Path of the JSON decode guide to read.

    Raises:
        ValueError: If the decode guide has no entry for ``host``.
    """
    # The guide path has to be stored first: set_host() reads it to load
    # the host's entry.
    self.decode_guide_file = decode_guide_file
    self.set_host(host=host)
|
30
|
+
|
31
|
+
def set_host(self, host: str) -> None:
    """Point the decoder at ``host`` and refresh the state derived from it.

    Reloads the host's decode guide entry and rebuilds ``request_config``
    by layering the host's request settings over the defaults.

    Args:
        host: Host name to look up in the decode guide file.

    Raises:
        ValueError: If the decode guide has no entry for ``host``.
    """
    self.host = host
    # Must run after self.host is assigned: the lookup is keyed on it.
    self._set_decode_guide()
    # Right-hand operand wins, so host-specific values override defaults.
    self.request_config = DEFAULT_REQUEST_CONFIG | self.get_request_config()
|
36
|
+
|
37
|
+
def get_request_config(self) -> dict:
    """Return the request settings that apply to the current host.

    Returns:
        A new dict holding the host's ``request_config`` entry from the
        decode guide when it is present and non-empty, otherwise a new
        dict holding the default request settings. The host entry is
        returned as declared, i.e. it may contain only some of the keys.
    """
    request_config = self.decode_guide.get('request_config')
    if request_config:
        logger.debug(f'Host "{self.host}" has a custom request configuration on the Decode Guide file.')
        # Copy so a caller editing the result cannot alter the loaded
        # decode guide in place.
        return dict(request_config)

    # Copy so a caller editing the result cannot corrupt the module-level
    # defaults shared by every Decoder instance.
    return dict(DEFAULT_REQUEST_CONFIG)
|
44
|
+
|
45
|
+
def is_index_inverted(self) -> bool:
    """Report the ``inverted`` flag of the host's ``index`` guide section.

    Returns:
        ``True`` when the flag is set, ``False`` when the flag or the
        whole ``index`` section is missing or null.
    """
    # ``or {}`` also covers an explicit null "index" entry, which would
    # otherwise fail on the chained ``.get``.
    index_guide = self.decode_guide.get('index') or {}
    inverted = index_guide.get('inverted', False)
    if isinstance(inverted, str):
        # Guides may spell booleans as JSON strings; only the plainly
        # negative spellings are treated as False.
        return inverted.strip().lower() not in ('', 'false', '0', 'no', 'off')
    return bool(inverted)
|
47
|
+
|
48
|
+
def save_title_to_content(self) -> bool:
    """Report the ``save_title_to_content`` flag of the host's decode guide.

    Returns:
        ``True`` when the flag is set, ``False`` when it is missing.
    """
    flag = self.decode_guide.get('save_title_to_content', False)
    if isinstance(flag, str):
        # Guides may spell booleans as JSON strings ("true"/"false"); a
        # bare truthiness check would read "false" as enabled.
        return flag.strip().lower() not in ('', 'false', '0', 'no', 'off')
    return bool(flag)
|
50
|
+
|
51
|
+
def add_host_to_chapter(self) -> bool:
    """Report the ``add_host_to_chapter`` flag of the host's decode guide.

    Returns:
        ``True`` when the flag is set, ``False`` when it is missing.
    """
    # NOTE(review): presumably tells the scraper to prefix chapter links
    # with the host -- confirm against the caller.
    flag = self.decode_guide.get('add_host_to_chapter', False)
    if isinstance(flag, str):
        # The bundled guide stores this flag as the string "true", so the
        # value is normalised here; a bare truthiness check would read
        # "false" as enabled.
        return flag.strip().lower() not in ('', 'false', '0', 'no', 'off')
    return bool(flag)
|
44
53
|
|
45
54
|
def get_chapter_urls(self, html: str) -> list[str]:
|
46
55
|
logger.debug('Obtaining chapter URLs...')
|
@@ -98,12 +107,12 @@ class Decoder:
|
|
98
107
|
def decode_html(self, html: str, content_type: str) -> str | list[str] | None:
|
99
108
|
logger.debug(f'Decoding HTML...')
|
100
109
|
logger.debug(f'Content type: {content_type}')
|
101
|
-
logger.debug(f'Decode guide: {
|
110
|
+
logger.debug(f'Decode guide: {self.decode_guide_file}')
|
102
111
|
logger.debug(f'Host: {self.host}')
|
103
112
|
if not content_type in self.decode_guide:
|
104
|
-
logger.critical(f'{content_type} key does not exists on decode guide {
|
113
|
+
logger.critical(f'{content_type} key does not exists on decode guide {self.decode_guide_file}'
|
105
114
|
f'for host {self.host}')
|
106
|
-
raise ValueError(f'{content_type} key does not exists on decode guide {
|
115
|
+
raise ValueError(f'{content_type} key does not exists on decode guide {self.decode_guide_file}'
|
107
116
|
f'for host {self.host}')
|
108
117
|
|
109
118
|
if ProcessorRegistry.has_processor(self.host, content_type):
|
@@ -121,7 +130,7 @@ class Decoder:
|
|
121
130
|
decoder = self.decode_guide[content_type]
|
122
131
|
elements = self._find_elements(soup, decoder)
|
123
132
|
if not elements:
|
124
|
-
logger.
|
133
|
+
logger.debug(f'{content_type} not found on html using {self.decode_guide_file} '
|
125
134
|
f'for host {self.host}')
|
126
135
|
|
127
136
|
# Investigate this conditional
|
@@ -130,11 +139,7 @@ class Decoder:
|
|
130
139
|
return ' '.join(elements)
|
131
140
|
return elements
|
132
141
|
|
133
|
-
def has_pagination(self
|
134
|
-
if host:
|
135
|
-
decode_guide = self._get_element_by_key(DECODE_GUIDE, 'host', host)
|
136
|
-
return decode_guide['has_pagination']
|
137
|
-
|
142
|
+
def has_pagination(self) -> bool:
    """Report the ``has_pagination`` flag of the host's decode guide.

    Returns:
        ``True`` when the flag is set. A guide entry that omits the key
        yields ``False``, matching the other flag accessors of this
        class, instead of raising ``KeyError``.
    """
    flag = self.decode_guide.get('has_pagination', False)
    if isinstance(flag, str):
        # Tolerate booleans spelled as JSON strings ("true"/"false").
        return flag.strip().lower() not in ('', 'false', '0', 'no', 'off')
    return bool(flag)
|
139
144
|
|
140
145
|
def clean_html(self, html: str, hard_clean: bool = False):
|
@@ -157,6 +162,13 @@ class Decoder:
|
|
157
162
|
|
158
163
|
return "\n".join([line.strip() for line in str(soup).splitlines() if line.strip()])
|
159
164
|
|
165
|
+
def _set_decode_guide(self) -> None:
    """Load the decode guide entry for ``self.host`` into ``self.decode_guide``.

    Reads ``self.decode_guide_file`` and selects the entry whose ``host``
    field equals ``self.host``.

    Raises:
        ValueError: If no entry matches ``self.host``. In that case
            ``self.decode_guide`` is left untouched, so a previously
            loaded guide is not replaced by ``None``.
    """
    all_guides = FileOps.read_json(self.decode_guide_file)
    host_guide = self._get_element_by_key(all_guides, 'host', self.host)
    if host_guide is None:
        logger.critical(f'No decode guide found for host {self.host}')
        raise ValueError(f'No decode guide found for host {self.host}')
    # Assign only after a successful lookup so a failure cannot clobber
    # the guide that is currently in use.
    self.decode_guide = host_guide
|
171
|
+
|
160
172
|
@staticmethod
|
161
173
|
def _find_elements(soup: BeautifulSoup, decoder: dict):
|
162
174
|
logger.debug('Finding elements...')
|
@@ -227,7 +239,7 @@ class Decoder:
|
|
227
239
|
elements = [element.string for element in elements]
|
228
240
|
|
229
241
|
if not elements:
|
230
|
-
logger.
|
242
|
+
logger.debug('No elements found, returning "None"')
|
231
243
|
return None
|
232
244
|
|
233
245
|
inverted = decoder.get('inverted')
|
@@ -245,9 +257,8 @@ class Decoder:
|
|
245
257
|
return elements[0]
|
246
258
|
|
247
259
|
@staticmethod
|
248
|
-
def _get_element_by_key(json_data, key, value):
|
260
|
+
def _get_element_by_key(json_data, key: str, value: str) -> Optional[dict]:
|
249
261
|
for item in json_data:
|
250
262
|
if item[key] == value:
|
251
263
|
return item
|
252
|
-
|
253
|
-
return json_data[0]
|
264
|
+
return None
|
@@ -160,6 +160,9 @@
|
|
160
160
|
},
|
161
161
|
{
|
162
162
|
"host": "novelbin.com",
|
163
|
+
"request_config": {
|
164
|
+
"force_flaresolver": "true"
|
165
|
+
},
|
163
166
|
"has_pagination": false,
|
164
167
|
"title": {
|
165
168
|
"element": "h2 a.chr-title",
|
@@ -295,5 +298,87 @@
|
|
295
298
|
"key": "href"
|
296
299
|
}
|
297
300
|
}
|
301
|
+
},
|
302
|
+
{
|
303
|
+
"host": "scribblehub.com",
|
304
|
+
"request_config": {
|
305
|
+
"force_flaresolver": "true",
|
306
|
+
"request_timeout": 60
|
307
|
+
},
|
308
|
+
"has_pagination": true,
|
309
|
+
"title": {
|
310
|
+
"selector": "div.chapter-title",
|
311
|
+
"extract": {
|
312
|
+
"type": "text"
|
313
|
+
}
|
314
|
+
},
|
315
|
+
"content": {
|
316
|
+
"selector": "div.chp_raw p",
|
317
|
+
"array": true
|
318
|
+
},
|
319
|
+
"index": {
|
320
|
+
"selector": "div.toc ol li a",
|
321
|
+
"array": true,
|
322
|
+
"inverted": true,
|
323
|
+
"extract": {
|
324
|
+
"type": "attr",
|
325
|
+
"key": "href"
|
326
|
+
}
|
327
|
+
},
|
328
|
+
"next_page": {
|
329
|
+
"selector": "div ul.simple-pagination li a.next",
|
330
|
+
"array": false,
|
331
|
+
"extract": {
|
332
|
+
"type": "attr",
|
333
|
+
"key": "href"
|
334
|
+
}
|
335
|
+
}
|
336
|
+
},
|
337
|
+
{
|
338
|
+
"host": "novelcool.com",
|
339
|
+
"has_pagination": false,
|
340
|
+
"title": {
|
341
|
+
"element": "h2",
|
342
|
+
"extract": {
|
343
|
+
"type": "text"
|
344
|
+
}
|
345
|
+
},
|
346
|
+
"content": {
|
347
|
+
"selector": "p:not(.chapter-end-mark)",
|
348
|
+
"array": true
|
349
|
+
},
|
350
|
+
"index": {
|
351
|
+
"selector": "div.chp-item a",
|
352
|
+
"array": true,
|
353
|
+
"inverted": true,
|
354
|
+
"extract": {
|
355
|
+
"type": "attr",
|
356
|
+
"key": "href"
|
357
|
+
}
|
358
|
+
}
|
359
|
+
},
|
360
|
+
{
|
361
|
+
"host": "freewebnovel.com",
|
362
|
+
"has_pagination": false,
|
363
|
+
"add_host_to_chapter": "true",
|
364
|
+
"title": {
|
365
|
+
"element": "span",
|
366
|
+
"class": "chapter",
|
367
|
+
"extract": {
|
368
|
+
"type": "text"
|
369
|
+
}
|
370
|
+
},
|
371
|
+
"content": {
|
372
|
+
"selector": "p:not([class])",
|
373
|
+
"array": true
|
374
|
+
},
|
375
|
+
"index": {
|
376
|
+
"selector": "ul#idData a",
|
377
|
+
"array": true,
|
378
|
+
"extract": {
|
379
|
+
"type": "attr",
|
380
|
+
"key": "href"
|
381
|
+
}
|
382
|
+
}
|
298
383
|
}
|
299
384
|
]
|