web-novel-scraper 1.1.1__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- web_novel_scraper/__main__.py +116 -94
- web_novel_scraper/config_manager.py +84 -0
- web_novel_scraper/decode.py +30 -44
- web_novel_scraper/decode_guide/decode_guide.json +47 -0
- web_novel_scraper/file_manager.py +226 -257
- web_novel_scraper/novel_scraper.py +64 -41
- web_novel_scraper/request_manager.py +2 -2
- web_novel_scraper/utils.py +132 -2
- web_novel_scraper/version.py +1 -1
- {web_novel_scraper-1.1.1.dist-info → web_novel_scraper-2.0.0.dist-info}/METADATA +1 -1
- web_novel_scraper-2.0.0.dist-info/RECORD +19 -0
- web_novel_scraper-1.1.1.dist-info/RECORD +0 -18
- {web_novel_scraper-1.1.1.dist-info → web_novel_scraper-2.0.0.dist-info}/WHEEL +0 -0
- {web_novel_scraper-1.1.1.dist-info → web_novel_scraper-2.0.0.dist-info}/entry_points.txt +0 -0
web_novel_scraper/decode.py
CHANGED
@@ -1,19 +1,14 @@
|
|
1
|
-
import os
|
2
1
|
import json
|
3
|
-
from pathlib import Path
|
4
2
|
from typing import Optional
|
5
3
|
|
6
4
|
from . import logger_manager
|
7
5
|
from .custom_processor.custom_processor import ProcessorRegistry
|
6
|
+
from .utils import FileOps
|
8
7
|
|
9
8
|
from bs4 import BeautifulSoup
|
10
9
|
|
11
10
|
logger = logger_manager.create_logger('DECODE HTML')
|
12
11
|
|
13
|
-
CURRENT_DIR = Path(__file__).resolve().parent
|
14
|
-
|
15
|
-
DECODE_GUIDE_FILE = os.getenv('DECODE_GUIDE_FILE', f'{CURRENT_DIR}/decode_guide/decode_guide.json')
|
16
|
-
|
17
12
|
XOR_SEPARATOR = "XOR"
|
18
13
|
|
19
14
|
DEFAULT_REQUEST_CONFIG = {
|
@@ -23,32 +18,19 @@ DEFAULT_REQUEST_CONFIG = {
|
|
23
18
|
"request_time_between_retries": 3
|
24
19
|
}
|
25
20
|
|
26
|
-
try:
|
27
|
-
with open(DECODE_GUIDE_FILE, 'r', encoding='UTF-8') as f:
|
28
|
-
DECODE_GUIDE = json.load(f)
|
29
|
-
except FileNotFoundError:
|
30
|
-
logger.error(f"File {DECODE_GUIDE_FILE} not found.")
|
31
|
-
raise
|
32
|
-
except PermissionError:
|
33
|
-
logger.error(f"Permission error {DECODE_GUIDE_FILE}.")
|
34
|
-
raise
|
35
|
-
except json.JSONDecodeError:
|
36
|
-
logger.error(f"Json Decode error {DECODE_GUIDE_FILE}.")
|
37
|
-
raise
|
38
|
-
except Exception as e:
|
39
|
-
logger.error(f"Error {DECODE_GUIDE_FILE}: {e}")
|
40
|
-
raise
|
41
|
-
|
42
|
-
|
43
21
|
class Decoder:
|
44
22
|
host: str
|
23
|
+
decode_guide_file: str
|
45
24
|
decode_guide: json
|
46
25
|
request_config: dict
|
47
26
|
|
48
|
-
def __init__(self, host: str):
|
27
|
+
def __init__(self, host: str, decode_guide_file: str):
|
28
|
+
self.decode_guide_file = decode_guide_file
|
29
|
+
self.set_host(host)
|
30
|
+
|
31
|
+
def set_host(self, host: str) -> None:
|
49
32
|
self.host = host
|
50
|
-
self.decode_guide = self._get_element_by_key(
|
51
|
-
DECODE_GUIDE, 'host', host)
|
33
|
+
self._set_decode_guide()
|
52
34
|
host_request_config = self.get_request_config()
|
53
35
|
self.request_config = DEFAULT_REQUEST_CONFIG | host_request_config
|
54
36
|
|
@@ -60,13 +42,14 @@ class Decoder:
|
|
60
42
|
|
61
43
|
return DEFAULT_REQUEST_CONFIG
|
62
44
|
|
63
|
-
def is_index_inverted(self
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
45
|
+
def is_index_inverted(self) -> bool:
|
46
|
+
return self.decode_guide.get('index', {}).get('inverted', False)
|
47
|
+
|
48
|
+
def save_title_to_content(self) -> bool:
|
49
|
+
return self.decode_guide.get('save_title_to_content', False)
|
68
50
|
|
69
|
-
|
51
|
+
def add_host_to_chapter(self) -> bool:
|
52
|
+
return self.decode_guide.get('add_host_to_chapter', False)
|
70
53
|
|
71
54
|
def get_chapter_urls(self, html: str) -> list[str]:
|
72
55
|
logger.debug('Obtaining chapter URLs...')
|
@@ -124,12 +107,12 @@ class Decoder:
|
|
124
107
|
def decode_html(self, html: str, content_type: str) -> str | list[str] | None:
|
125
108
|
logger.debug(f'Decoding HTML...')
|
126
109
|
logger.debug(f'Content type: {content_type}')
|
127
|
-
logger.debug(f'Decode guide: {DECODE_GUIDE_FILE}')
|
110
|
+
logger.debug(f'Decode guide: {self.decode_guide_file}')
|
128
111
|
logger.debug(f'Host: {self.host}')
|
129
112
|
if not content_type in self.decode_guide:
|
130
|
-
logger.critical(f'{content_type} key does not exists on decode guide {DECODE_GUIDE_FILE}'
|
113
|
+
logger.critical(f'{content_type} key does not exists on decode guide {self.decode_guide_file}'
|
131
114
|
f'for host {self.host}')
|
132
|
-
raise ValueError(f'{content_type} key does not exists on decode guide {DECODE_GUIDE_FILE}'
|
115
|
+
raise ValueError(f'{content_type} key does not exists on decode guide {self.decode_guide_file}'
|
133
116
|
f'for host {self.host}')
|
134
117
|
|
135
118
|
if ProcessorRegistry.has_processor(self.host, content_type):
|
@@ -147,7 +130,7 @@ class Decoder:
|
|
147
130
|
decoder = self.decode_guide[content_type]
|
148
131
|
elements = self._find_elements(soup, decoder)
|
149
132
|
if not elements:
|
150
|
-
logger.
|
133
|
+
logger.debug(f'{content_type} not found on html using {self.decode_guide_file} '
|
151
134
|
f'for host {self.host}')
|
152
135
|
|
153
136
|
# Investigate this conditional
|
@@ -156,11 +139,7 @@ class Decoder:
|
|
156
139
|
return ' '.join(elements)
|
157
140
|
return elements
|
158
141
|
|
159
|
-
def has_pagination(self
|
160
|
-
if host:
|
161
|
-
decode_guide = self._get_element_by_key(DECODE_GUIDE, 'host', host)
|
162
|
-
return decode_guide['has_pagination']
|
163
|
-
|
142
|
+
def has_pagination(self) -> bool:
|
164
143
|
return self.decode_guide['has_pagination']
|
165
144
|
|
166
145
|
def clean_html(self, html: str, hard_clean: bool = False):
|
@@ -183,6 +162,13 @@ class Decoder:
|
|
183
162
|
|
184
163
|
return "\n".join([line.strip() for line in str(soup).splitlines() if line.strip()])
|
185
164
|
|
165
|
+
def _set_decode_guide(self) -> None:
|
166
|
+
decode_guide = FileOps.read_json(self.decode_guide_file)
|
167
|
+
self.decode_guide = self._get_element_by_key(decode_guide, 'host', self.host)
|
168
|
+
if self.decode_guide is None:
|
169
|
+
logger.critical(f'No decode guide found for host {self.host}')
|
170
|
+
raise ValueError(f'No decode guide found for host {self.host}')
|
171
|
+
|
186
172
|
@staticmethod
|
187
173
|
def _find_elements(soup: BeautifulSoup, decoder: dict):
|
188
174
|
logger.debug('Finding elements...')
|
@@ -253,7 +239,7 @@ class Decoder:
|
|
253
239
|
elements = [element.string for element in elements]
|
254
240
|
|
255
241
|
if not elements:
|
256
|
-
logger.
|
242
|
+
logger.debug('No elements found, returning "None"')
|
257
243
|
return None
|
258
244
|
|
259
245
|
inverted = decoder.get('inverted')
|
@@ -271,8 +257,8 @@ class Decoder:
|
|
271
257
|
return elements[0]
|
272
258
|
|
273
259
|
@staticmethod
|
274
|
-
def _get_element_by_key(json_data, key: str, value: str):
|
260
|
+
def _get_element_by_key(json_data, key: str, value: str) -> Optional[dict]:
|
275
261
|
for item in json_data:
|
276
262
|
if item[key] == value:
|
277
263
|
return item
|
278
|
-
return
|
264
|
+
return None
|
web_novel_scraper/decode_guide/decode_guide.json
CHANGED
@@ -333,5 +333,52 @@
|
|
333
333
|
"key": "href"
|
334
334
|
}
|
335
335
|
}
|
336
|
+
},
|
337
|
+
{
|
338
|
+
"host": "novelcool.com",
|
339
|
+
"has_pagination": false,
|
340
|
+
"title": {
|
341
|
+
"element": "h2",
|
342
|
+
"extract": {
|
343
|
+
"type": "text"
|
344
|
+
}
|
345
|
+
},
|
346
|
+
"content": {
|
347
|
+
"selector": "p:not(.chapter-end-mark)",
|
348
|
+
"array": true
|
349
|
+
},
|
350
|
+
"index": {
|
351
|
+
"selector": "div.chp-item a",
|
352
|
+
"array": true,
|
353
|
+
"inverted": true,
|
354
|
+
"extract": {
|
355
|
+
"type": "attr",
|
356
|
+
"key": "href"
|
357
|
+
}
|
358
|
+
}
|
359
|
+
},
|
360
|
+
{
|
361
|
+
"host": "freewebnovel.com",
|
362
|
+
"has_pagination": false,
|
363
|
+
"add_host_to_chapter": "true",
|
364
|
+
"title": {
|
365
|
+
"element": "span",
|
366
|
+
"class": "chapter",
|
367
|
+
"extract": {
|
368
|
+
"type": "text"
|
369
|
+
}
|
370
|
+
},
|
371
|
+
"content": {
|
372
|
+
"selector": "p:not([class])",
|
373
|
+
"array": true
|
374
|
+
},
|
375
|
+
"index": {
|
376
|
+
"selector": "ul#idData a",
|
377
|
+
"array": true,
|
378
|
+
"extract": {
|
379
|
+
"type": "attr",
|
380
|
+
"key": "href"
|
381
|
+
}
|
382
|
+
}
|
336
383
|
}
|
337
384
|
]
|