web-novel-scraper 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
--- a/web_novel_scraper/decode.py
+++ b/web_novel_scraper/decode.py
@@ -16,6 +16,13 @@ DECODE_GUIDE_FILE = os.getenv('DECODE_GUIDE_FILE', f'{CURRENT_DIR}/decode_guide/
 
 XOR_SEPARATOR = "XOR"
 
+DEFAULT_REQUEST_CONFIG = {
+    "force_flaresolver": False,
+    "request_retries": 3,
+    "request_timeout": 20,
+    "request_time_between_retries": 3
+}
+
 try:
     with open(DECODE_GUIDE_FILE, 'r', encoding='UTF-8') as f:
         DECODE_GUIDE = json.load(f)
@@ -36,11 +43,30 @@ except Exception as e:
 class Decoder:
     host: str
     decode_guide: json
+    request_config: dict
 
     def __init__(self, host: str):
         self.host = host
         self.decode_guide = self._get_element_by_key(
             DECODE_GUIDE, 'host', host)
+        host_request_config = self.get_request_config()
+        self.request_config = DEFAULT_REQUEST_CONFIG | host_request_config
+
+    def get_request_config(self) -> dict:
+        request_config = self.decode_guide.get('request_config')
+        if request_config:
+            logger.debug(f'Host "{self.host}" has a custom request configuration on the Decode Guide file.')
+            return request_config
+
+        return DEFAULT_REQUEST_CONFIG
+
+    def is_index_inverted(self, host: str = None) -> bool:
+        if host:
+            decode_guide = self._get_element_by_key(DECODE_GUIDE, 'host', host)
+        else:
+            decode_guide = self.decode_guide
+
+        return decode_guide.get('index', {}).get('inverted', False)
 
     def get_chapter_urls(self, html: str) -> list[str]:
         logger.debug('Obtaining chapter URLs...')
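
The merge in __init__ uses the dict union operator (`|`, Python 3.9+), so per-host keys override the defaults while unset keys fall back. A minimal sketch of that resolution, with a hypothetical host override:

DEFAULT_REQUEST_CONFIG = {
    "force_flaresolver": False,
    "request_retries": 3,
    "request_timeout": 20,
    "request_time_between_retries": 3
}

# Hypothetical per-host override, as read from a decode guide "request_config" key
host_request_config = {"request_timeout": 60}

# With dict union, the right-hand operand wins on key conflicts
merged = DEFAULT_REQUEST_CONFIG | host_request_config
assert merged["request_timeout"] == 60   # host value overrides the default
assert merged["request_retries"] == 3    # unset keys keep their defaults
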
@@ -130,7 +156,7 @@ class Decoder:
             return ' '.join(elements)
         return elements
 
-    def has_pagination(self, host: str = None):
+    def has_pagination(self, host: str = None) -> bool:
         if host:
             decode_guide = self._get_element_by_key(DECODE_GUIDE, 'host', host)
             return decode_guide['has_pagination']
@@ -245,9 +271,8 @@ class Decoder:
         return elements[0]
 
     @staticmethod
-    def _get_element_by_key(json_data, key, value):
+    def _get_element_by_key(json_data, key: str, value: str):
         for item in json_data:
             if item[key] == value:
                 return item
-        logger.warning('Host not found, using default decoder.')
         return json_data[0]
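
With the warning removed, _get_element_by_key now falls back silently to the first entry in the guide list (the default decoder, per the deleted log message) when no host matches. A toy illustration of that fallback:

# The first guide acts as the default decoder for unknown hosts
guides = [{'host': 'default'}, {'host': 'novelbin.com'}]
assert Decoder._get_element_by_key(guides, 'host', 'unknown.example') == {'host': 'default'}
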
--- a/web_novel_scraper/decode_guide/decode_guide.json
+++ b/web_novel_scraper/decode_guide/decode_guide.json
@@ -160,6 +160,9 @@
   },
   {
     "host": "novelbin.com",
+    "request_config": {
+      "force_flaresolver": "true"
+    },
     "has_pagination": false,
     "title": {
       "element": "h2 a.chr-title",
@@ -295,5 +298,40 @@
         "key": "href"
       }
     }
+  },
+  {
+    "host": "scribblehub.com",
+    "request_config": {
+      "force_flaresolver": "true",
+      "request_timeout": 60
+    },
+    "has_pagination": true,
+    "title": {
+      "selector": "div.chapter-title",
+      "extract": {
+        "type": "text"
+      }
+    },
+    "content": {
+      "selector": "div.chp_raw p",
+      "array": true
+    },
+    "index": {
+      "selector": "div.toc ol li a",
+      "array": true,
+      "inverted": true,
+      "extract": {
+        "type": "attr",
+        "key": "href"
+      }
+    },
+    "next_page": {
+      "selector": "div ul.simple-pagination li a.next",
+      "array": false,
+      "extract": {
+        "type": "attr",
+        "key": "href"
+      }
+    }
   }
 ]
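
For illustration only, merging the scribblehub.com entry above with DEFAULT_REQUEST_CONFIG from decode.py should produce a request configuration like the following. Note that "force_flaresolver" arrives as the JSON string "true", which is merely truthy, not the boolean True:

# Expected value of Decoder('scribblehub.com').request_config after the merge
request_config = {
    'force_flaresolver': 'true',          # string from the JSON guide (truthy)
    'request_retries': 3,                 # default
    'request_timeout': 60,                # per-host override
    'request_time_between_retries': 3     # default
}
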
--- a/web_novel_scraper/novel_scraper.py
+++ b/web_novel_scraper/novel_scraper.py
@@ -10,7 +10,7 @@ from .decode import Decoder
 from .file_manager import FileManager
 from . import utils
 
-from . import request_manager
+from .request_manager import get_html_content
 
 logger = logger_manager.create_logger('NOVEL SCRAPPING')
 
@@ -277,8 +277,16 @@ class Novel:
         if chapters_url_from_toc_content is None:
             logger.error('Chapters url not found on toc_content')
             return False
-        self.chapters_url_list = [*self.chapters_url_list,
-                                  *chapters_url_from_toc_content]
+        # First we save a list of lists in case we need to invert the order
+        self.chapters_url_list.append(chapters_url_from_toc_content)
+
+        invert = self.decoder.is_index_inverted()
+        self.chapters_url_list = [
+            chapter
+            for chapters_url in (self.chapters_url_list[::-1] if invert else self.chapters_url_list)
+            for chapter in chapters_url
+        ]
+
         if self.scraper_behavior.auto_add_host:
             self.chapters_url_list = [
                 f'https://{self.host}{chapter_url}' for chapter_url in self.chapters_url_list]
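
The new comprehension flattens the per-page URL lists, optionally reversing the page order first. It reverses only the order of the pages, not the order of chapters within each page. A toy run under those semantics:

# Each inner list holds the chapter URLs scraped from one TOC page
chapters_url_list = [['ch1', 'ch2'], ['ch3', 'ch4']]
invert = True  # as reported by decoder.is_index_inverted()

flat = [
    chapter
    for chapters_url in (chapters_url_list[::-1] if invert else chapters_url_list)
    for chapter in chapters_url
]
assert flat == ['ch3', 'ch4', 'ch1', 'ch2']
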
@@ -464,6 +472,16 @@ class Novel:
             toc = self.decoder.clean_html(toc, hard_clean=hard_clean)
             self.file_manager.update_toc(toc, i)
 
+    def _request_html_content(self, url: str) -> Optional[str]:
+        request_config = self.decoder.request_config
+        force_flaresolver = request_config.get('force_flaresolver') or self.scraper_behavior.force_flaresolver
+        html_content = get_html_content(url,
+                                        retries=request_config.get('request_retries'),
+                                        timeout=request_config.get('request_timeout'),
+                                        time_between_retries=request_config.get('request_time_between_retries'),
+                                        force_flaresolver=force_flaresolver)
+        return html_content
+
     def _get_chapter(self,
                      chapter: Chapter,
                      reload: bool = False) -> Chapter | None:
@@ -481,8 +499,7 @@ class Novel:
             return chapter
 
         # Fetch fresh content
-        chapter.chapter_html = request_manager.get_html_content(chapter.chapter_url,
-                                                                force_flaresolver=self.scraper_behavior.force_flaresolver)
+        chapter.chapter_html = self._request_html_content(chapter.chapter_url)
         if not chapter.chapter_html:
             logger.error(f'No content found on link {chapter.chapter_url}')
             return chapter
@@ -501,7 +518,11 @@ class Novel:
         if content:
             return content
 
-        content = request_manager.get_html_content(url)
+        if utils.check_incomplete_url(url):
+            url = self.toc_main_url + url
+
+        # Fetch fresh content
+        content = self._request_html_content(url)
         if not content:
             logger.warning(f'No content found on link {url}')
             sys.exit(1)
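
Relative TOC links are now completed against the main TOC URL before fetching. A sketch with hypothetical values, assuming a ScribbleHub-style query-string pagination link:

toc_main_url = 'https://www.scribblehub.com/series/123/some-novel/'  # hypothetical
url = '?toc=2'                                                        # hypothetical next_page href
if utils.check_incomplete_url(url):   # True: query-only URL
    url = toc_main_url + url
# url is now 'https://www.scribblehub.com/series/123/some-novel/?toc=2'
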
--- a/web_novel_scraper/request_manager.py
+++ b/web_novel_scraper/request_manager.py
@@ -4,6 +4,7 @@ from . import logger_manager
 from dotenv import load_dotenv
 import json
 import time
+from typing import Optional
 
 load_dotenv()
 
@@ -13,45 +14,52 @@ FORCE_FLARESOLVER = os.getenv('FORCE_FLARESOLVER', '0') == '1'
 
 logger = logger_manager.create_logger('GET HTML CONTENT')
 
-def get_request(url: str,
-                timeout: int = 20,
-                retries: int = 3,
-                time_between_retries: int = 1) -> requests.Response | None:
-    logger.debug(f'Starting get_request for {url} with timeout={timeout}, retries={retries}, time_between_retries={time_between_retries}')
+
+def _get_request(url: str,
+                 timeout: int,
+                 retries: int,
+                 time_between_retries: int) -> Optional[requests.Response]:
+    logger.debug(
+        f'Starting get_request for "{url}" with timeout={timeout}, '
+        f'retries={retries}, '
+        f'time_between_retries={time_between_retries}')
     for attempt in range(retries):
-        logger.debug(f'Attempt {attempt + 1} for {url}')
+        logger.debug(f'Attempt {attempt + 1} for "{url}"')
         try:
             response = requests.get(url, timeout=timeout)
             response.raise_for_status()
-            logger.debug(f'Successful response for {url} on attempt {attempt + 1}')
+            logger.debug(f'Successful response for "{url}" on attempt {attempt + 1}')
             return response
         except requests.exceptions.ConnectionError as e:
-            logger.error(f'Connection error ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'Connection error ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.Timeout as e:
-            logger.error(f'Request timed out ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'Request timed out ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.HTTPError as e:
-            logger.error(f'HTTP error ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'HTTP error ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.InvalidSchema as e:
-            logger.error(f'Invalid URL schema for "{url}": {e}')
+            logger.debug(f'Invalid URL schema for "{url}": {e}')
             break  # Don't retry on invalid schema
         except requests.exceptions.RequestException as e:
-            logger.error(f'Request failed ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'Request failed ({attempt + 1}/{retries}): {e}')
 
         if attempt < retries - 1:
             logger.debug(f'Waiting {time_between_retries} seconds before retrying')
             time.sleep(time_between_retries)  # Wait before retrying
-    logger.debug(f'Failed to get a successful response for {url} after {retries} attempts')
+    logger.warning(f'Failed to get a successful response for "{url}" after {retries} attempts using common HTTP Request')
     return None
 
 
-def get_request_flaresolver(url: str,
-                            timeout: int = 20,
-                            flaresolver_url: str = FLARESOLVER_URL,
-                            retries: int = 3,
-                            time_between_retries: int = 1) -> requests.Response | None:
-    logger.debug(f'Starting get_request_flaresolver for {url} with timeout={timeout}, retries={retries}, time_between_retries={time_between_retries}')
+def _get_request_flaresolver(url: str,
+                             timeout: int,
+                             retries: int,
+                             time_between_retries: int,
+                             flaresolver_url: str) -> Optional[requests.Response]:
+    logger.debug(
+        f'Starting get_request_flaresolver for "{url}" with timeout={timeout}, '
+        f'retries={retries}, '
+        f'time_between_retries={time_between_retries}')
     for attempt in range(retries):
-        logger.debug(f'Attempt {attempt + 1} for {url} using FlareSolver')
+        logger.debug(f'Attempt {attempt + 1} for "{url}" using FlareSolver')
         try:
             response = requests.post(
                 flaresolver_url,
@@ -64,71 +72,76 @@ def get_request_flaresolver(url: str,
                 timeout=timeout
             )
             response.raise_for_status()
-            logger.debug(f'Successful response for {url} on attempt {attempt + 1} using FlareSolver')
+            logger.debug(f'Successful response for "{url}" on attempt {attempt + 1} using FlareSolver')
             return response
 
         except requests.exceptions.ConnectionError as e:
-            logger.error(f'Connection error ({attempt + 1}/{retries}), check FlareSolver host: {flaresolver_url}: {e}')
+            logger.warning(f'Connection error with flaresolver (URL: "{flaresolver_url}"): {e}')
+            logger.warning(f'If the url is incorrect, set the env variable "FLARESOLVER_URL" to the correct value')
+            logger.warning('If FlareSolver is not installed in your machine, consider installing it.')
+            break  # Don't retry on Connection Error
         except requests.exceptions.Timeout as e:
-            logger.error(f'Request timed out ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'Request timed out ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.InvalidSchema as e:
-            logger.error(f'Invalid FlareSolver URL schema "{flaresolver_url}": {e}')
+            logger.debug(f'Invalid FlareSolver URL schema "{flaresolver_url}": {e}')
             break  # Don't retry on invalid schema
         except requests.exceptions.HTTPError as e:
-            logger.error(f'HTTP error ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'HTTP error ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.RequestException as e:
-            logger.error(f'Request failed ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'Request failed ({attempt + 1}/{retries}): {e}')
         except json.JSONDecodeError as e:
-            logger.error(f'Invalid JSON response ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'Invalid JSON response ({attempt + 1}/{retries}): {e}')
 
         if attempt < retries - 1:
             logger.debug(f'Waiting {time_between_retries} seconds before retrying')
             time.sleep(time_between_retries)  # Wait before retrying
-    logger.debug(f'Failed to get a successful response for {url} using FlareSolver after {retries} attempts')
+
+    logger.warning(f'Failed to get a successful response for "{url}" using FlareSolver after {retries} attempts')
     return None
 
 
 def get_html_content(url: str,
-                     retries: int = 5,
-                     flaresolver: bool = True,
+                     retries: int = 3,
+                     timeout: int = 20,
+                     time_between_retries: int = 3,
                      flaresolver_url: str = FLARESOLVER_URL,
-                     time_between_retries: int = 1,
-                     force_flaresolver: bool = FORCE_FLARESOLVER) -> str | None:
-    logger.debug(f'Starting get_html_content for {url} with retries={retries}, flaresolver={flaresolver}, flaresolver_url={flaresolver_url}, time_between_retries={time_between_retries}, force_flaresolver={force_flaresolver}')
+                     force_flaresolver: bool = FORCE_FLARESOLVER) -> Optional[str]:
+    logger.debug(
+        f'Requesting HTML Content for "{url}" with '
+        f'retries: "{retries}", '
+        f'timeout: "{timeout}", '
+        f'time between retries: "{time_between_retries}"')
+    if force_flaresolver:
+        logger.debug('Will directly try with FlareSolver')
+
     # First try with common HTTP request
     if not force_flaresolver:
-        response = get_request(
-            url, timeout=20, retries=retries, time_between_retries=time_between_retries)
-        if not response:
-            logger.warning(f'Failed to get response from {url} using common HTTP request')
-        elif not response.ok:
-            logger.warning(f'Response with errors from {url} using common HTTP request')
-        else:
-            logger.debug(f'Successfully retrieved HTML content from {url} using common HTTP request')
+        response = _get_request(url,
+                                timeout=timeout,
+                                retries=retries,
+                                time_between_retries=time_between_retries)
+        if response and response.ok:
+            logger.debug(f'Successfully retrieved HTML content from "{url}" using common HTTP request')
             return response.text
 
-    # If flaresolver is disabled, return None
-    if not flaresolver:
-        logger.debug(f'Flaresolver is disabled, returning None for {url}')
-        return None
-
     # Try with Flaresolver
-    logger.debug(f'Trying with Flaresolver for {url}')
-    response = get_request_flaresolver(
-        url, timeout=20, flaresolver_url=flaresolver_url, time_between_retries=time_between_retries)
-    if not response:
-        logger.critical(f'Failed to get response from {url} using FlareSolver')
-        return None
-    if not response.ok:
-        logger.critical(f'Response with errors from {url} using FlareSolver')
+    logger.debug(f'Trying with Flaresolver for "{url}"')
+    response = _get_request_flaresolver(url,
+                                        timeout=timeout,
+                                        retries=retries,
+                                        time_between_retries=time_between_retries,
+                                        flaresolver_url=flaresolver_url)
+    if not response or not response.ok:
+        logger.warning(f'Failed all attempts to get HTML content from "{url}"')
         return None
 
     response_json = response.json()
     if 'solution' not in response_json:
-        logger.critical(f'No solution found in FlareSolver response for {url}')
+        logger.warning(f'No solution found in FlareSolver response for "{url}"')
         return None
     if 'response' not in response_json['solution']:
-        logger.critical(f'No response found in FlareSolver solution for {url}')
+        logger.warning(f'No response found in FlareSolver solution for "{url}"')
         return None
-    logger.debug(f'Successfully retrieved HTML content from {url} using FlareSolver')
+
+    logger.debug(f'Successfully retrieved HTML content from "{url}" using FlareSolver')
     return response_json['solution']['response']
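
Callers now control the retry budget per host instead of relying on hard-coded values. A hypothetical call, mirroring what Novel._request_html_content passes through:

html = get_html_content('https://example.com/novel/toc',   # hypothetical URL
                        retries=3,
                        timeout=20,
                        time_between_retries=3,
                        force_flaresolver=False)
if html is None:
    # Both the plain HTTP attempt and the FlareSolver fallback failed
    ...
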
--- a/web_novel_scraper/utils.py
+++ b/web_novel_scraper/utils.py
@@ -64,3 +64,10 @@ def check_exclusive_params(param1: any, param2: any) -> bool:
 
 def create_volume_id(n: int):
     return f'v{n:02}'
+
+def check_incomplete_url(url: str) -> bool:
+    if url.startswith('?') or url.startswith('#'):
+        return True
+
+    parsed = urlparse(url)
+    return not parsed.scheme or not parsed.netloc
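
The helper treats query-only, fragment-only, and scheme-less URLs as incomplete. It relies on urlparse, which utils.py presumably already imports, since this hunk adds no import line. Expected behaviour:

assert check_incomplete_url('?toc=2') is True            # query-only
assert check_incomplete_url('#content1') is True         # fragment-only
assert check_incomplete_url('/series/123/') is True      # no scheme or netloc
assert check_incomplete_url('https://example.com/a') is False
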
--- a/web_novel_scraper/version.py
+++ b/web_novel_scraper/version.py
@@ -1 +1 @@
-__version__ = "1.1.0"
+__version__ = "1.1.1"
--- a/web_novel_scraper-1.1.0.dist-info/METADATA
+++ b/web_novel_scraper-1.1.1.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: web-novel-scraper
-Version: 1.1.0
+Version: 1.1.1
 Summary: Python tool that allows you to scrape web novels from various sources and save them to more readable formats like EPUB.
 Project-URL: Homepage, https://github.com/ImagineBrkr/web-novel-scraper
 Project-URL: Documentation, https://web-novel-scraper.readthedocs.io
--- a/web_novel_scraper-1.1.0.dist-info/RECORD
+++ b/web_novel_scraper-1.1.1.dist-info/RECORD
@@ -1,18 +1,18 @@
 web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 web_novel_scraper/__main__.py,sha256=OQQVX5CttmAkUwdrnjBSjKPaoh_boUI2ysHi3rLGOSs,17769
-web_novel_scraper/decode.py,sha256=QxPjoYI1t4bf0zAf_7uLRrpsboi8DwsD1BNZUiHO4gc,10150
+web_novel_scraper/decode.py,sha256=U-78PhJ4SU2hiUmfAWeWGEBJ3YSoCW3Lupw9cUqQuI0,11013
 web_novel_scraper/file_manager.py,sha256=qAqgqtaRb7QyVtyEOW2cMhPYWdKM6nJ69weUCYKwVtM,11862
 web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
-web_novel_scraper/novel_scraper.py,sha256=hXIIPelRfx-jfD9VSPheg6z04I4JKxQj7wVBPlpP1go,28452
-web_novel_scraper/request_manager.py,sha256=VtGpRi5b_Dp3h8viCdt7yMCb9M21Lk7oLC_Q_0EkXH8,6448
-web_novel_scraper/utils.py,sha256=vq5ROuPv04k3MhbksTe0ni_yP0i_a7T_33mkBB1DUbQ,2076
-web_novel_scraper/version.py,sha256=LGVQyDsWifdACo7qztwb8RWWHds1E7uQ-ZqD8SAjyw4,22
+web_novel_scraper/novel_scraper.py,sha256=DsYnY15s8cZZ2w8pRvmD3_NJw54xarhcnEQdvnTD8XI,29421
+web_novel_scraper/request_manager.py,sha256=WU8LG6D_fqmDapX6wpVwpQQSItcNU8Qb9dMAlLCYI8U,6621
+web_novel_scraper/utils.py,sha256=dPp7D2ji9mC2nFydqxsJ_9vkAntA_3VTt8ZmG-F1f78,2270
+web_novel_scraper/version.py,sha256=q8_5C0f-8mHWNb6mMw02zlYPnEGXBqvOmP3z0CEwZKM,22
 web_novel_scraper/custom_processor/__init__.py,sha256=iy4tjivMjshSzc52--aa-jK53qu9VwdK-6p4vuQc6oc,103
 web_novel_scraper/custom_processor/custom_processor.py,sha256=h1MPl6JU_C2Mc7SqK70LsNQHpDzSL6QyraMIQ87HcMM,870
 web_novel_scraper/custom_processor/sites/genesis.py,sha256=xV0eybI0ieHR5gn4yWXI74l99Eayhqs16PIYs-BrPjE,1843
 web_novel_scraper/custom_processor/sites/royalroad.py,sha256=_2PsFC_w3RJCUkAPoRn-7R2jlzl3XsG4WYtRaQkp6lg,787
-web_novel_scraper/decode_guide/decode_guide.json,sha256=DbcfnyRNOVXZd6ar1HDCHxkKgnmR3ziJ-B4GOFcDMEs,7584
-web_novel_scraper-1.1.0.dist-info/METADATA,sha256=Llcez3yLJTICPNMAoO1aZShywK2soma1kmjl2OA3tYA,8423
-web_novel_scraper-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-web_novel_scraper-1.1.0.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
-web_novel_scraper-1.1.0.dist-info/RECORD,,
+web_novel_scraper/decode_guide/decode_guide.json,sha256=gNVencLtK0HmZPlubTm1wA7eatWADCxJ_LCOYWHWuA0,8556
+web_novel_scraper-1.1.1.dist-info/METADATA,sha256=ow5piBhzzo4mZ0secvHrqc4KCCt4VInpDa09Qo9l4AE,8423
+web_novel_scraper-1.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+web_novel_scraper-1.1.1.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
+web_novel_scraper-1.1.1.dist-info/RECORD,,