web-novel-scraper 1.0.4__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

--- web_novel_scraper/decode.py
+++ web_novel_scraper/decode.py
@@ -12,11 +12,17 @@ logger = logger_manager.create_logger('DECODE HTML')
 
 CURRENT_DIR = Path(__file__).resolve().parent
 
-DECODE_GUIDE_FILE = os.getenv('DECODE_GUIDE_FILE', f'{
-    CURRENT_DIR}/decode_guide/decode_guide.json')
+DECODE_GUIDE_FILE = os.getenv('DECODE_GUIDE_FILE', f'{CURRENT_DIR}/decode_guide/decode_guide.json')
 
 XOR_SEPARATOR = "XOR"
 
+DEFAULT_REQUEST_CONFIG = {
+    "force_flaresolver": False,
+    "request_retries": 3,
+    "request_timeout": 20,
+    "request_time_between_retries": 3
+}
+
 try:
     with open(DECODE_GUIDE_FILE, 'r', encoding='UTF-8') as f:
         DECODE_GUIDE = json.load(f)
@@ -37,11 +43,30 @@ except Exception as e:
 class Decoder:
     host: str
     decode_guide: json
+    request_config: dict
 
     def __init__(self, host: str):
         self.host = host
         self.decode_guide = self._get_element_by_key(
             DECODE_GUIDE, 'host', host)
+        host_request_config = self.get_request_config()
+        self.request_config = DEFAULT_REQUEST_CONFIG | host_request_config
+
+    def get_request_config(self) -> dict:
+        request_config = self.decode_guide.get('request_config')
+        if request_config:
+            logger.debug(f'Host "{self.host}" has a custom request configuration on the Decode Guide file.')
+            return request_config
+
+        return DEFAULT_REQUEST_CONFIG
+
+    def is_index_inverted(self, host:str = None) -> bool:
+        if host:
+            decode_guide = self._get_element_by_key(DECODE_GUIDE, 'host', host)
+        else:
+            decode_guide = self.decode_guide
+
+        return decode_guide.get('index', {}).get('inverted', False)
 
     def get_chapter_urls(self, html: str) -> list[str]:
         logger.debug('Obtaining chapter URLs...')
@@ -131,7 +156,7 @@ class Decoder:
             return ' '.join(elements)
         return elements
 
-    def has_pagination(self, host: str = None):
+    def has_pagination(self, host: str = None) -> bool:
         if host:
             decode_guide = self._get_element_by_key(DECODE_GUIDE, 'host', host)
             return decode_guide['has_pagination']
@@ -246,9 +271,8 @@ class Decoder:
         return elements[0]
 
     @staticmethod
-    def _get_element_by_key(json_data, key, value):
+    def _get_element_by_key(json_data, key: str, value: str):
        for item in json_data:
            if item[key] == value:
                return item
-       logger.warning('Host not found, using default decoder.')
       return json_data[0]
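
Note: the new per-host request configuration in Decoder.__init__ is merged over DEFAULT_REQUEST_CONFIG with Python's dict union operator, so host keys win and anything the host omits keeps its default. A minimal sketch of that merge (the host override below is illustrative, not an entry from the shipped decode guide):

    DEFAULT_REQUEST_CONFIG = {
        "force_flaresolver": False,
        "request_retries": 3,
        "request_timeout": 20,
        "request_time_between_retries": 3
    }

    # Hypothetical host entry that only overrides the timeout
    host_request_config = {"request_timeout": 60}

    request_config = DEFAULT_REQUEST_CONFIG | host_request_config
    # request_config["request_timeout"] == 60; every other key keeps its default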

--- web_novel_scraper/decode_guide/decode_guide.json
+++ web_novel_scraper/decode_guide/decode_guide.json
@@ -130,10 +130,10 @@
             }
         },
         "content": {
-            "element": "div#chr-content",
+            "element": null,
             "id": null,
             "class": null,
-            "selector": null,
+            "selector": "div#chr-content p",
             "attributes": null,
             "array": true
         },
@@ -160,6 +160,9 @@
     },
     {
         "host": "novelbin.com",
+        "request_config": {
+            "force_flaresolver": "true"
+        },
         "has_pagination": false,
         "title": {
             "element": "h2 a.chr-title",
@@ -295,5 +298,40 @@
                 "key": "href"
             }
         }
+    },
+    {
+        "host": "scribblehub.com",
+        "request_config": {
+            "force_flaresolver": "true",
+            "request_timeout": 60
+        },
+        "has_pagination": true,
+        "title": {
+            "selector": "div.chapter-title",
+            "extract": {
+                "type": "text"
+            }
+        },
+        "content": {
+            "selector": "div.chp_raw p",
+            "array": true
+        },
+        "index": {
+            "selector": "div.toc ol li a",
+            "array": true,
+            "inverted": true,
+            "extract": {
+                "type": "attr",
+                "key": "href"
+            }
+        },
+        "next_page": {
+            "selector": "div ul.simple-pagination li a.next",
+            "array": false,
+            "extract": {
+                "type": "attr",
+                "key": "href"
+            }
+        }
     }
 ]
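
Note: the scribblehub.com entry above is what the new Decoder.is_index_inverted() reads: it looks up the host's "index" object and returns its "inverted" flag, defaulting to False. A small sketch of that lookup against entries shaped like this file (entries trimmed to the relevant keys):

    DECODE_GUIDE = [
        {"host": "novelbin.com", "has_pagination": False},
        {"host": "scribblehub.com", "index": {"inverted": True}},
    ]

    def is_index_inverted(host_entry: dict) -> bool:
        # A missing "index" object or "inverted" key falls back to False
        return host_entry.get('index', {}).get('inverted', False)

    scribblehub = next(e for e in DECODE_GUIDE if e["host"] == "scribblehub.com")
    assert is_index_inverted(scribblehub) is True
    assert is_index_inverted(DECODE_GUIDE[0]) is False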

--- web_novel_scraper/file_manager.py
+++ web_novel_scraper/file_manager.py
@@ -45,10 +45,10 @@ class FileManager:
                  novel_config_dir: str = None,
                  read_only: bool = False):
         logger.debug(f'Initializing FileManager for novel: {novel_title}, read_only: {read_only}')
-        novel_base_dir = novel_base_dir if novel_base_dir else f'{
-            SCRAPER_BASE_DATA_DIR}/{novel_title}'
-        novel_config_dir = novel_config_dir if novel_config_dir else f'{
-            SCRAPER_BASE_CONFIG_DIR}/{novel_title}'
+        novel_base_dir = novel_base_dir if novel_base_dir else \
+            f'{SCRAPER_BASE_DATA_DIR}/{novel_title}'
+        novel_config_dir = novel_config_dir if novel_config_dir else \
+            f'{SCRAPER_BASE_CONFIG_DIR}/{novel_title}'
 
         logger.debug(f'Using base dir: {novel_base_dir}, config dir: {novel_config_dir}')
 
@@ -243,8 +243,7 @@ def _save_content_to_file(filepath: Path, content: str | dict, is_json: bool = F
     except (OSError, IOError) as e:
         logger.error(f'Error saving file "{filepath}": {e}')
     except Exception as e:
-        logger.error(f'Unexpected error saving file "{
-            filepath}": {e}', exc_info=True)
+        logger.error(f'Unexpected error saving file "{filepath}": {e}', exc_info=True)
 
 
 def _read_content_from_file(filepath: Path, bytes: bool = False) -> str:
@@ -263,8 +262,7 @@ def _read_content_from_file(filepath: Path, bytes: bool = False) -> str:
         logger.error(f'Error reading file "{filepath}": {e}')
     except Exception as e:
         # Log for unexpected errors
-        logger.error(f'Unexpected error reading file "{
-            filepath}": {e}', exc_info=True)
+        logger.error(f'Unexpected error reading file "{filepath}": {e}', exc_info=True)
 
 
 def _delete_file(filepath: Path) -> None:
@@ -280,8 +278,7 @@ def _delete_file(filepath: Path) -> None:
         logger.error(f'Error deleting file "{filepath}": {e}')
     except Exception as e:
         # Log any unexpected errors
-        logger.error(f'Unexpected error deleting file "{
-            filepath}": {e}', exc_info=True)
+        logger.error(f'Unexpected error deleting file "{filepath}": {e}', exc_info=True)
 
 
 def _copy_file(source: Path, destination: Path) -> bool:

--- web_novel_scraper/novel_scraper.py
+++ web_novel_scraper/novel_scraper.py
@@ -10,7 +10,7 @@ from .decode import Decoder
 from .file_manager import FileManager
 from . import utils
 
-from . import request_manager
+from .request_manager import get_html_content
 
 logger = logger_manager.create_logger('NOVEL SCRAPPING')
 
@@ -39,9 +39,11 @@ class Metadata:
         """
         Dynamic string representation of the configuration.
         """
-        attributes = [f"{field.name}={
-            getattr(self, field.name)}" for field in fields(self)]
-        return f"Metadata: \n{'\n'.join(attributes)}"
+        attributes = [(f"{field.name}="
+                       f"{getattr(self, field.name)}") for field in fields(self)]
+        attributes_str = '\n'.join(attributes)
+        return (f"Metadata: \n"
+                f"{attributes_str}")
 
 
 @dataclass_json
@@ -70,9 +72,11 @@ class ScraperBehavior:
         """
         Dynamic string representation of the configuration.
         """
-        attributes = [f"{field.name}={
-            getattr(self, field.name)}" for field in fields(self)]
-        return f"Scraper Behavior: \n{'\n'.join(attributes)}"
+        attributes = [(f"{field.name}="
+                       f"{getattr(self, field.name)}") for field in fields(self)]
+        attributes_str = '\n'.join(attributes)
+        return (f"Scraper Behavior: \n"
+                f"{attributes_str}")
 
 
 @dataclass_json(undefined=Undefined.EXCLUDE)
@@ -169,7 +173,9 @@ class Novel:
             f"TOC Info: {toc_info}",
             f"Host: {self.host}"
         ]
-        return f"Novel Info: \n{'\n'.join(attributes)}"
+        attributes_str = '\n'.join(attributes)
+        return (f"Novel Info: \n"
+                f"{attributes_str}")
 
     # NOVEL PARAMETERS MANAGEMENT
 
@@ -186,8 +192,7 @@ class Novel:
             self.metadata.tags.append(tag)
             self.save_novel()
             return True
-        logger.warning(f'Tag "{tag}" already exists on novel {
-            self.metadata.novel_title}')
+        logger.warning(f'Tag "{tag}" already exists on novel {self.metadata.novel_title}')
         return False
 
     def remove_tag(self, tag: str) -> bool:
@@ -195,8 +200,7 @@ class Novel:
             self.metadata.tags.remove(tag)
             self.save_novel()
             return True
-        logger.warning(f'Tag "{tag}" doesn\'t exist on novel {
-            self.metadata.novel_title}')
+        logger.warning(f'Tag "{tag}" doesn\'t exist on novel {self.metadata.novel_title}')
         return False
 
     def set_cover_image(self, cover_image_path: str) -> bool:
@@ -273,8 +277,16 @@ class Novel:
         if chapters_url_from_toc_content is None:
             logger.error('Chapters url not found on toc_content')
             return False
-        self.chapters_url_list = [*self.chapters_url_list,
-                                  *chapters_url_from_toc_content]
+        # First we save a list of lists in case we need to invert the order
+        self.chapters_url_list.append(chapters_url_from_toc_content)
+
+        invert = self.decoder.is_index_inverted()
+        self.chapters_url_list = [
+            chapter
+            for chapters_url in (self.chapters_url_list[::-1] if invert else self.chapters_url_list)
+            for chapter in chapters_url
+        ]
+
         if self.scraper_behavior.auto_add_host:
             self.chapters_url_list = [
                 f'https://{self.host}{chapter_url}' for chapter_url in self.chapters_url_list]
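
Note: the comprehension in the hunk above flattens the accumulated list of per-page URL lists, reversing the page order first when the host's index is marked as inverted. The same pattern in isolation (toy URLs, not package data):

    chapters_url_list = [['ch1', 'ch2'], ['ch3', 'ch4']]  # one sub-list per TOC page
    invert = True

    flattened = [
        chapter
        for chapters_url in (chapters_url_list[::-1] if invert else chapters_url_list)
        for chapter in chapters_url
    ]
    # invert=True  -> ['ch3', 'ch4', 'ch1', 'ch2']
    # invert=False -> ['ch1', 'ch2', 'ch3', 'ch4']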
@@ -298,11 +310,9 @@ class Novel:
         chapter_list = "Chapters List:\n"
         for i, chapter in enumerate(self.chapters):
             chapter_list += f"Chapter {i + 1}:\n"
-            chapter_list += f" Title: {
-                chapter.chapter_title if chapter.chapter_title else 'Title not yet scrapped'}\n"
+            chapter_list += f" Title: {chapter.chapter_title if chapter.chapter_title else 'Title not yet scrapped'}\n"
             chapter_list += f" URL: {chapter.chapter_url}\n"
-            chapter_list += f" Filename: {
-                chapter.chapter_html_filename if chapter.chapter_html_filename else 'File not yet requested'}\n"
+            chapter_list += f" Filename: {chapter.chapter_html_filename if chapter.chapter_html_filename else 'File not yet requested'}\n"
         return chapter_list
 
     def scrap_chapter(self, chapter_url: str = None, chapter_idx: int = None, update_html: bool = False) -> Chapter:
@@ -379,8 +389,7 @@ class Novel:
             chapter = self._get_chapter(
                 chapter=chapter, reload=update_html)
             if not chapter.chapter_html_filename:
-                logger.critical(f'Error requesting chapter {
-                    i} with url {chapter.chapter_url}')
+                logger.critical(f'Error requesting chapter {i} with url {chapter.chapter_url}')
                 return False
 
             self._add_or_update_chapter_data(chapter=chapter, link_idx=i,
@@ -402,16 +411,15 @@ class Novel:
         self.sync_toc()
 
         if start_chapter > len(self.chapters):
-            logger.info(f'The start chapter is bigger than the number of chapters saved ({
-                len(self.chapters)})')
+            logger.info(f'The start chapter is bigger than the number of chapters saved ({len(self.chapters)})')
             return
 
         if not end_chapter:
             end_chapter = len(self.chapters)
         elif end_chapter > len(self.chapters):
             end_chapter = len(self.chapters)
-            logger.info(f'The end chapter is bigger than the number of chapters, automatically setting it to {
-                end_chapter}.')
+            logger.info(f'The end chapter is bigger than the number of chapters, '
+                        f'automatically setting it to {end_chapter}.')
 
         idx = 1
         start = start_chapter
@@ -421,8 +429,8 @@ class Novel:
                                              end_chapter=end,
                                              collection_idx=idx)
             if not result:
-                logger.critical(f'Error with saving novel to epub, with start chapter: {
-                    start_chapter} and end chapter: {end_chapter}')
+                logger.critical(f'Error with saving novel to epub, with start chapter: '
+                                f'{start_chapter} and end chapter: {end_chapter}')
                 return False
             start = start + chapters_by_book
             idx = idx + 1
@@ -464,6 +472,16 @@ class Novel:
             toc = self.decoder.clean_html(toc, hard_clean=hard_clean)
             self.file_manager.update_toc(toc, i)
 
+    def _request_html_content(self, url: str) -> Optional[str]:
+        request_config = self.decoder.request_config
+        force_flaresolver = request_config.get('force_flaresolver') or self.scraper_behavior.force_flaresolver
+        html_content = get_html_content(url,
+                                        retries=request_config.get('request_retries'),
+                                        timeout=request_config.get('request_timeout'),
+                                        time_between_retries=request_config.get('request_time_between_retries'),
+                                        force_flaresolver=force_flaresolver)
+        return html_content
+
     def _get_chapter(self,
                      chapter: Chapter,
                      reload: bool = False) -> Chapter | None:
@@ -481,8 +499,7 @@ class Novel:
             return chapter
 
         # Fetch fresh content
-        chapter.chapter_html = request_manager.get_html_content(chapter.chapter_url,
-                                                                force_flaresolver=self.scraper_behavior.force_flaresolver)
+        chapter.chapter_html = self._request_html_content(chapter.chapter_url)
         if not chapter.chapter_html:
             logger.error(f'No content found on link {chapter.chapter_url}')
             return chapter
@@ -501,7 +518,11 @@ class Novel:
         if content:
             return content
 
-        content = request_manager.get_html_content(url)
+        if utils.check_incomplete_url(url):
+            url = self.toc_main_url + url
+
+        # Fetch fresh content
+        content = self._request_html_content(url)
         if not content:
             logger.warning(f'No content found on link {url}')
             sys.exit(1)
@@ -679,8 +700,7 @@ class Novel:
         idx_start = start_chapter - 1
         idx_end = end_chapter
         # We create the epub book
-        book_title = f'{self.metadata.novel_title} Chapters {
-            start_chapter} - {end_chapter}'
+        book_title = f'{self.metadata.novel_title} Chapters {start_chapter} - {end_chapter}'
         calibre_collection = None
         # If collection_idx is set, we create a calibre collection
         if collection_idx:
@@ -692,8 +712,7 @@ class Novel:
             book = self._add_chapter_to_epub_book(chapter=chapter,
                                                   book=book)
             if book is None:
-                logger.critical(f'Error saving epub {book_title}, could not decode chapter {
-                    chapter} using host {self.host}')
+                logger.critical(f'Error saving epub {book_title}, could not decode chapter {chapter} using host {self.host}')
                 return False
 
             book.add_item(epub.EpubNcx())

--- web_novel_scraper/request_manager.py
+++ web_novel_scraper/request_manager.py
@@ -4,6 +4,7 @@ from . import logger_manager
 from dotenv import load_dotenv
 import json
 import time
+from typing import Optional
 
 load_dotenv()
 
@@ -13,45 +14,52 @@ FORCE_FLARESOLVER = os.getenv('FORCE_FLARESOLVER', '0') == '1'
 
 logger = logger_manager.create_logger('GET HTML CONTENT')
 
-def get_request(url: str,
-                timeout: int = 20,
-                retries: int = 3,
-                time_between_retries: int = 1) -> requests.Response | None:
-    logger.debug(f'Starting get_request for {url} with timeout={timeout}, retries={retries}, time_between_retries={time_between_retries}')
+
+def _get_request(url: str,
+                 timeout: int,
+                 retries: int,
+                 time_between_retries: int) -> Optional[requests.Response]:
+    logger.debug(
+        f'Starting get_request for "{url}" with timeout={timeout}, '
+        f'retries={retries}, '
+        f'time_between_retries={time_between_retries}')
     for attempt in range(retries):
-        logger.debug(f'Attempt {attempt + 1} for {url}')
+        logger.debug(f'Attempt {attempt + 1} for "{url}"')
         try:
             response = requests.get(url, timeout=timeout)
             response.raise_for_status()
-            logger.debug(f'Successful response for {url} on attempt {attempt + 1}')
+            logger.debug(f'Successful response for "{url}" on attempt {attempt + 1}')
             return response
         except requests.exceptions.ConnectionError as e:
-            logger.error(f'Connection error ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'Connection error ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.Timeout as e:
-            logger.error(f'Request timed out ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'Request timed out ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.HTTPError as e:
-            logger.error(f'HTTP error ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'HTTP error ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.InvalidSchema as e:
-            logger.error(f'Invalid URL schema for "{url}": {e}')
+            logger.debug(f'Invalid URL schema for "{url}": {e}')
             break  # Don't retry on invalid schema
         except requests.exceptions.RequestException as e:
-            logger.error(f'Request failed ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'Request failed ({attempt + 1}/{retries}): {e}')
 
         if attempt < retries - 1:
             logger.debug(f'Waiting {time_between_retries} seconds before retrying')
             time.sleep(time_between_retries)  # Wait before retrying
-    logger.debug(f'Failed to get a successful response for {url} after {retries} attempts')
+    logger.warning(f'Failed to get a successful response for "{url}" after {retries} attempts using common HTTP Request')
     return None
 
 
-def get_request_flaresolver(url: str,
-                            timeout: int = 20,
-                            flaresolver_url: str = FLARESOLVER_URL,
-                            retries: int = 3,
-                            time_between_retries: int = 1) -> requests.Response | None:
-    logger.debug(f'Starting get_request_flaresolver for {url} with timeout={timeout}, retries={retries}, time_between_retries={time_between_retries}')
+def _get_request_flaresolver(url: str,
+                             timeout: int,
+                             retries: int,
+                             time_between_retries: int,
+                             flaresolver_url: str) -> Optional[requests.Response]:
+    logger.debug(
+        f'Starting get_request_flaresolver for "{url}" with timeout={timeout}, '
+        f'retries={retries}, '
+        f'time_between_retries={time_between_retries}')
     for attempt in range(retries):
-        logger.debug(f'Attempt {attempt + 1} for {url} using FlareSolver')
+        logger.debug(f'Attempt {attempt + 1} for "{url}" using FlareSolver')
         try:
             response = requests.post(
                 flaresolver_url,
@@ -64,71 +72,76 @@ def get_request_flaresolver(url:
                 timeout=timeout
             )
             response.raise_for_status()
-            logger.debug(f'Successful response for {url} on attempt {attempt + 1} using FlareSolver')
+            logger.debug(f'Successful response for "{url}" on attempt {attempt + 1} using FlareSolver')
             return response
 
         except requests.exceptions.ConnectionError as e:
-            logger.error(f'Connection error ({attempt + 1}/{retries}), check FlareSolver host: {flaresolver_url}: {e}')
+            logger.warning(f'Connection error with flaresolver (URL: "{flaresolver_url}"): {e}')
+            logger.warning(f'If the url is incorrect, set the env variable "FLARESOLVER_URL" to the correct value')
+            logger.warning('If FlareSolver is not installed in your machine, consider installing it.')
+            break  # Don't retry on Connection Error
         except requests.exceptions.Timeout as e:
-            logger.error(f'Request timed out ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'Request timed out ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.InvalidSchema as e:
-            logger.error(f'Invalid FlareSolver URL schema "{flaresolver_url}": {e}')
+            logger.debug(f'Invalid FlareSolver URL schema "{flaresolver_url}": {e}')
             break  # Don't retry on invalid schema
         except requests.exceptions.HTTPError as e:
-            logger.error(f'HTTP error ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'HTTP error ({attempt + 1}/{retries}): {e}')
         except requests.exceptions.RequestException as e:
-            logger.error(f'Request failed ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'Request failed ({attempt + 1}/{retries}): {e}')
         except json.JSONDecodeError as e:
-            logger.error(f'Invalid JSON response ({attempt + 1}/{retries}): {e}')
+            logger.debug(f'Invalid JSON response ({attempt + 1}/{retries}): {e}')
 
         if attempt < retries - 1:
             logger.debug(f'Waiting {time_between_retries} seconds before retrying')
             time.sleep(time_between_retries)  # Wait before retrying
-    logger.debug(f'Failed to get a successful response for {url} using FlareSolver after {retries} attempts')
+
+    logger.warning(f'Failed to get a successful response for "{url}" using FlareSolver after {retries} attempts')
     return None
 
 
 def get_html_content(url: str,
-                     retries: int = 5,
-                     flaresolver: bool = True,
+                     retries: int = 3,
+                     timeout: int = 20,
+                     time_between_retries: int = 3,
                      flaresolver_url: str = FLARESOLVER_URL,
-                     time_between_retries: int = 1,
-                     force_flaresolver: bool = FORCE_FLARESOLVER) -> str | None:
-    logger.debug(f'Starting get_html_content for {url} with retries={retries}, flaresolver={flaresolver}, flaresolver_url={flaresolver_url}, time_between_retries={time_between_retries}, force_flaresolver={force_flaresolver}')
+                     force_flaresolver: bool = FORCE_FLARESOLVER) -> Optional[str]:
+    logger.debug(
+        f'Requesting HTML Content for "{url}" with '
+        f'retries: "{retries}", '
+        f'timeout: "{timeout}", '
+        f'time between retries: "{time_between_retries}"')
+    if force_flaresolver:
+        logger.debug('Will directly try with FlareSolver')
+
     # First try with common HTTP request
     if not force_flaresolver:
-        response = get_request(
-            url, timeout=20, retries=retries, time_between_retries=time_between_retries)
-        if not response:
-            logger.warning(f'Failed to get response from {url} using common HTTP request')
-        elif not response.ok:
-            logger.warning(f'Response with errors from {url} using common HTTP request')
-        else:
-            logger.debug(f'Successfully retrieved HTML content from {url} using common HTTP request')
+        response = _get_request(url,
+                                timeout=timeout,
+                                retries=retries,
+                                time_between_retries=time_between_retries)
+        if response and response.ok:
+            logger.debug(f'Successfully retrieved HTML content from "{url}" using common HTTP request')
             return response.text
 
-    # If flaresolver is disabled, return None
-    if not flaresolver:
-        logger.debug(f'Flaresolver is disabled, returning None for {url}')
-        return None
-
     # Try with Flaresolver
-    logger.debug(f'Trying with Flaresolver for {url}')
-    response = get_request_flaresolver(
-        url, timeout=20, flaresolver_url=flaresolver_url, time_between_retries=time_between_retries)
-    if not response:
-        logger.critical(f'Failed to get response from {url} using FlareSolver')
-        return None
-    if not response.ok:
-        logger.critical(f'Response with errors from {url} using FlareSolver')
+    logger.debug(f'Trying with Flaresolver for "{url}"')
+    response = _get_request_flaresolver(url,
+                                        timeout=timeout,
+                                        retries=retries,
+                                        time_between_retries=time_between_retries,
+                                        flaresolver_url=flaresolver_url)
+    if not response or not response.ok:
+        logger.warning(f'Failed all attempts to get HTML content from "{url}')
        return None
 
     response_json = response.json()
     if 'solution' not in response_json:
-        logger.critical(f'No solution found in FlareSolver response for {url}')
+        logger.warning(f'No solution found in FlareSolver response for "{url}"')
         return None
     if 'response' not in response_json['solution']:
-        logger.critical(f'No response found in FlareSolver solution for {url}')
+        logger.warning(f'No response found in FlareSolver solution for "{url}"')
         return None
-    logger.debug(f'Successfully retrieved HTML content from {url} using FlareSolver')
+
+    logger.debug(f'Successfully retrieved HTML content from "{url}" using FlareSolver')
     return response_json['solution']['response']
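
Note: get_html_content now takes the retry and timeout values as explicit keyword arguments and falls back to FlareSolver only after the plain request fails (or immediately when force_flaresolver is set). A minimal usage sketch with placeholder values, not taken from the package's own code:

    from web_novel_scraper.request_manager import get_html_content

    html = get_html_content('https://example.com/chapter-1',
                            retries=3,
                            timeout=20,
                            time_between_retries=3,
                            force_flaresolver=False)
    if html is None:
        print('Both the plain request and the FlareSolver fallback failed')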

--- web_novel_scraper/utils.py
+++ web_novel_scraper/utils.py
@@ -64,3 +64,10 @@ def check_exclusive_params(param1: any, param2: any) -> bool:
 
 def create_volume_id(n: int):
     return f'v{n:02}'
+
+def check_incomplete_url(url: str) -> bool:
+    if url.startswith('?') or url.startswith('#'):
+        return True
+
+    parsed = urlparse(url)
+    return not parsed.scheme or not parsed.netloc
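
Note: check_incomplete_url flags query-only, fragment-only, and scheme- or host-less strings, which is what lets the scraper prefix them with the novel's TOC URL before requesting. Expected behaviour is sketched below; the explicit urlparse import is for the sketch only, in utils.py the name is assumed to already be available at module level:

    from urllib.parse import urlparse

    def check_incomplete_url(url: str) -> bool:
        if url.startswith('?') or url.startswith('#'):
            return True

        parsed = urlparse(url)
        return not parsed.scheme or not parsed.netloc

    assert check_incomplete_url('?page=2')                    # query string only
    assert check_incomplete_url('/novel/chapter-1')           # path without scheme/host
    assert not check_incomplete_url('https://example.com/a')  # complete URL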

--- web_novel_scraper/version.py
+++ web_novel_scraper/version.py
@@ -1 +1 @@
-__version__ = "1.0.4"
+__version__ = "1.1.1"

--- web_novel_scraper-1.0.4.dist-info/METADATA
+++ web_novel_scraper-1.1.1.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: web-novel-scraper
-Version: 1.0.4
+Version: 1.1.1
 Summary: Python tool that allows you to scrape web novels from various sources and save them to more readable formats like EPUB.
 Project-URL: Homepage, https://github.com/ImagineBrkr/web-novel-scraper
 Project-URL: Documentation, https://web-novel-scraper.readthedocs.io

--- /dev/null
+++ web_novel_scraper-1.1.1.dist-info/RECORD
@@ -0,0 +1,18 @@
+web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+web_novel_scraper/__main__.py,sha256=OQQVX5CttmAkUwdrnjBSjKPaoh_boUI2ysHi3rLGOSs,17769
+web_novel_scraper/decode.py,sha256=U-78PhJ4SU2hiUmfAWeWGEBJ3YSoCW3Lupw9cUqQuI0,11013
+web_novel_scraper/file_manager.py,sha256=qAqgqtaRb7QyVtyEOW2cMhPYWdKM6nJ69weUCYKwVtM,11862
+web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
+web_novel_scraper/novel_scraper.py,sha256=DsYnY15s8cZZ2w8pRvmD3_NJw54xarhcnEQdvnTD8XI,29421
+web_novel_scraper/request_manager.py,sha256=WU8LG6D_fqmDapX6wpVwpQQSItcNU8Qb9dMAlLCYI8U,6621
+web_novel_scraper/utils.py,sha256=dPp7D2ji9mC2nFydqxsJ_9vkAntA_3VTt8ZmG-F1f78,2270
+web_novel_scraper/version.py,sha256=q8_5C0f-8mHWNb6mMw02zlYPnEGXBqvOmP3z0CEwZKM,22
+web_novel_scraper/custom_processor/__init__.py,sha256=iy4tjivMjshSzc52--aa-jK53qu9VwdK-6p4vuQc6oc,103
+web_novel_scraper/custom_processor/custom_processor.py,sha256=h1MPl6JU_C2Mc7SqK70LsNQHpDzSL6QyraMIQ87HcMM,870
+web_novel_scraper/custom_processor/sites/genesis.py,sha256=xV0eybI0ieHR5gn4yWXI74l99Eayhqs16PIYs-BrPjE,1843
+web_novel_scraper/custom_processor/sites/royalroad.py,sha256=_2PsFC_w3RJCUkAPoRn-7R2jlzl3XsG4WYtRaQkp6lg,787
+web_novel_scraper/decode_guide/decode_guide.json,sha256=gNVencLtK0HmZPlubTm1wA7eatWADCxJ_LCOYWHWuA0,8556
+web_novel_scraper-1.1.1.dist-info/METADATA,sha256=ow5piBhzzo4mZ0secvHrqc4KCCt4VInpDa09Qo9l4AE,8423
+web_novel_scraper-1.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+web_novel_scraper-1.1.1.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
+web_novel_scraper-1.1.1.dist-info/RECORD,,

--- web_novel_scraper-1.0.4.dist-info/RECORD
+++ /dev/null
@@ -1,18 +0,0 @@
-web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-web_novel_scraper/__main__.py,sha256=OQQVX5CttmAkUwdrnjBSjKPaoh_boUI2ysHi3rLGOSs,17769
-web_novel_scraper/decode.py,sha256=0RMHx1buR01KhuXiVQwdSpCGN960Xh-iPw1eYHxLeDg,10181
-web_novel_scraper/file_manager.py,sha256=Q3DH-c8fWz9sziMps7A3p_sQoDMEpqBket07Agh-__Q,11898
-web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
-web_novel_scraper/novel_scraper.py,sha256=Notk0O94HZrO-MVKDGCBL0VopApFchn13FO2_N3ZfRM,28418
-web_novel_scraper/request_manager.py,sha256=VtGpRi5b_Dp3h8viCdt7yMCb9M21Lk7oLC_Q_0EkXH8,6448
-web_novel_scraper/utils.py,sha256=vq5ROuPv04k3MhbksTe0ni_yP0i_a7T_33mkBB1DUbQ,2076
-web_novel_scraper/version.py,sha256=acuR_XSJzp4OrQ5T8-Ac5gYe48mUwObuwjRmisFmZ7k,22
-web_novel_scraper/custom_processor/__init__.py,sha256=iy4tjivMjshSzc52--aa-jK53qu9VwdK-6p4vuQc6oc,103
-web_novel_scraper/custom_processor/custom_processor.py,sha256=h1MPl6JU_C2Mc7SqK70LsNQHpDzSL6QyraMIQ87HcMM,870
-web_novel_scraper/custom_processor/sites/genesis.py,sha256=xV0eybI0ieHR5gn4yWXI74l99Eayhqs16PIYs-BrPjE,1843
-web_novel_scraper/custom_processor/sites/royalroad.py,sha256=_2PsFC_w3RJCUkAPoRn-7R2jlzl3XsG4WYtRaQkp6lg,787
-web_novel_scraper/decode_guide/decode_guide.json,sha256=IBBzbSSVO-yQ5PCY7o8ralnaonMwBpEZW1v1TStiVqc,7582
-web_novel_scraper-1.0.4.dist-info/METADATA,sha256=IhvDqK_Gz1POjzbH2cQVUYql1dhZJvdHnM9R--le0uc,8423
-web_novel_scraper-1.0.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-web_novel_scraper-1.0.4.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
-web_novel_scraper-1.0.4.dist-info/RECORD,,