web-novel-scraper 1.0.3__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -52,7 +52,7 @@ def validate_date(ctx, param, value):
 
 # COMMON ARGUMENTS
 title_option = click.option(
-    '-t', '--title', type=str, required=True, help='Title of the novel, this serves as the identifier.')
+    '-t', '--title', type=str, required=True, envvar='SCRAPER_NOVEL_TITLE', help='Title of the novel, this serves as the identifier.')
 novel_base_dir_option = click.option(
     '-nb', '--novel-base-dir', type=str, help='Alternative base directory for the novel files.')
 
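
Note: the only change to this option block is the envvar fallback on --title. With envvar='SCRAPER_NOVEL_TITLE', Click reads the value from that environment variable whenever the flag is omitted, which also satisfies required=True. A minimal standalone sketch (the demo command below is invented, not the package's CLI):

    import click
    from click.testing import CliRunner

    @click.command()
    @click.option('-t', '--title', type=str, required=True, envvar='SCRAPER_NOVEL_TITLE')
    def show_title(title):
        click.echo(title)

    runner = CliRunner()
    result = runner.invoke(show_title, [], env={'SCRAPER_NOVEL_TITLE': 'My Novel'})
    print(result.output)  # My Novel
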
@@ -330,19 +330,25 @@ def show_toc(title, novel_base_dir):
 @click.option('--update-html', is_flag=True, default=False, show_default=True, help='If the chapter HTML is saved, it will be updated.')
 def scrap_chapter(title, novel_base_dir, chapter_url, chapter_num, update_html):
     """Scrap a chapter of a novel."""
+    if (chapter_url is None and chapter_num is None) or (chapter_url and chapter_num):
+        raise click.UsageError("You must set exactly one: --chapter-url or --chapter-num.")
+
     novel = obtain_novel(title, novel_base_dir)
-    if not chapter_url and not chapter_num:
-        click.echo('Chapter URL or chapter number should be set.', err=True)
-    if chapter_num and chapter_url:
-        click.echo('It should be either chapter URL or chapter number.', err=True)
-    if chapter_num <= 0 or chapter_num > len(novel.chapters):
-        raise click.BadParameter(
-            'Chapter number should be positive and an existing chapter.', param_hint='--chapter-num')
-    chapter = novel.scrap_chapter(
-        chapter_url=chapter_url, chapter_idx=chapter_num - 1, update_html=update_html)
+
+    if chapter_num is not None:
+        if chapter_num <= 0 or chapter_num > len(novel.chapters):
+            raise click.BadParameter(
+                'Chapter number should be positive and an existing chapter.', param_hint='--chapter-num')
+        chapter = novel.scrap_chapter(chapter_idx=chapter_num - 1,
+                                      update_html=update_html)
+
+    else:
+        chapter = novel.scrap_chapter(chapter_url=chapter_url,
+                                      update_html=update_html)
+
     if not chapter:
-        click.echo('Chapter number or URL not found.', err=True)
-        return
+        raise click.ClickException('Chapter not found or scrap failed.')
+
     click.echo(chapter)
     click.echo('Content:')
     click.echo(chapter.chapter_content)
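
The rewritten scrap_chapter validates its options up front: exactly one of --chapter-url / --chapter-num must be given (UsageError otherwise), and a failed scrape now raises ClickException instead of printing and returning. A self-contained sketch of the same "exactly one of two options" check (command name and output below are illustrative only):

    import click
    from click.testing import CliRunner

    @click.command()
    @click.option('--chapter-url', type=str)
    @click.option('--chapter-num', type=int)
    def demo(chapter_url, chapter_num):
        if (chapter_url is None and chapter_num is None) or (chapter_url and chapter_num):
            raise click.UsageError('You must set exactly one: --chapter-url or --chapter-num.')
        click.echo(f'ok: {chapter_url or chapter_num}')

    runner = CliRunner()
    print(runner.invoke(demo, []).output)                      # usage error, exit code 2
    print(runner.invoke(demo, ['--chapter-num', '3']).output)  # ok: 3
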
@@ -0,0 +1,2 @@
+from .custom_processor import CustomProcessor, ProcessorRegistry
+from .sites import royalroad, genesis
@@ -0,0 +1,25 @@
+from abc import ABC, abstractmethod
+from typing import Any, Dict
+
+class CustomProcessor(ABC):
+    @abstractmethod
+    def process(self, html: str) -> Any:
+        """Process the HTML content using custom logic"""
+        pass
+
+class ProcessorRegistry:
+    _processors: Dict[str, Dict[str, CustomProcessor]] = {}
+
+    @classmethod
+    def register(cls, host: str, content_type: str, processor: CustomProcessor):
+        if host not in cls._processors:
+            cls._processors[host] = {}
+        cls._processors[host][content_type] = processor
+
+    @classmethod
+    def get_processor(cls, host: str, content_type: str) -> CustomProcessor:
+        return cls._processors.get(host, {}).get(content_type)
+
+    @classmethod
+    def has_processor(cls, host: str, content_type: str) -> bool:
+        return bool(cls.get_processor(host, content_type))
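
This new module is the extension point for site-specific parsing: subclasses implement process(), one instance is registered per (host, content_type) pair at import time, and the decoder checks the registry before falling back to its generic BeautifulSoup path. A minimal sketch of the intended usage (the host and processor below are hypothetical; the import path assumes the wheel's package layout):

    from web_novel_scraper.custom_processor import CustomProcessor, ProcessorRegistry

    class ExampleTitleProcessor(CustomProcessor):
        # Hypothetical processor: pulls the first <h1> text out of a page.
        def process(self, html: str):
            start, end = html.find('<h1>'), html.find('</h1>')
            return html[start + 4:end] if start != -1 and end != -1 else None

    # Registration is a module-level side effect, exactly like the bundled sites modules.
    ProcessorRegistry.register('example.com', 'title', ExampleTitleProcessor())

    if ProcessorRegistry.has_processor('example.com', 'title'):
        processor = ProcessorRegistry.get_processor('example.com', 'title')
        print(processor.process('<html><h1>Chapter 1</h1></html>'))  # Chapter 1
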
@@ -0,0 +1,46 @@
+import re
+import json
+from typing import List, Optional
+from ..custom_processor import CustomProcessor, ProcessorRegistry
+
+GENESIS_STUDIO_VIEWER_URL = 'https://genesistudio.com/viewer'
+
+class GenesisChaptersProcessor(CustomProcessor):
+    def process(self, html: str) -> Optional[List[dict]]:
+        pattern = r',chapters:\s*{\s*free:\s*(\[.*?"}}])'
+        match = re.search(pattern, html, re.DOTALL)
+
+        if not match:
+            if not match:
+                return None
+
+        try:
+            chapters_json = match.group(1).strip()
+            replaces = {
+                "chapter_title:": '"chapter_tile":',
+                "id:": '"id":',
+                "nsfw:": '"nsfw":',
+                "required_tier:": '"required_tier":',
+                "date_created:": '"date_created":',
+                "spoiler_title:": '"spoiler_title":',
+                "chapter_number:": '"chapter_number":',
+                "novel:": '"novel":',
+            }
+            # Ensure the JSON string ends properly
+            if not chapters_json.endswith(']'):
+                chapters_json += ']'
+            for old_key, new_key in replaces.items():
+                chapters_json = chapters_json.replace(old_key, new_key)
+            # print(f"Extracted JSON: {chapters_json[12200:12300]}")  # Debug print
+            chapters = json.loads(chapters_json)
+            chapters_url = []
+            for chapter in chapters:
+                chapters_url.append(f"{GENESIS_STUDIO_VIEWER_URL}/{chapter['id']}")
+            print(chapters)
+            return chapters_url
+
+        except (json.JSONDecodeError, IndexError) as e:
+            print(f"Error processing JSON: {str(e)}")
+            return None
+
+ProcessorRegistry.register('genesistudio.com', 'index', GenesisChaptersProcessor())
@@ -0,0 +1,22 @@
+import re
+import json
+from typing import List, Optional
+from ..custom_processor import CustomProcessor, ProcessorRegistry
+
+class RoyalRoadChaptersProcessor(CustomProcessor):
+    def process(self, html: str) -> Optional[List[dict]]:
+        pattern = r'window\.chapters\s*=\s*(\[.*?\]);'
+        match = re.search(pattern, html, re.DOTALL)
+
+        if not match:
+            return None
+
+        try:
+            chapters_json = match.group(1)
+            chapters = json.loads(chapters_json)
+            chapters = [chapter['url'] for chapter in chapters if 'url' in chapter]
+            return chapters
+        except (json.JSONDecodeError, IndexError):
+            return None
+
+ProcessorRegistry.register('www.royalroad.com', 'index', RoyalRoadChaptersProcessor())
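
A quick check of the regex-plus-JSON approach this processor uses; the HTML snippet is invented (real RoyalRoad pages embed a much larger window.chapters array), and the import path assumes the wheel's package layout:

    from web_novel_scraper.custom_processor.sites.royalroad import RoyalRoadChaptersProcessor

    html = ('<script>window.chapters = [{"id": 1, "url": "/fiction/123/chapter/1"}, '
            '{"id": 2, "url": "/fiction/123/chapter/2"}];</script>')
    print(RoyalRoadChaptersProcessor().process(html))
    # ['/fiction/123/chapter/1', '/fiction/123/chapter/2']
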
@@ -1,8 +1,10 @@
 import os
 import json
 from pathlib import Path
+from typing import Optional
 
 from . import logger_manager
+from .custom_processor.custom_processor import ProcessorRegistry
 
 from bs4 import BeautifulSoup
 
@@ -41,17 +43,92 @@ class Decoder:
         self.decode_guide = self._get_element_by_key(
             DECODE_GUIDE, 'host', host)
 
-    def decode_html(self, html: str, content_type: str):
+    def get_chapter_urls(self, html: str) -> list[str]:
+        logger.debug('Obtaining chapter URLs...')
+        chapter_urls = self.decode_html(html, 'index')
+
+        if chapter_urls is None:
+            logger.critical(f"Failed to obtain chapter URLs for {self.host}")
+            raise ValueError(f"Failed to obtain chapter URLs for {self.host}")
+
+        if isinstance(chapter_urls, str):
+            logger.warning('When obtaining chapter urls, obtained a String but expected a List')
+            logger.warning('Check decode config')
+            chapter_urls = [chapter_urls]
+
+        return chapter_urls
+
+    def get_toc_next_page_url(self, html: str) -> Optional[str]:
+        logger.debug('Obtaining toc next page URL...')
+        toc_next_page_url = self.decode_html(html, 'next_page')
+        if toc_next_page_url is None:
+            logger.debug('No next page URL found, assuming last page...')
+            return None
+        return toc_next_page_url
+
+    def get_chapter_title(self, html: str) -> Optional[str]:
+        logger.debug('Obtaining chapter title...')
+        chapter_title = self.decode_html(html, 'title')
+        if chapter_title is None:
+            logger.debug(f'No chapter_title found.')
+        return chapter_title
+
+    def get_chapter_content(self, html: str, save_title_to_content: bool, chapter_title: str) -> str:
+        logger.debug('Obtaining chapter content...')
+        full_chapter_content = ""
+        chapter_content = self.decode_html(html, 'content')
+
+        if chapter_content is None:
+            logger.critical('No content found on chapter')
+            raise ValueError('No content found on chapter')
+
+        if save_title_to_content:
+            logger.debug('Saving chapter title to content...')
+            full_chapter_content += f'<h4>{chapter_title}</h4>'
+
+        if isinstance(chapter_content, list):
+            logger.debug(f'{len(chapter_content)} paragraphs found in chapter')
+            logger.debug('Converting list of paragraphs to a single string')
+            for paragraph in chapter_content:
+                full_chapter_content += str(paragraph)
+        else:
+            logger.debug('Chapter content is not a list, no conversion made')
+            full_chapter_content += str(chapter_content)
+        return full_chapter_content
+
+    def decode_html(self, html: str, content_type: str) -> str | list[str] | None:
+        logger.debug(f'Decoding HTML...')
+        logger.debug(f'Content type: {content_type}')
+        logger.debug(f'Decode guide: {DECODE_GUIDE_FILE}')
+        logger.debug(f'Host: {self.host}')
         if not content_type in self.decode_guide:
-            logger.error(f'{content_type} key does not exists on decode guide {
-                DECODE_GUIDE_FILE} for host {self.host}')
-            return
-        soup = BeautifulSoup(html, 'html.parser')
+            logger.critical(f'{content_type} key does not exist on decode guide {DECODE_GUIDE_FILE} '
+                            f'for host {self.host}')
+            raise ValueError(f'{content_type} key does not exist on decode guide {DECODE_GUIDE_FILE} '
+                             f'for host {self.host}')
+
+        if ProcessorRegistry.has_processor(self.host, content_type):
+            logger.debug(f'Host {self.host} will use a custom processor')
+            processor = ProcessorRegistry.get_processor(self.host, content_type)
+            return processor.process(html)
+
+        logger.debug('Starting HTML parsing...')
+        try:
+            soup = BeautifulSoup(html, 'html.parser')
+        except Exception as e:
+            logger.critical(f'Error parsing HTML with BeautifulSoup: {e}')
+            raise ValueError(f'Error parsing HTML with BeautifulSoup: {e}')
+
         decoder = self.decode_guide[content_type]
         elements = self._find_elements(soup, decoder)
         if not elements:
-            logger.warning(f'{content_type} not found on html using {
-                DECODE_GUIDE_FILE} for host {self.host}')
+            logger.warning(f'{content_type} not found on html using {DECODE_GUIDE_FILE} '
+                           f'for host {self.host}')
+
+        # Investigate this conditional
+        if content_type == 'title' and isinstance(elements, list):
+            logger.debug('Joining titles...')
+            return ' '.join(elements)
         return elements
 
     def has_pagination(self, host: str = None):
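
decode_html is now wrapped by intent-specific helpers that either return a usable value or raise. A hedged sketch of how a caller might chain them (the helper function and its arguments are assumptions for illustration, not package API documentation):

    from web_novel_scraper.decode import Decoder

    def decode_chapter_page(host: str, chapter_html: str, save_title: bool = True) -> str:
        decoder = Decoder(host)  # loads the decode guide entry for this host
        title = decoder.get_chapter_title(chapter_html)  # may be None if no title rule matches
        # get_chapter_content raises ValueError when the content rule finds nothing
        return decoder.get_chapter_content(chapter_html, save_title, title or 'Untitled chapter')
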
@@ -81,8 +158,11 @@ class Decoder:
 
         return "\n".join([line.strip() for line in str(soup).splitlines() if line.strip()])
 
-    def _find_elements(self, soup: BeautifulSoup, decoder: dict):
+    @staticmethod
+    def _find_elements(soup: BeautifulSoup, decoder: dict):
+        logger.debug('Finding elements...')
         selector = decoder.get('selector')
+        elements = []
         if selector is None:
             selector = ''
         element = decoder.get('element')
@@ -91,32 +171,46 @@ class Decoder:
         attributes = decoder.get('attributes')
 
         if element:
+            logger.debug(f'Using element "{element}"')
             selector += element
             if _id:
+                logger.debug(f'Using id "{_id}"')
                 selector += f'#{_id}'
             if _class:
+                logger.debug(f'Using class "{_class}"')
                 selector += f'.{_class}'
             if attributes:
                 for attr, value in attributes.items():
-                    selector += f'[{attr}="{value}"]' if value else f'[{attr}]'
+                    logger.debug(f'Using attribute "{attr}"')
+                    if value is not None:
+                        logger.debug(f'With value "{value}"')
+                        selector += f'[{attr}="{value}"]'
+                    else:
+                        selector += f'[{attr}]'
             selectors = [selector]
         else:
+            logger.debug(f'Using selector "{selector}"')
             if XOR_SEPARATOR in selector:
+                logger.debug(f'Found XOR_OPERATOR "{XOR_SEPARATOR}" in selector')
+                logger.debug('Splitting selectors...')
                 selectors = selector.split(XOR_SEPARATOR)
             else:
                 selectors = [selector]
 
         for selector in selectors:
-            logger.debug(f'Attempt using selector {selector}')
+            logger.debug(f'Searching using selector "{selector}"...')
             elements = soup.select(selector)
             if elements:
-                logger.debug(f'{len(elements)} found using selector {selector}')
+                logger.debug(f'{len(elements)} found using selector "{selector}"')
                 break
+            logger.debug(f'No elements found using selector "{selector}"')
 
         extract = decoder.get('extract')
         if extract:
+            logger.debug(f'Extracting from elements...')
             if extract["type"] == "attr":
                 attr_key = extract["key"]
+                logger.debug(f'Extracting value from attribute "{attr_key}"...')
                 elements_aux = elements
                 elements = []
                 for element in elements_aux:
@@ -125,15 +219,34 @@ class Decoder:
                         if attr:
                             elements.append(attr)
                     except KeyError:
+                        logger.debug(f'Attribute "{attr_key}" not found')
+                        logger.debug('Ignoring...')
                         pass
+                logger.debug(f'{len(elements)} elements found using attribute "{attr_key}"')
             if extract["type"] == "text":
+                logger.debug('Extracting text from elements...')
                 elements = [element.string for element in elements]
+
+        if not elements:
+            logger.error('No elements found, returning "None"')
+            return None
+
         inverted = decoder.get('inverted')
         if inverted:
+            logger.debug('Inverted option activated')
+            logger.debug('Inverting elements order...')
             elements = elements[::-1]
-        return elements if decoder.get('array') else elements[0] if elements else None
 
-    def _get_element_by_key(self, json_data, key, value):
+        if decoder.get('array'):
+            logger.debug('Array option activated')
+            logger.debug('Returning elements as a list')
+            return elements
+        logger.debug('Array option not activated')
+        logger.debug('Returning only first element...')
+        return elements[0]
+
+    @staticmethod
+    def _get_element_by_key(json_data, key, value):
         for item in json_data:
             if item[key] == value:
                 return item
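
Outside the class, the selector-building logic above boils down to: turn a decode-guide entry into a CSS selector, run soup.select, then optionally pull out an attribute. A rough standalone approximation (not the package's own code path; the guide entry and HTML are invented):

    from bs4 import BeautifulSoup

    guide = {"element": "a", "class": "chapter-link", "array": True,
             "extract": {"type": "attr", "key": "href"}}
    html = ('<ul><li><a class="chapter-link" href="/c/1">1</a></li>'
            '<li><a class="chapter-link" href="/c/2">2</a></li></ul>')

    selector = guide["element"] + (f'.{guide["class"]}' if guide.get("class") else '')
    elements = BeautifulSoup(html, 'html.parser').select(selector)
    if guide.get("extract", {}).get("type") == "attr":
        key = guide["extract"]["key"]
        elements = [el[key] for el in elements if el.has_attr(key)]
    print(elements)  # ['/c/1', '/c/2']
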
@@ -24,7 +24,11 @@
             "class": null,
             "selector": null,
             "attributes": null,
-            "array": true
+            "array": true,
+            "extract": {
+                "type": "attr",
+                "key": "href"
+            }
         },
         "next_page": {
             "element": "p",
@@ -60,7 +64,11 @@
             "class": null,
             "selector": "div.m-newest2 ul li a",
             "attributes": null,
-            "array": true
+            "array": true,
+            "extract": {
+                "type": "attr",
+                "key": "href"
+            }
         },
         "next_page": {
             "element": null,
@@ -72,7 +80,7 @@
         }
     },
     {
-        "host": "royalroad.com",
+        "host": "www.royalroad.com",
         "has_pagination": false,
         "title": {
             "element": null,
@@ -95,12 +103,7 @@
             "array": true
         },
         "index": {
-            "element": null,
-            "id": null,
-            "class": null,
-            "selector": "tr.chapter-row td a",
-            "attributes": null,
-            "array": true
+            "use_custom_processor": true
         },
         "next_page": {
             "element": null,
@@ -140,7 +143,11 @@
             "class": null,
             "selector": "ul.list-chapter li a",
             "attributes": null,
-            "array": true
+            "array": true,
+            "extract": {
+                "type": "attr",
+                "key": "href"
+            }
         },
         "next_page": {
             "element": null,
@@ -167,10 +174,10 @@
             }
         },
         "content": {
-            "element": "div#chr-content",
+            "element": null,
             "id": null,
             "class": null,
-            "selector": null,
+            "selector": "div#chr-content p",
             "attributes": null,
             "array": true
         },
@@ -180,7 +187,11 @@
             "class": null,
             "selector": "ul.list-chapter li a",
             "attributes": null,
-            "array": true
+            "array": true,
+            "extract": {
+                "type": "attr",
+                "key": "href"
+            }
         },
         "next_page": {
             "element": null,
@@ -207,7 +218,82 @@
         "index": {
             "element": "ul.main li a",
             "array": true,
-            "inverted": true
+            "inverted": true,
+            "extract": {
+                "type": "attr",
+                "key": "href"
+            }
+        }
+    },
+    {
+        "host": "genesistudio.com",
+        "has_pagination": false,
+        "title": {
+            "element": null,
+            "id": null,
+            "class": null,
+            "selector": "p.leading-none span",
+            "attributes": null,
+            "array": true,
+            "extract": {
+                "type": "text",
+                "key": "text"
+            }
+        },
+        "content": {
+            "element": "p",
+            "id": null,
+            "class": "narration",
+            "selector": null,
+            "attributes": null,
+            "array": true,
+            "extract": {
+                "type": "text",
+                "key": "text"
+            }
+        },
+        "index": {
+            "use_custom_processor": true
+        },
+        "next_page": {
+            "element": null,
+            "id": null,
+            "class": null,
+            "selector": null,
+            "attributes": null,
+            "array": true
+        }
+    },
+    {
+        "host": "hostednovel.com",
+        "has_pagination": true,
+        "title": {
+            "selector": "span#chapter-title",
+            "extract": {
+                "type": "text"
+            }
+        },
+        "content": {
+            "element": "div",
+            "id": "chapter-content",
+            "array": true
+        },
+        "index": {
+            "selector": "li ul li.flow-root a",
+            "array": true,
+            "inverted": false,
+            "extract": {
+                "type": "attr",
+                "key": "href"
+            }
+        },
+        "next_page": {
+            "selector": "a:has(span:contains('Next'))",
+            "array": false,
+            "extract": {
+                "type": "attr",
+                "key": "href"
+            }
         }
     }
 ]
@@ -7,6 +7,7 @@ from pathlib import Path
 import shutil
 from dotenv import load_dotenv
 from ebooklib import epub
+import unicodedata
 
 from . import logger_manager
 
@@ -77,6 +78,16 @@ class FileManager:
     def save_chapter_html(self, filename: str, content: str):
         full_path = self.novel_chapters_dir / filename
         logger.debug(f'Saving chapter to {full_path}')
+        content = unicodedata.normalize('NFKC', content)
+        char_replacements = {
+            "â": "'",  # Replace â with an apostrophe
+            "\u2018": "'",  # Unicode left single quotation mark
+            "\u2019": "'",  # Unicode right single quotation mark
+            "\u201C": '"',  # Unicode left double quotation mark
+            "\u201D": '"',  # Unicode right double quotation mark
+        }
+        for old_char, new_char in char_replacements.items():
+            content = content.replace(old_char, new_char)
         _save_content_to_file(full_path, content)
 
     def load_chapter_html(self, filename: str):
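
The new normalization pass in save_chapter_html is NFKC normalization followed by a handful of literal replacements that straighten curly quotes. A small standalone illustration (the sample string is invented):

    import unicodedata

    sample = '\u201CIt\u2019s fine,\u201D she said.'
    normalized = unicodedata.normalize('NFKC', sample)
    for old_char, new_char in {'\u2018': "'", '\u2019': "'", '\u201C': '"', '\u201D': '"'}.items():
        normalized = normalized.replace(old_char, new_char)
    print(normalized)  # "It's fine," she said.
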
@@ -220,6 +220,7 @@ class Novel:
             self.decoder = Decoder(self.host)
         elif update_host:
             self.decoder = Decoder(utils.obtain_host(self.toc_main_url))
+        self.save_novel()
 
     def add_toc_html(self, html: str, host: str = None) -> None:
         if self.toc_main_url:
@@ -248,7 +249,7 @@ class Novel:
         toc_not_exists = not all_tocs_content and self.toc_main_url is None
         if toc_not_exists:
             logger.critical(
-                'There is no toc html and no toc url setted, unable to get toc.')
+                'There is no toc html and no toc url set, unable to get toc.')
             return False
 
         reload_files = reload_files and self.toc_main_url is not None
@@ -259,18 +260,16 @@ class Novel:
             toc_content = self._add_toc(self.toc_main_url)
             all_tocs_content.append(toc_content)
             if self.decoder.has_pagination():
-                next_page = self._get_next_page_from_toc_content(toc_content)
+                next_page = self.decoder.get_toc_next_page_url(toc_content)
                 while next_page:
                     toc_content = self._add_toc(next_page)
-                    next_page = self._get_next_page_from_toc_content(
-                        toc_content)
+                    next_page = self.decoder.get_toc_next_page_url(toc_content)
                     all_tocs_content.append(toc_content)
 
         # Now we get the links from the toc content
         self.chapters_url_list = []
         for toc_content in all_tocs_content:
-            chapters_url_from_toc_content = self._get_chapter_urls_from_toc_content(
-                toc_content)
+            chapters_url_from_toc_content = self.decoder.get_chapter_urls(toc_content)
             if chapters_url_from_toc_content is None:
                 logger.error('Chapters url not found on toc_content')
                 return False
@@ -307,35 +306,39 @@ class Novel:
         return chapter_list
 
     def scrap_chapter(self, chapter_url: str = None, chapter_idx: int = None, update_html: bool = False) -> Chapter:
+        logger.info('Scraping Chapter...')
+        chapter = None
         if not utils.check_exclusive_params(chapter_url, chapter_idx):
-            logger.error(
-                'chapter_url and chapter_id, only one needs to be setted')
-            return
+            raise ValueError("chapter_url and chapter_idx, only one needs to be set")
 
         if chapter_url is not None:
+            logger.debug(f'Using chapter url: {chapter_url}')
             chapter = self._get_chapter_by_url(chapter_url=chapter_url)
             if chapter is None:
+                logger.warning(f'Chapter with url "{chapter_url}" does not exist, generating one...')
                 chapter = Chapter(chapter_url=chapter_url)
 
         if chapter_idx is not None:
+            logger.debug(f'Using chapter index: {chapter_idx}')
             if chapter_idx < 0 or chapter_idx >= len(self.chapters):
-                logger.error(f'Could not find chapter with idx {chapter_idx}')
-                return
-            chapter = self.chapters[chapter_idx]
+                logger.critical(f'Could not find chapter with idx {chapter_idx}')
+                raise ValueError(f'Could not find chapter with idx {chapter_idx}')
 
+            chapter = self.chapters[chapter_idx]
+            if update_html:
+                logger.debug('HTML will be updated...')
         chapter = self._get_chapter(chapter,
                                     reload=update_html)
 
         if not chapter.chapter_html or not chapter.chapter_html_filename:
-            logger.warning(f'Failed to create chapter on link: "{
-                chapter_url}" on path "{chapter.chapter_html_filename}"')
-            return
+            logger.critical(f'Failed to create chapter on link: "{chapter_url}" '
+                            f'on path "{chapter.chapter_html_filename}"')
+            raise ValueError(f'Failed to create chapter on link: "{chapter_url}" '
+                             f'on path "{chapter.chapter_html_filename}"')
 
-        # We get the title and content, if there's no title, we autogenerate one.
+        # We get the chapter title and content
+        # We pass an index so we can autogenerate a Title
         chapter = self._decode_chapter(chapter=chapter, idx_for_chapter_name=chapter_idx)
-        if not chapter.chapter_content:
-            logger.error('Content not found')
-            return
 
         logger.info(f'Chapter scrapped from link: {chapter_url}')
         return chapter
@@ -506,22 +509,6 @@ class Novel:
         self.file_manager.add_toc(content)
         return content
 
-    def _get_chapter_urls_from_toc_content(self, toc_content: str) -> list[str]:
-        toc_elements = self.decoder.decode_html(toc_content, 'index')
-        try:
-            toc_urls = [toc_element['href'] for toc_element in toc_elements]
-        except KeyError as e:
-            logger.error(f'{e} not found on the Tag elements decoded from TOC')
-            return
-        if toc_urls:
-            return toc_urls
-        logger.warning('No chapter links found on toc content')
-
-    def _get_next_page_from_toc_content(self, toc_content: str) -> str:
-        next_page = self.decoder.decode_html(toc_content, 'next_page')
-        if next_page:
-            return next_page[0]['href']
-
     def _add_or_update_chapter_data(self, chapter: Chapter, link_idx: int = None, save_in_file: bool = True) -> None:
         if link_idx:
             chapter_idx = link_idx
@@ -579,35 +566,28 @@ class Novel:
         self.save_novel()
 
     def _decode_chapter(self, chapter: Chapter, idx_for_chapter_name: str = None) -> Chapter:
-        chapter_title = None
-
+        logger.debug('Decoding chapter...')
         if chapter.chapter_html is None:
+            logger.debug(f'No HTML content found, requesting HTML content...')
             chapter = self._get_chapter(chapter)
 
         if not chapter.chapter_html:
-            logger.error(f'No chapter content found for chapter link {
-                chapter.chapter_url} on file {chapter.chapter_html_filename}')
-            return None
-
-        paragraphs = self.decoder.decode_html(chapter.chapter_html, 'content')
-
-        if not paragraphs:
-            if chapter:
-                logger.warning(f'No paragraphs found in chapter link {
-                    chapter.chapter_url} on file {chapter.chapter_html_filename}')
+            raise ValueError(f'Chapter HTML could not be obtained for chapter link "{chapter.chapter_url}" '
+                             f'on file "{chapter.chapter_html_filename}"')
 
-        chapter_title = self.decoder.decode_html(chapter.chapter_html, 'title')
+        logger.debug('Obtaining chapter title...')
+        chapter_title = self.decoder.get_chapter_title(chapter.chapter_html)
         if not chapter_title:
-            chapter_title = f'{self.metadata.novel_title} Chapter {
-                idx_for_chapter_name}'
+            logger.debug('No chapter title found, generating one...')
+            chapter_title = f'{self.metadata.novel_title} Chapter {idx_for_chapter_name}'
         chapter.chapter_title = str(chapter_title)
+        logger.debug(f'Chapter title: "{chapter_title}"')
 
-        chapter.chapter_content = ""
-        if self.scraper_behavior.save_title_to_content:
-            chapter.chapter_content += f'<h4>{chapter_title}</h4>'
-        logger.info(f'{len(paragraphs)} paragraphs found in chapter')
-        for paragraph in paragraphs:
-            chapter.chapter_content += str(paragraph)
+        logger.debug('Obtaining chapter content...')
+        chapter.chapter_content = self.decoder.get_chapter_content(chapter.chapter_html,
+                                                                   self.scraper_behavior.save_title_to_content,
+                                                                   chapter.chapter_title)
+        logger.debug('Chapter successfully decoded')
 
         return chapter
 
@@ -631,7 +611,7 @@ class Novel:
         if self.metadata.start_date:
             date_metadata += self.metadata.start_date
         # Calibre specification doesn't use end_date.
-        # For now we use a custom metadata
+        # For now, we use a custom metadata
         # https://idpf.org/epub/31/spec/epub-packages.html#sec-opf-dcdate
         # if self.metadata.end_date:
         #     date_metadata += f'/{self.metadata.end_date}'
@@ -1 +1 @@
-__version__ = "1.0.3"
+__version__ = "1.0.4"
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: web-novel-scraper
-Version: 1.0.3
+Version: 1.0.4
 Summary: Python tool that allows you to scrape web novels from various sources and save them to more readable formats like EPUB.
 Project-URL: Homepage, https://github.com/ImagineBrkr/web-novel-scraper
 Project-URL: Documentation, https://web-novel-scraper.readthedocs.io
@@ -0,0 +1,18 @@
+web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+web_novel_scraper/__main__.py,sha256=OQQVX5CttmAkUwdrnjBSjKPaoh_boUI2ysHi3rLGOSs,17769
+web_novel_scraper/decode.py,sha256=0RMHx1buR01KhuXiVQwdSpCGN960Xh-iPw1eYHxLeDg,10181
+web_novel_scraper/file_manager.py,sha256=Q3DH-c8fWz9sziMps7A3p_sQoDMEpqBket07Agh-__Q,11898
+web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
+web_novel_scraper/novel_scraper.py,sha256=Notk0O94HZrO-MVKDGCBL0VopApFchn13FO2_N3ZfRM,28418
+web_novel_scraper/request_manager.py,sha256=VtGpRi5b_Dp3h8viCdt7yMCb9M21Lk7oLC_Q_0EkXH8,6448
+web_novel_scraper/utils.py,sha256=vq5ROuPv04k3MhbksTe0ni_yP0i_a7T_33mkBB1DUbQ,2076
+web_novel_scraper/version.py,sha256=acuR_XSJzp4OrQ5T8-Ac5gYe48mUwObuwjRmisFmZ7k,22
+web_novel_scraper/custom_processor/__init__.py,sha256=iy4tjivMjshSzc52--aa-jK53qu9VwdK-6p4vuQc6oc,103
+web_novel_scraper/custom_processor/custom_processor.py,sha256=h1MPl6JU_C2Mc7SqK70LsNQHpDzSL6QyraMIQ87HcMM,870
+web_novel_scraper/custom_processor/sites/genesis.py,sha256=xV0eybI0ieHR5gn4yWXI74l99Eayhqs16PIYs-BrPjE,1843
+web_novel_scraper/custom_processor/sites/royalroad.py,sha256=_2PsFC_w3RJCUkAPoRn-7R2jlzl3XsG4WYtRaQkp6lg,787
+web_novel_scraper/decode_guide/decode_guide.json,sha256=IBBzbSSVO-yQ5PCY7o8ralnaonMwBpEZW1v1TStiVqc,7582
+web_novel_scraper-1.0.4.dist-info/METADATA,sha256=IhvDqK_Gz1POjzbH2cQVUYql1dhZJvdHnM9R--le0uc,8423
+web_novel_scraper-1.0.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+web_novel_scraper-1.0.4.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
+web_novel_scraper-1.0.4.dist-info/RECORD,,
@@ -1,14 +0,0 @@
-web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-web_novel_scraper/__main__.py,sha256=PBIAG1vshnRdvYwyeD-RxlHS5mNEby-d4puV9kEyfpA,17615
-web_novel_scraper/decode.py,sha256=dqGv_8nFSKwO6GBj3jhaO9SQeLHeBjDzoV1C_YcN40k,5085
-web_novel_scraper/file_manager.py,sha256=PJu8kKeng49DTNQBbbMekFtIcTZOkeCEjFYqYJarv9M,11363
-web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
-web_novel_scraper/novel_scraper.py,sha256=eiic2i3AdK9lcFK9aNb4d8ptnKv9ua1B_9kcUY8_liM,28660
-web_novel_scraper/request_manager.py,sha256=VtGpRi5b_Dp3h8viCdt7yMCb9M21Lk7oLC_Q_0EkXH8,6448
-web_novel_scraper/utils.py,sha256=vq5ROuPv04k3MhbksTe0ni_yP0i_a7T_33mkBB1DUbQ,2076
-web_novel_scraper/version.py,sha256=2plzdEEb24FLjE2I2XyBBcJEPYWHccNL4SgtLC_6erg,22
-web_novel_scraper/decode_guide/decode_guide.json,sha256=Q4v-OZh_1MwdrFxDDVvj8T3evW3zzbSapRaGwFCdnX8,5425
-web_novel_scraper-1.0.3.dist-info/METADATA,sha256=VKG91J-QhL_NBjSuS29Em5_ZcFlw9oKf50-7WcJ97Lw,8423
-web_novel_scraper-1.0.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-web_novel_scraper-1.0.3.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
-web_novel_scraper-1.0.3.dist-info/RECORD,,