web-novel-scraper 1.0.3__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -52,7 +52,7 @@ def validate_date(ctx, param, value):
52
52
 
53
53
  # COMMON ARGUMENTS
54
54
  title_option = click.option(
55
- '-t', '--title', type=str, required=True, help='Title of the novel, this serves as the identifier.')
55
+ '-t', '--title', type=str, required=True, envvar='SCRAPER_NOVEL_TITLE', help='Title of the novel, this serves as the identifier.')
56
56
  novel_base_dir_option = click.option(
57
57
  '-nb', '--novel-base-dir', type=str, help='Alternative base directory for the novel files.')
58
58
 
@@ -330,19 +330,25 @@ def show_toc(title, novel_base_dir):
330
330
  @click.option('--update-html', is_flag=True, default=False, show_default=True, help='If the chapter HTML is saved, it will be updated.')
331
331
  def scrap_chapter(title, novel_base_dir, chapter_url, chapter_num, update_html):
332
332
  """Scrap a chapter of a novel."""
333
+ if (chapter_url is None and chapter_num is None) or (chapter_url and chapter_num):
334
+ raise click.UsageError("You must set exactly one of --chapter-url or --chapter-num.")
335
+
333
336
  novel = obtain_novel(title, novel_base_dir)
334
- if not chapter_url and not chapter_num:
335
- click.echo('Chapter URL or chapter number should be set.', err=True)
336
- if chapter_num and chapter_url:
337
- click.echo('It should be either chapter URL or chapter number.', err=True)
338
- if chapter_num <= 0 or chapter_num > len(novel.chapters):
339
- raise click.BadParameter(
340
- 'Chapter number should be positive and an existing chapter.', param_hint='--chapter-num')
341
- chapter = novel.scrap_chapter(
342
- chapter_url=chapter_url, chapter_idx=chapter_num - 1, update_html=update_html)
337
+
338
+ if chapter_num is not None:
339
+ if chapter_num <= 0 or chapter_num > len(novel.chapters):
340
+ raise click.BadParameter(
341
+ 'Chapter number should be positive and an existing chapter.', param_hint='--chapter-num')
342
+ chapter = novel.scrap_chapter(chapter_idx=chapter_num - 1,
343
+ update_html=update_html)
344
+
345
+ else:
346
+ chapter = novel.scrap_chapter(chapter_url=chapter_url,
347
+ update_html=update_html)
348
+
343
349
  if not chapter:
344
- click.echo('Chapter number or URL not found.', err=True)
345
- return
350
+ raise click.ClickException('Chapter not found or scraping failed.')
351
+
346
352
  click.echo(chapter)
347
353
  click.echo('Content:')
348
354
  click.echo(chapter.chapter_content)
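With this change the command validates its inputs up front: exactly one of --chapter-url or --chapter-num must be given, and -t/--title can now be supplied through the SCRAPER_NOVEL_TITLE environment variable. A rough usage sketch, assuming the console script is installed as web-novel-scraper and Click derives the subcommand name scrap-chapter from the function name (the novel title and URL below are placeholders):

    # select the chapter by its 1-based number
    web-novel-scraper scrap-chapter -t "my-novel" --chapter-num 3

    # or select it by URL, taking the title from the environment
    SCRAPER_NOVEL_TITLE="my-novel" web-novel-scraper scrap-chapter --chapter-url "https://example.com/chapter-3"

    # passing both selectors, or neither, now fails with a UsageError instead of scraping anyway
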
@@ -0,0 +1,2 @@
1
+ from .custom_processor import CustomProcessor, ProcessorRegistry
2
+ from .sites import royalroad, genesis
@@ -0,0 +1,25 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any, Dict
3
+
4
+ class CustomProcessor(ABC):
5
+ @abstractmethod
6
+ def process(self, html: str) -> Any:
7
+ """Process the HTML content using custom logic"""
8
+ pass
9
+
10
+ class ProcessorRegistry:
11
+ _processors: Dict[str, Dict[str, CustomProcessor]] = {}
12
+
13
+ @classmethod
14
+ def register(cls, host: str, content_type: str, processor: CustomProcessor):
15
+ if host not in cls._processors:
16
+ cls._processors[host] = {}
17
+ cls._processors[host][content_type] = processor
18
+
19
+ @classmethod
20
+ def get_processor(cls, host: str, content_type: str) -> CustomProcessor:
21
+ return cls._processors.get(host, {}).get(content_type)
22
+
23
+ @classmethod
24
+ def has_processor(cls, host: str, content_type: str) -> bool:
25
+ return bool(cls.get_processor(host, content_type))
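ProcessorRegistry keys processors by host and content type; site modules register an instance at import time and the decoder checks the registry before falling back to the CSS-selector guide. A minimal sketch of how a new site could hook in (example.com and the hard-coded URLs are hypothetical, purely to illustrate the contract):

    from web_novel_scraper.custom_processor import CustomProcessor, ProcessorRegistry

    class ExampleIndexProcessor(CustomProcessor):
        def process(self, html: str):
            # Return whatever decode_html should produce for the 'index' content type,
            # here a fixed list of chapter URLs instead of real parsing.
            return ['https://example.com/chapter-1', 'https://example.com/chapter-2']

    ProcessorRegistry.register('example.com', 'index', ExampleIndexProcessor())
    assert ProcessorRegistry.has_processor('example.com', 'index')
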
@@ -0,0 +1,46 @@
1
+ import re
2
+ import json
3
+ from typing import List, Optional
4
+ from ..custom_processor import CustomProcessor, ProcessorRegistry
5
+
6
+ GENESIS_STUDIO_VIEWER_URL = 'https://genesistudio.com/viewer'
7
+
8
+ class GenesisChaptersProcessor(CustomProcessor):
9
+ def process(self, html: str) -> Optional[List[dict]]:
10
+ pattern = r',chapters:\s*{\s*free:\s*(\[.*?"}}])'
11
+ match = re.search(pattern, html, re.DOTALL)
12
+
13
+ if not match:
14
+ return None
16
+
17
+ try:
18
+ chapters_json = match.group(1).strip()
19
+ replaces = {
20
+ "chapter_title:": '"chapter_tile":',
21
+ "id:": '"id":',
22
+ "nsfw:": '"nsfw":',
23
+ "required_tier:": '"required_tier":',
24
+ "date_created:": '"date_created":',
25
+ "spoiler_title:": '"spoiler_title":',
26
+ "chapter_number:": '"chapter_number":',
27
+ "novel:": '"novel":',
28
+ }
29
+ # Ensure the JSON string ends properly
30
+ if not chapters_json.endswith(']'):
31
+ chapters_json += ']'
32
+ for old_key, new_key in replaces.items():
33
+ chapters_json = chapters_json.replace(old_key, new_key)
34
+ # print(f"Extracted JSON: {chapters_json[12200:12300]}") # Debug print
35
+ chapters = json.loads(chapters_json)
36
+ chapters_url = []
37
+ for chapter in chapters:
38
+ chapters_url.append(f"{GENESIS_STUDIO_VIEWER_URL}/{chapter['id']}")
39
+ # print(chapters)  # Debug print
40
+ return chapters_url
41
+
42
+ except (json.JSONDecodeError, IndexError) as e:
43
+ print(f"Error processing JSON: {str(e)}")
44
+ return None
45
+
46
+ ProcessorRegistry.register('genesistudio.com', 'index', GenesisChaptersProcessor())
@@ -0,0 +1,22 @@
1
+ import re
2
+ import json
3
+ from typing import List, Optional
4
+ from ..custom_processor import CustomProcessor, ProcessorRegistry
5
+
6
+ class RoyalRoadChaptersProcessor(CustomProcessor):
7
+ def process(self, html: str) -> Optional[List[dict]]:
8
+ pattern = r'window\.chapters\s*=\s*(\[.*?\]);'
9
+ match = re.search(pattern, html, re.DOTALL)
10
+
11
+ if not match:
12
+ return None
13
+
14
+ try:
15
+ chapters_json = match.group(1)
16
+ chapters = json.loads(chapters_json)
17
+ chapters = [chapter['url'] for chapter in chapters if 'url' in chapter]
18
+ return chapters
19
+ except (json.JSONDecodeError, IndexError):
20
+ return None
21
+
22
+ ProcessorRegistry.register('www.royalroad.com', 'index', RoyalRoadChaptersProcessor())
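This processor targets the window.chapters array that Royal Road embeds in its fiction pages. A tiny, hand-written sample of the kind of markup it matches (not copied from the site):

    sample_html = '<script>window.chapters = [{"id": 1, "url": "/fiction/1/demo/chapter/1/one"}];</script>'
    urls = RoyalRoadChaptersProcessor().process(sample_html)
    # urls == ['/fiction/1/demo/chapter/1/one']
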
@@ -1,8 +1,10 @@
1
1
  import os
2
2
  import json
3
3
  from pathlib import Path
4
+ from typing import Optional
4
5
 
5
6
  from . import logger_manager
7
+ from .custom_processor.custom_processor import ProcessorRegistry
6
8
 
7
9
  from bs4 import BeautifulSoup
8
10
 
@@ -10,8 +12,7 @@ logger = logger_manager.create_logger('DECODE HTML')
10
12
 
11
13
  CURRENT_DIR = Path(__file__).resolve().parent
12
14
 
13
- DECODE_GUIDE_FILE = os.getenv('DECODE_GUIDE_FILE', f'{
14
- CURRENT_DIR}/decode_guide/decode_guide.json')
15
+ DECODE_GUIDE_FILE = os.getenv('DECODE_GUIDE_FILE', f'{CURRENT_DIR}/decode_guide/decode_guide.json')
15
16
 
16
17
  XOR_SEPARATOR = "XOR"
17
18
 
@@ -41,17 +42,92 @@ class Decoder:
41
42
  self.decode_guide = self._get_element_by_key(
42
43
  DECODE_GUIDE, 'host', host)
43
44
 
44
- def decode_html(self, html: str, content_type: str):
45
+ def get_chapter_urls(self, html: str) -> list[str]:
46
+ logger.debug('Obtaining chapter URLs...')
47
+ chapter_urls = self.decode_html(html, 'index')
48
+
49
+ if chapter_urls is None:
50
+ logger.critical(f"Failed to obtain chapter URLs for {self.host}")
51
+ raise ValueError(f"Failed to obtain chapter URLs for {self.host}")
52
+
53
+ if isinstance(chapter_urls, str):
54
+ logger.warning('When obtaining chapter urls, obtained a String but expected a List')
55
+ logger.warning('Check decode config')
56
+ chapter_urls = [chapter_urls]
57
+
58
+ return chapter_urls
59
+
60
+ def get_toc_next_page_url(self, html: str) -> Optional[str]:
61
+ logger.debug('Obtaining toc next page URL...')
62
+ toc_next_page_url = self.decode_html(html, 'next_page')
63
+ if toc_next_page_url is None:
64
+ logger.debug('No next page URL found, assuming last page...')
65
+ return None
66
+ return toc_next_page_url
67
+
68
+ def get_chapter_title(self, html: str) -> Optional[str]:
69
+ logger.debug('Obtaining chapter title...')
70
+ chapter_title = self.decode_html(html, 'title')
71
+ if chapter_title is None:
72
+ logger.debug('No chapter title found.')
73
+ return chapter_title
74
+
75
+ def get_chapter_content(self, html: str, save_title_to_content: bool, chapter_title: str) -> str:
76
+ logger.debug('Obtaining chapter content...')
77
+ full_chapter_content = ""
78
+ chapter_content = self.decode_html(html, 'content')
79
+
80
+ if chapter_content is None:
81
+ logger.critical('No content found on chapter')
82
+ raise ValueError('No content found on chapter')
83
+
84
+ if save_title_to_content:
85
+ logger.debug('Saving chapter title to content...')
86
+ full_chapter_content += f'<h4>{chapter_title}</h4>'
87
+
88
+ if isinstance(chapter_content, list):
89
+ logger.debug(f'{len(chapter_content)} paragraphs found in chapter')
90
+ logger.debug('Converting list of paragraphs to a single string')
91
+ for paragraph in chapter_content:
92
+ full_chapter_content += str(paragraph)
93
+ else:
94
+ logger.debug('Chapter content is not a list, no conversion made')
95
+ full_chapter_content += str(chapter_content)
96
+ return full_chapter_content
97
+
98
+ def decode_html(self, html: str, content_type: str) -> str | list[str] | None:
99
+ logger.debug(f'Decoding HTML...')
100
+ logger.debug(f'Content type: {content_type}')
101
+ logger.debug(f'Decode guide: {DECODE_GUIDE_FILE}')
102
+ logger.debug(f'Host: {self.host}')
45
103
  if not content_type in self.decode_guide:
46
- logger.error(f'{content_type} key does not exists on decode guide {
47
- DECODE_GUIDE_FILE} for host {self.host}')
48
- return
49
- soup = BeautifulSoup(html, 'html.parser')
104
+ logger.critical(f'{content_type} key does not exist on decode guide {DECODE_GUIDE_FILE} '
105
+ f'for host {self.host}')
106
+ raise ValueError(f'{content_type} key does not exist on decode guide {DECODE_GUIDE_FILE} '
107
+ f'for host {self.host}')
108
+
109
+ if ProcessorRegistry.has_processor(self.host, content_type):
110
+ logger.debug(f'Host {self.host} will use a custom processor')
111
+ processor = ProcessorRegistry.get_processor(self.host, content_type)
112
+ return processor.process(html)
113
+
114
+ logger.debug('Starting HTML parsing...')
115
+ try:
116
+ soup = BeautifulSoup(html, 'html.parser')
117
+ except Exception as e:
118
+ logger.critical(f'Error parsing HTML with BeautifulSoup: {e}')
119
+ raise ValueError(f'Error parsing HTML with BeautifulSoup: {e}')
120
+
50
121
  decoder = self.decode_guide[content_type]
51
122
  elements = self._find_elements(soup, decoder)
52
123
  if not elements:
53
- logger.warning(f'{content_type} not found on html using {
54
- DECODE_GUIDE_FILE} for host {self.host}')
124
+ logger.warning(f'{content_type} not found on html using {DECODE_GUIDE_FILE} '
125
+ f'for host {self.host}')
126
+
127
+ # Investigate this conditional
128
+ if content_type == 'title' and isinstance(elements, list):
129
+ logger.debug('Joining titles...')
130
+ return ' '.join(elements)
55
131
  return elements
56
132
 
57
133
  def has_pagination(self, host: str = None):
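The Decoder now exposes purpose-built helpers that wrap decode_html and raise instead of silently returning None. A rough sketch of the intended call pattern (toc_html and chapter_html stand in for real page content):

    decoder = Decoder('www.royalroad.com')
    chapter_urls = decoder.get_chapter_urls(toc_html)          # raises ValueError if nothing is found
    next_page = decoder.get_toc_next_page_url(toc_html)        # None on the last TOC page
    title = decoder.get_chapter_title(chapter_html)            # may be None; the caller generates a fallback
    content = decoder.get_chapter_content(chapter_html,
                                          save_title_to_content=True,
                                          chapter_title=title or 'Chapter 1')
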
@@ -81,8 +157,11 @@ class Decoder:
81
157
 
82
158
  return "\n".join([line.strip() for line in str(soup).splitlines() if line.strip()])
83
159
 
84
- def _find_elements(self, soup: BeautifulSoup, decoder: dict):
160
+ @staticmethod
161
+ def _find_elements(soup: BeautifulSoup, decoder: dict):
162
+ logger.debug('Finding elements...')
85
163
  selector = decoder.get('selector')
164
+ elements = []
86
165
  if selector is None:
87
166
  selector = ''
88
167
  element = decoder.get('element')
@@ -91,32 +170,46 @@ class Decoder:
91
170
  attributes = decoder.get('attributes')
92
171
 
93
172
  if element:
173
+ logger.debug(f'Using element "{element}"')
94
174
  selector += element
95
175
  if _id:
176
+ logger.debug(f'Using id "{_id}"')
96
177
  selector += f'#{_id}'
97
178
  if _class:
179
+ logger.debug(f'Using class "{_class}"')
98
180
  selector += f'.{_class}'
99
181
  if attributes:
100
182
  for attr, value in attributes.items():
101
- selector += f'[{attr}="{value}"]' if value else f'[{attr}]'
183
+ logger.debug(f'Using attribute "{attr}"')
184
+ if value is not None:
185
+ logger.debug(f'With value "{value}"')
186
+ selector += f'[{attr}="{value}"]'
187
+ else:
188
+ selector += f'[{attr}]'
102
189
  selectors = [selector]
103
190
  else:
191
+ logger.debug(f'Using selector "{selector}"')
104
192
  if XOR_SEPARATOR in selector:
193
+ logger.debug(f'Found XOR_OPERATOR "{XOR_SEPARATOR}" in selector')
194
+ logger.debug('Splitting selectors...')
105
195
  selectors = selector.split(XOR_SEPARATOR)
106
196
  else:
107
197
  selectors = [selector]
108
198
 
109
199
  for selector in selectors:
110
- logger.debug(f'Attempt using selector {selector}')
200
+ logger.debug(f'Searching using selector "{selector}"...')
111
201
  elements = soup.select(selector)
112
202
  if elements:
113
- logger.debug(f'{len(elements)} found using selector {selector}')
203
+ logger.debug(f'{len(elements)} found using selector "{selector}"')
114
204
  break
205
+ logger.debug(f'No elements found using selector "{selector}"')
115
206
 
116
207
  extract = decoder.get('extract')
117
208
  if extract:
209
+ logger.debug(f'Extracting from elements...')
118
210
  if extract["type"] == "attr":
119
211
  attr_key = extract["key"]
212
+ logger.debug(f'Extracting value from attribute "{attr_key}"...')
120
213
  elements_aux = elements
121
214
  elements = []
122
215
  for element in elements_aux:
@@ -125,15 +218,34 @@ class Decoder:
125
218
  if attr:
126
219
  elements.append(attr)
127
220
  except KeyError:
221
+ logger.debug(f'Attribute "{attr_key}" not found')
222
+ logger.debug('Ignoring...')
128
223
  pass
224
+ logger.debug(f'{len(elements)} elements found using attribute "{attr_key}"')
129
225
  if extract["type"] == "text":
226
+ logger.debug('Extracting text from elements...')
130
227
  elements = [element.string for element in elements]
228
+
229
+ if not elements:
230
+ logger.error('No elements found, returning "None"')
231
+ return None
232
+
131
233
  inverted = decoder.get('inverted')
132
234
  if inverted:
235
+ logger.debug('Inverted option activated')
236
+ logger.debug('Inverting elements order...')
133
237
  elements = elements[::-1]
134
- return elements if decoder.get('array') else elements[0] if elements else None
135
238
 
136
- def _get_element_by_key(self, json_data, key, value):
239
+ if decoder.get('array'):
240
+ logger.debug('Array option activated')
241
+ logger.debug('Returning elements as a list')
242
+ return elements
243
+ logger.debug('Array option not activated')
244
+ logger.debug('Returning only first element...')
245
+ return elements[0]
246
+
247
+ @staticmethod
248
+ def _get_element_by_key(json_data, key, value):
137
249
  for item in json_data:
138
250
  if item[key] == value:
139
251
  return item
@@ -24,7 +24,11 @@
24
24
  "class": null,
25
25
  "selector": null,
26
26
  "attributes": null,
27
- "array": true
27
+ "array": true,
28
+ "extract": {
29
+ "type": "attr",
30
+ "key": "href"
31
+ }
28
32
  },
29
33
  "next_page": {
30
34
  "element": "p",
@@ -60,7 +64,11 @@
60
64
  "class": null,
61
65
  "selector": "div.m-newest2 ul li a",
62
66
  "attributes": null,
63
- "array": true
67
+ "array": true,
68
+ "extract": {
69
+ "type": "attr",
70
+ "key": "href"
71
+ }
64
72
  },
65
73
  "next_page": {
66
74
  "element": null,
@@ -72,7 +80,7 @@
72
80
  }
73
81
  },
74
82
  {
75
- "host": "royalroad.com",
83
+ "host": "www.royalroad.com",
76
84
  "has_pagination": false,
77
85
  "title": {
78
86
  "element": null,
@@ -95,12 +103,7 @@
95
103
  "array": true
96
104
  },
97
105
  "index": {
98
- "element": null,
99
- "id": null,
100
- "class": null,
101
- "selector": "tr.chapter-row td a",
102
- "attributes": null,
103
- "array": true
106
+ "use_custom_processor": true
104
107
  },
105
108
  "next_page": {
106
109
  "element": null,
@@ -127,10 +130,10 @@
127
130
  }
128
131
  },
129
132
  "content": {
130
- "element": "div#chr-content",
133
+ "element": null,
131
134
  "id": null,
132
135
  "class": null,
133
- "selector": null,
136
+ "selector": "div#chr-content p",
134
137
  "attributes": null,
135
138
  "array": true
136
139
  },
@@ -140,7 +143,11 @@
140
143
  "class": null,
141
144
  "selector": "ul.list-chapter li a",
142
145
  "attributes": null,
143
- "array": true
146
+ "array": true,
147
+ "extract": {
148
+ "type": "attr",
149
+ "key": "href"
150
+ }
144
151
  },
145
152
  "next_page": {
146
153
  "element": null,
@@ -167,10 +174,10 @@
167
174
  }
168
175
  },
169
176
  "content": {
170
- "element": "div#chr-content",
177
+ "element": null,
171
178
  "id": null,
172
179
  "class": null,
173
- "selector": null,
180
+ "selector": "div#chr-content p",
174
181
  "attributes": null,
175
182
  "array": true
176
183
  },
@@ -180,7 +187,11 @@
180
187
  "class": null,
181
188
  "selector": "ul.list-chapter li a",
182
189
  "attributes": null,
183
- "array": true
190
+ "array": true,
191
+ "extract": {
192
+ "type": "attr",
193
+ "key": "href"
194
+ }
184
195
  },
185
196
  "next_page": {
186
197
  "element": null,
@@ -207,7 +218,82 @@
207
218
  "index": {
208
219
  "element": "ul.main li a",
209
220
  "array": true,
210
- "inverted": true
221
+ "inverted": true,
222
+ "extract": {
223
+ "type": "attr",
224
+ "key": "href"
225
+ }
226
+ }
227
+ },
228
+ {
229
+ "host": "genesistudio.com",
230
+ "has_pagination": false,
231
+ "title": {
232
+ "element": null,
233
+ "id": null,
234
+ "class": null,
235
+ "selector": "p.leading-none span",
236
+ "attributes": null,
237
+ "array": true,
238
+ "extract": {
239
+ "type": "text",
240
+ "key": "text"
241
+ }
242
+ },
243
+ "content": {
244
+ "element": "p",
245
+ "id": null,
246
+ "class": "narration",
247
+ "selector": null,
248
+ "attributes": null,
249
+ "array": true,
250
+ "extract": {
251
+ "type": "text",
252
+ "key": "text"
253
+ }
254
+ },
255
+ "index": {
256
+ "use_custom_processor": true
257
+ },
258
+ "next_page": {
259
+ "element": null,
260
+ "id": null,
261
+ "class": null,
262
+ "selector": null,
263
+ "attributes": null,
264
+ "array": true
265
+ }
266
+ },
267
+ {
268
+ "host": "hostednovel.com",
269
+ "has_pagination": true,
270
+ "title": {
271
+ "selector": "span#chapter-title",
272
+ "extract": {
273
+ "type": "text"
274
+ }
275
+ },
276
+ "content": {
277
+ "element": "div",
278
+ "id": "chapter-content",
279
+ "array": true
280
+ },
281
+ "index": {
282
+ "selector": "li ul li.flow-root a",
283
+ "array": true,
284
+ "inverted": false,
285
+ "extract": {
286
+ "type": "attr",
287
+ "key": "href"
288
+ }
289
+ },
290
+ "next_page": {
291
+ "selector": "a:has(span:contains('Next'))",
292
+ "array": false,
293
+ "extract": {
294
+ "type": "attr",
295
+ "key": "href"
296
+ }
211
297
  }
212
298
  }
213
299
  ]
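The index entries now carry an extract block, so _find_elements returns the href strings themselves rather than bs4 Tag objects, which is what Decoder.get_chapter_urls expects. A small illustration with a toy guide entry (not one of the shipped hosts), calling the internal helper directly just to show the mechanism:

    from bs4 import BeautifulSoup
    from web_novel_scraper.decode import Decoder

    html = '<ul class="main"><li><a href="/ch/1">1</a></li><li><a href="/ch/2">2</a></li></ul>'
    guide_entry = {'selector': 'ul.main li a', 'array': True,
                   'extract': {'type': 'attr', 'key': 'href'}}
    soup = BeautifulSoup(html, 'html.parser')
    print(Decoder._find_elements(soup, guide_entry))   # ['/ch/1', '/ch/2']
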
@@ -7,6 +7,7 @@ from pathlib import Path
7
7
  import shutil
8
8
  from dotenv import load_dotenv
9
9
  from ebooklib import epub
10
+ import unicodedata
10
11
 
11
12
  from . import logger_manager
12
13
 
@@ -44,10 +45,10 @@ class FileManager:
44
45
  novel_config_dir: str = None,
45
46
  read_only: bool = False):
46
47
  logger.debug(f'Initializing FileManager for novel: {novel_title}, read_only: {read_only}')
47
- novel_base_dir = novel_base_dir if novel_base_dir else f'{
48
- SCRAPER_BASE_DATA_DIR}/{novel_title}'
49
- novel_config_dir = novel_config_dir if novel_config_dir else f'{
50
- SCRAPER_BASE_CONFIG_DIR}/{novel_title}'
48
+ novel_base_dir = novel_base_dir if novel_base_dir else \
49
+ f'{SCRAPER_BASE_DATA_DIR}/{novel_title}'
50
+ novel_config_dir = novel_config_dir if novel_config_dir else \
51
+ f'{SCRAPER_BASE_CONFIG_DIR}/{novel_title}'
51
52
 
52
53
  logger.debug(f'Using base dir: {novel_base_dir}, config dir: {novel_config_dir}')
53
54
 
@@ -77,6 +78,16 @@ class FileManager:
77
78
  def save_chapter_html(self, filename: str, content: str):
78
79
  full_path = self.novel_chapters_dir / filename
79
80
  logger.debug(f'Saving chapter to {full_path}')
81
+ content = unicodedata.normalize('NFKC', content)
82
+ char_replacements = {
83
+ "â": "'", # Reemplazar â con apóstrofe
84
+ "\u2018": "'", # Comillda simple izquierda Unicode
85
+ "\u2019": "'", # Comilla simple derecha Unicode
86
+ "\u201C": '"', # Comilla doble izquierda Unicode
87
+ "\u201D": '"', # Comilla doble derecha Unicode
88
+ }
89
+ for old_char, new_char in char_replacements.items():
90
+ content = content.replace(old_char, new_char)
80
91
  _save_content_to_file(full_path, content)
81
92
 
82
93
  def load_chapter_html(self, filename: str):
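save_chapter_html now normalizes the chapter HTML to NFKC and flattens typographic quotes before writing, so saved files use plain ASCII quoting. A quick illustration of the effect on a made-up string:

    import unicodedata

    content = 'It\u2019s a \u201ctest\u201d'
    content = unicodedata.normalize('NFKC', content)
    for old_char, new_char in {'\u2019': "'", '\u201C': '"', '\u201D': '"'}.items():
        content = content.replace(old_char, new_char)
    print(content)   # It's a "test"
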
@@ -232,8 +243,7 @@ def _save_content_to_file(filepath: Path, content: str | dict, is_json: bool = F
232
243
  except (OSError, IOError) as e:
233
244
  logger.error(f'Error saving file "{filepath}": {e}')
234
245
  except Exception as e:
235
- logger.error(f'Unexpected error saving file "{
236
- filepath}": {e}', exc_info=True)
246
+ logger.error(f'Unexpected error saving file "{filepath}": {e}', exc_info=True)
237
247
 
238
248
 
239
249
  def _read_content_from_file(filepath: Path, bytes: bool = False) -> str:
@@ -252,8 +262,7 @@ def _read_content_from_file(filepath: Path, bytes: bool = False) -> str:
252
262
  logger.error(f'Error reading file "{filepath}": {e}')
253
263
  except Exception as e:
254
264
  # Log for unexpected errors
255
- logger.error(f'Unexpected error reading file "{
256
- filepath}": {e}', exc_info=True)
265
+ logger.error(f'Unexpected error reading file "{filepath}": {e}', exc_info=True)
257
266
 
258
267
 
259
268
  def _delete_file(filepath: Path) -> None:
@@ -269,8 +278,7 @@ def _delete_file(filepath: Path) -> None:
269
278
  logger.error(f'Error deleting file "{filepath}": {e}')
270
279
  except Exception as e:
271
280
  # Log any unexpected errors
272
- logger.error(f'Unexpected error deleting file "{
273
- filepath}": {e}', exc_info=True)
281
+ logger.error(f'Unexpected error deleting file "{filepath}": {e}', exc_info=True)
274
282
 
275
283
 
276
284
  def _copy_file(source: Path, destination: Path) -> bool:
@@ -39,9 +39,11 @@ class Metadata:
39
39
  """
40
40
  Dynamic string representation of the configuration.
41
41
  """
42
- attributes = [f"{field.name}={
43
- getattr(self, field.name)}" for field in fields(self)]
44
- return f"Metadata: \n{'\n'.join(attributes)}"
42
+ attributes = [(f"{field.name}="
43
+ f"{getattr(self, field.name)}") for field in fields(self)]
44
+ attributes_str = '\n'.join(attributes)
45
+ return (f"Metadata: \n"
46
+ f"{attributes_str}")
45
47
 
46
48
 
47
49
  @dataclass_json
@@ -70,9 +72,11 @@ class ScraperBehavior:
70
72
  """
71
73
  Dynamic string representation of the configuration.
72
74
  """
73
- attributes = [f"{field.name}={
74
- getattr(self, field.name)}" for field in fields(self)]
75
- return f"Scraper Behavior: \n{'\n'.join(attributes)}"
75
+ attributes = [(f"{field.name}="
76
+ f"{getattr(self, field.name)}") for field in fields(self)]
77
+ attributes_str = '\n'.join(attributes)
78
+ return (f"Scraper Behavior: \n"
79
+ f"{attributes_str}")
76
80
 
77
81
 
78
82
  @dataclass_json(undefined=Undefined.EXCLUDE)
@@ -169,7 +173,9 @@ class Novel:
169
173
  f"TOC Info: {toc_info}",
170
174
  f"Host: {self.host}"
171
175
  ]
172
- return f"Novel Info: \n{'\n'.join(attributes)}"
176
+ attributes_str = '\n'.join(attributes)
177
+ return (f"Novel Info: \n"
178
+ f"{attributes_str}")
173
179
 
174
180
  # NOVEL PARAMETERS MANAGEMENT
175
181
 
@@ -186,8 +192,7 @@ class Novel:
186
192
  self.metadata.tags.append(tag)
187
193
  self.save_novel()
188
194
  return True
189
- logger.warning(f'Tag "{tag}" already exists on novel {
190
- self.metadata.novel_title}')
195
+ logger.warning(f'Tag "{tag}" already exists on novel {self.metadata.novel_title}')
191
196
  return False
192
197
 
193
198
  def remove_tag(self, tag: str) -> bool:
@@ -195,8 +200,7 @@ class Novel:
195
200
  self.metadata.tags.remove(tag)
196
201
  self.save_novel()
197
202
  return True
198
- logger.warning(f'Tag "{tag}" doesn\'t exist on novel {
199
- self.metadata.novel_title}')
203
+ logger.warning(f'Tag "{tag}" doesn\'t exist on novel {self.metadata.novel_title}')
200
204
  return False
201
205
 
202
206
  def set_cover_image(self, cover_image_path: str) -> bool:
@@ -220,6 +224,7 @@ class Novel:
220
224
  self.decoder = Decoder(self.host)
221
225
  elif update_host:
222
226
  self.decoder = Decoder(utils.obtain_host(self.toc_main_url))
227
+ self.save_novel()
223
228
 
224
229
  def add_toc_html(self, html: str, host: str = None) -> None:
225
230
  if self.toc_main_url:
@@ -248,7 +253,7 @@ class Novel:
248
253
  toc_not_exists = not all_tocs_content and self.toc_main_url is None
249
254
  if toc_not_exists:
250
255
  logger.critical(
251
- 'There is no toc html and no toc url setted, unable to get toc.')
256
+ 'There is no toc html and no toc url set, unable to get toc.')
252
257
  return False
253
258
 
254
259
  reload_files = reload_files and self.toc_main_url is not None
@@ -259,18 +264,16 @@ class Novel:
259
264
  toc_content = self._add_toc(self.toc_main_url)
260
265
  all_tocs_content.append(toc_content)
261
266
  if self.decoder.has_pagination():
262
- next_page = self._get_next_page_from_toc_content(toc_content)
267
+ next_page = self.decoder.get_toc_next_page_url(toc_content)
263
268
  while next_page:
264
269
  toc_content = self._add_toc(next_page)
265
- next_page = self._get_next_page_from_toc_content(
266
- toc_content)
270
+ next_page = self.decoder.get_toc_next_page_url(toc_content)
267
271
  all_tocs_content.append(toc_content)
268
272
 
269
273
  # Now we get the links from the toc content
270
274
  self.chapters_url_list = []
271
275
  for toc_content in all_tocs_content:
272
- chapters_url_from_toc_content = self._get_chapter_urls_from_toc_content(
273
- toc_content)
276
+ chapters_url_from_toc_content = self.decoder.get_chapter_urls(toc_content)
274
277
  if chapters_url_from_toc_content is None:
275
278
  logger.error('Chapters url not found on toc_content')
276
279
  return False
@@ -299,43 +302,45 @@ class Novel:
299
302
  chapter_list = "Chapters List:\n"
300
303
  for i, chapter in enumerate(self.chapters):
301
304
  chapter_list += f"Chapter {i + 1}:\n"
302
- chapter_list += f" Title: {
303
- chapter.chapter_title if chapter.chapter_title else 'Title not yet scrapped'}\n"
305
+ chapter_list += f" Title: {chapter.chapter_title if chapter.chapter_title else 'Title not yet scrapped'}\n"
304
306
  chapter_list += f" URL: {chapter.chapter_url}\n"
305
- chapter_list += f" Filename: {
306
- chapter.chapter_html_filename if chapter.chapter_html_filename else 'File not yet requested'}\n"
307
+ chapter_list += f" Filename: {chapter.chapter_html_filename if chapter.chapter_html_filename else 'File not yet requested'}\n"
307
308
  return chapter_list
308
309
 
309
310
  def scrap_chapter(self, chapter_url: str = None, chapter_idx: int = None, update_html: bool = False) -> Chapter:
311
+ logger.info('Scraping Chapter...')
312
+ chapter = None
310
313
  if not utils.check_exclusive_params(chapter_url, chapter_idx):
311
- logger.error(
312
- 'chapter_url and chapter_id, only one needs to be setted')
313
- return
314
+ raise ValueError("chapter_url and chapter_id, only one needs to be set")
314
315
 
315
316
  if chapter_url is not None:
317
+ logger.debug(f'Using chapter url: {chapter_url}')
316
318
  chapter = self._get_chapter_by_url(chapter_url=chapter_url)
317
319
  if chapter is None:
320
+ logger.warning(f'Chapter with url "{chapter_url}" does not exist, generating one...')
318
321
  chapter = Chapter(chapter_url=chapter_url)
319
322
 
320
323
  if chapter_idx is not None:
324
+ logger.debug(f'Using chapter index: {chapter_idx}')
321
325
  if chapter_idx < 0 or chapter_idx >= len(self.chapters):
322
- logger.error(f'Could not find chapter with idx {chapter_idx}')
323
- return
324
- chapter = self.chapters[chapter_idx]
326
+ logger.critical(f'Could not find chapter with idx {chapter_idx}')
327
+ raise ValueError(f'Could not find chapter with idx {chapter_idx}')
325
328
 
329
+ chapter = self.chapters[chapter_idx]
330
+ if update_html:
331
+ logger.debug('HTML will be updated...')
326
332
  chapter = self._get_chapter(chapter,
327
333
  reload=update_html)
328
334
 
329
335
  if not chapter.chapter_html or not chapter.chapter_html_filename:
330
- logger.warning(f'Failed to create chapter on link: "{
331
- chapter_url}" on path "{chapter.chapter_html_filename}"')
332
- return
336
+ logger.critical(f'Failed to create chapter on link: "{chapter_url}" '
337
+ f'on path "{chapter.chapter_html_filename}"')
338
+ raise ValueError(f'Failed to create chapter on link: "{chapter_url}" '
339
+ f'on path "{chapter.chapter_html_filename}"')
333
340
 
334
- # We get the title and content, if there's no title, we autogenerate one.
341
+ # We get the chapter title and content
342
+ # We pass an index so we can autogenerate a Title
335
343
  chapter = self._decode_chapter(chapter=chapter, idx_for_chapter_name=chapter_idx)
336
- if not chapter.chapter_content:
337
- logger.error('Content not found')
338
- return
339
344
 
340
345
  logger.info(f'Chapter scrapped from link: {chapter_url}')
341
346
  return chapter
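scrap_chapter now raises on failure instead of returning None, so callers can rely on getting a Chapter back. A hedged sketch of the call pattern, assuming novel is an already-loaded Novel instance and the URL is a placeholder:

    # exactly one of chapter_url / chapter_idx may be set (chapter_idx is 0-based)
    chapter = novel.scrap_chapter(chapter_idx=0)
    print(chapter.chapter_title)

    # or scrape by URL, re-downloading the HTML first
    chapter = novel.scrap_chapter(chapter_url='https://example.com/chapter-1', update_html=True)
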
@@ -376,8 +381,7 @@ class Novel:
376
381
  chapter = self._get_chapter(
377
382
  chapter=chapter, reload=update_html)
378
383
  if not chapter.chapter_html_filename:
379
- logger.critical(f'Error requesting chapter {
380
- i} with url {chapter.chapter_url}')
384
+ logger.critical(f'Error requesting chapter {i} with url {chapter.chapter_url}')
381
385
  return False
382
386
 
383
387
  self._add_or_update_chapter_data(chapter=chapter, link_idx=i,
@@ -399,16 +403,15 @@ class Novel:
399
403
  self.sync_toc()
400
404
 
401
405
  if start_chapter > len(self.chapters):
402
- logger.info(f'The start chapter is bigger than the number of chapters saved ({
403
- len(self.chapters)})')
406
+ logger.info(f'The start chapter is bigger than the number of chapters saved ({len(self.chapters)})')
404
407
  return
405
408
 
406
409
  if not end_chapter:
407
410
  end_chapter = len(self.chapters)
408
411
  elif end_chapter > len(self.chapters):
409
412
  end_chapter = len(self.chapters)
410
- logger.info(f'The end chapter is bigger than the number of chapters, automatically setting it to {
411
- end_chapter}.')
413
+ logger.info(f'The end chapter is bigger than the number of chapters, '
414
+ f'automatically setting it to {end_chapter}.')
412
415
 
413
416
  idx = 1
414
417
  start = start_chapter
@@ -418,8 +421,8 @@ class Novel:
418
421
  end_chapter=end,
419
422
  collection_idx=idx)
420
423
  if not result:
421
- logger.critical(f'Error with saving novel to epub, with start chapter: {
422
- start_chapter} and end chapter: {end_chapter}')
424
+ logger.critical(f'Error with saving novel to epub, with start chapter: '
425
+ f'{start_chapter} and end chapter: {end_chapter}')
423
426
  return False
424
427
  start = start + chapters_by_book
425
428
  idx = idx + 1
@@ -506,22 +509,6 @@ class Novel:
506
509
  self.file_manager.add_toc(content)
507
510
  return content
508
511
 
509
- def _get_chapter_urls_from_toc_content(self, toc_content: str) -> list[str]:
510
- toc_elements = self.decoder.decode_html(toc_content, 'index')
511
- try:
512
- toc_urls = [toc_element['href'] for toc_element in toc_elements]
513
- except KeyError as e:
514
- logger.error(f'{e} not found on the Tag elements decoded from TOC')
515
- return
516
- if toc_urls:
517
- return toc_urls
518
- logger.warning('No chapter links found on toc content')
519
-
520
- def _get_next_page_from_toc_content(self, toc_content: str) -> str:
521
- next_page = self.decoder.decode_html(toc_content, 'next_page')
522
- if next_page:
523
- return next_page[0]['href']
524
-
525
512
  def _add_or_update_chapter_data(self, chapter: Chapter, link_idx: int = None, save_in_file: bool = True) -> None:
526
513
  if link_idx:
527
514
  chapter_idx = link_idx
@@ -579,35 +566,28 @@ class Novel:
579
566
  self.save_novel()
580
567
 
581
568
  def _decode_chapter(self, chapter: Chapter, idx_for_chapter_name: str = None) -> Chapter:
582
- chapter_title = None
583
-
569
+ logger.debug('Decoding chapter...')
584
570
  if chapter.chapter_html is None:
571
+ logger.debug(f'No HTML content found, requesting HTML content...')
585
572
  chapter = self._get_chapter(chapter)
586
573
 
587
574
  if not chapter.chapter_html:
588
- logger.error(f'No chapter content found for chapter link {
589
- chapter.chapter_url} on file {chapter.chapter_html_filename}')
590
- return None
591
-
592
- paragraphs = self.decoder.decode_html(chapter.chapter_html, 'content')
593
-
594
- if not paragraphs:
595
- if chapter:
596
- logger.warning(f'No paragraphs found in chapter link {
597
- chapter.chapter_url} on file {chapter.chapter_html_filename}')
575
+ raise ValueError(f'Chapter HTML could not be obtained for chapter link "{chapter.chapter_url}" '
576
+ f'on file "{chapter.chapter_html_filename}"')
598
577
 
599
- chapter_title = self.decoder.decode_html(chapter.chapter_html, 'title')
578
+ logger.debug('Obtaining chapter title...')
579
+ chapter_title = self.decoder.get_chapter_title(chapter.chapter_html)
600
580
  if not chapter_title:
601
- chapter_title = f'{self.metadata.novel_title} Chapter {
602
- idx_for_chapter_name}'
581
+ logger.debug('No chapter title found, generating one...')
582
+ chapter_title = f'{self.metadata.novel_title} Chapter {idx_for_chapter_name}'
603
583
  chapter.chapter_title = str(chapter_title)
584
+ logger.debug(f'Chapter title: "{chapter_title}"')
604
585
 
605
- chapter.chapter_content = ""
606
- if self.scraper_behavior.save_title_to_content:
607
- chapter.chapter_content += f'<h4>{chapter_title}</h4>'
608
- logger.info(f'{len(paragraphs)} paragraphs found in chapter')
609
- for paragraph in paragraphs:
610
- chapter.chapter_content += str(paragraph)
586
+ logger.debug('Obtaining chapter content...')
587
+ chapter.chapter_content = self.decoder.get_chapter_content(chapter.chapter_html,
588
+ self.scraper_behavior.save_title_to_content,
589
+ chapter.chapter_title)
590
+ logger.debug('Chapter successfully decoded')
611
591
 
612
592
  return chapter
613
593
 
@@ -631,7 +611,7 @@ class Novel:
631
611
  if self.metadata.start_date:
632
612
  date_metadata += self.metadata.start_date
633
613
  # Calibre specification doesn't use end_date.
634
- # For now we use a custom metadata
614
+ # For now, we use a custom metadata
635
615
  # https://idpf.org/epub/31/spec/epub-packages.html#sec-opf-dcdate
636
616
  # if self.metadata.end_date:
637
617
  # date_metadata += f'/{self.metadata.end_date}'
@@ -699,8 +679,7 @@ class Novel:
699
679
  idx_start = start_chapter - 1
700
680
  idx_end = end_chapter
701
681
  # We create the epub book
702
- book_title = f'{self.metadata.novel_title} Chapters {
703
- start_chapter} - {end_chapter}'
682
+ book_title = f'{self.metadata.novel_title} Chapters {start_chapter} - {end_chapter}'
704
683
  calibre_collection = None
705
684
  # If collection_idx is set, we create a calibre collection
706
685
  if collection_idx:
@@ -712,8 +691,7 @@ class Novel:
712
691
  book = self._add_chapter_to_epub_book(chapter=chapter,
713
692
  book=book)
714
693
  if book is None:
715
- logger.critical(f'Error saving epub {book_title}, could not decode chapter {
716
- chapter} using host {self.host}')
694
+ logger.critical(f'Error saving epub {book_title}, could not decode chapter {chapter} using host {self.host}')
717
695
  return False
718
696
 
719
697
  book.add_item(epub.EpubNcx())
@@ -1 +1 @@
1
- __version__ = "1.0.3"
1
+ __version__ = "1.1.0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: web-novel-scraper
3
- Version: 1.0.3
3
+ Version: 1.1.0
4
4
  Summary: Python tool that allows you to scrape web novels from various sources and save them to more readable formats like EPUB.
5
5
  Project-URL: Homepage, https://github.com/ImagineBrkr/web-novel-scraper
6
6
  Project-URL: Documentation, https://web-novel-scraper.readthedocs.io
@@ -0,0 +1,18 @@
1
+ web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ web_novel_scraper/__main__.py,sha256=OQQVX5CttmAkUwdrnjBSjKPaoh_boUI2ysHi3rLGOSs,17769
3
+ web_novel_scraper/decode.py,sha256=QxPjoYI1t4bf0zAf_7uLRrpsboi8DwsD1BNZUiHO4gc,10150
4
+ web_novel_scraper/file_manager.py,sha256=qAqgqtaRb7QyVtyEOW2cMhPYWdKM6nJ69weUCYKwVtM,11862
5
+ web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
6
+ web_novel_scraper/novel_scraper.py,sha256=hXIIPelRfx-jfD9VSPheg6z04I4JKxQj7wVBPlpP1go,28452
7
+ web_novel_scraper/request_manager.py,sha256=VtGpRi5b_Dp3h8viCdt7yMCb9M21Lk7oLC_Q_0EkXH8,6448
8
+ web_novel_scraper/utils.py,sha256=vq5ROuPv04k3MhbksTe0ni_yP0i_a7T_33mkBB1DUbQ,2076
9
+ web_novel_scraper/version.py,sha256=LGVQyDsWifdACo7qztwb8RWWHds1E7uQ-ZqD8SAjyw4,22
10
+ web_novel_scraper/custom_processor/__init__.py,sha256=iy4tjivMjshSzc52--aa-jK53qu9VwdK-6p4vuQc6oc,103
11
+ web_novel_scraper/custom_processor/custom_processor.py,sha256=h1MPl6JU_C2Mc7SqK70LsNQHpDzSL6QyraMIQ87HcMM,870
12
+ web_novel_scraper/custom_processor/sites/genesis.py,sha256=xV0eybI0ieHR5gn4yWXI74l99Eayhqs16PIYs-BrPjE,1843
13
+ web_novel_scraper/custom_processor/sites/royalroad.py,sha256=_2PsFC_w3RJCUkAPoRn-7R2jlzl3XsG4WYtRaQkp6lg,787
14
+ web_novel_scraper/decode_guide/decode_guide.json,sha256=DbcfnyRNOVXZd6ar1HDCHxkKgnmR3ziJ-B4GOFcDMEs,7584
15
+ web_novel_scraper-1.1.0.dist-info/METADATA,sha256=Llcez3yLJTICPNMAoO1aZShywK2soma1kmjl2OA3tYA,8423
16
+ web_novel_scraper-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
17
+ web_novel_scraper-1.1.0.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
18
+ web_novel_scraper-1.1.0.dist-info/RECORD,,
@@ -1,14 +0,0 @@
1
- web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- web_novel_scraper/__main__.py,sha256=PBIAG1vshnRdvYwyeD-RxlHS5mNEby-d4puV9kEyfpA,17615
3
- web_novel_scraper/decode.py,sha256=dqGv_8nFSKwO6GBj3jhaO9SQeLHeBjDzoV1C_YcN40k,5085
4
- web_novel_scraper/file_manager.py,sha256=PJu8kKeng49DTNQBbbMekFtIcTZOkeCEjFYqYJarv9M,11363
5
- web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
6
- web_novel_scraper/novel_scraper.py,sha256=eiic2i3AdK9lcFK9aNb4d8ptnKv9ua1B_9kcUY8_liM,28660
7
- web_novel_scraper/request_manager.py,sha256=VtGpRi5b_Dp3h8viCdt7yMCb9M21Lk7oLC_Q_0EkXH8,6448
8
- web_novel_scraper/utils.py,sha256=vq5ROuPv04k3MhbksTe0ni_yP0i_a7T_33mkBB1DUbQ,2076
9
- web_novel_scraper/version.py,sha256=2plzdEEb24FLjE2I2XyBBcJEPYWHccNL4SgtLC_6erg,22
10
- web_novel_scraper/decode_guide/decode_guide.json,sha256=Q4v-OZh_1MwdrFxDDVvj8T3evW3zzbSapRaGwFCdnX8,5425
11
- web_novel_scraper-1.0.3.dist-info/METADATA,sha256=VKG91J-QhL_NBjSuS29Em5_ZcFlw9oKf50-7WcJ97Lw,8423
12
- web_novel_scraper-1.0.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
13
- web_novel_scraper-1.0.3.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
14
- web_novel_scraper-1.0.3.dist-info/RECORD,,