web-novel-scraper 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- web_novel_scraper/__main__.py +18 -12
- web_novel_scraper/custom_processor/__init__.py +2 -0
- web_novel_scraper/custom_processor/custom_processor.py +25 -0
- web_novel_scraper/custom_processor/sites/genesis.py +46 -0
- web_novel_scraper/custom_processor/sites/royalroad.py +22 -0
- web_novel_scraper/decode.py +126 -13
- web_novel_scraper/decode_guide/decode_guide.json +100 -14
- web_novel_scraper/file_manager.py +11 -0
- web_novel_scraper/novel_scraper.py +37 -57
- web_novel_scraper/request_manager.py +0 -1
- web_novel_scraper/version.py +1 -1
- {web_novel_scraper-1.0.2.dist-info → web_novel_scraper-1.0.4.dist-info}/METADATA +8 -8
- web_novel_scraper-1.0.4.dist-info/RECORD +18 -0
- web_novel_scraper-1.0.2.dist-info/RECORD +0 -14
- {web_novel_scraper-1.0.2.dist-info → web_novel_scraper-1.0.4.dist-info}/WHEEL +0 -0
- {web_novel_scraper-1.0.2.dist-info → web_novel_scraper-1.0.4.dist-info}/entry_points.txt +0 -0
web_novel_scraper/__main__.py
CHANGED
@@ -52,7 +52,7 @@ def validate_date(ctx, param, value):
 
 # COMMON ARGUMENTS
 title_option = click.option(
-    '-t', '--title', type=str, required=True, help='Title of the novel, this server as the identifier.')
+    '-t', '--title', type=str, required=True, envvar='SCRAPER_NOVEL_TITLE', help='Title of the novel, this server as the identifier.')
 novel_base_dir_option = click.option(
     '-nb', '--novel-base-dir', type=str, help='Alternative base directory for the novel files.')
 
@@ -330,19 +330,25 @@ def show_toc(title, novel_base_dir):
 @click.option('--update-html', is_flag=True, default=False, show_default=True, help='If the chapter HTML is saved, it will be updated.')
 def scrap_chapter(title, novel_base_dir, chapter_url, chapter_num, update_html):
     """Scrap a chapter of a novel."""
+    if (chapter_url is None and chapter_num is None) or (chapter_url and chapter_num):
+        raise click.UsageError("You must set exactly one: --chapter-url o --chapter-num.")
+
     novel = obtain_novel(title, novel_base_dir)
-
-
-
-
-
-
-
-
-
+
+    if chapter_num is not None:
+        if chapter_num <= 0 or chapter_num > len(novel.chapters):
+            raise click.BadParameter(
+                'Chapter number should be positive and an existing chapter.', param_hint='--chapter-num')
+        chapter = novel.scrap_chapter(chapter_idx=chapter_num - 1,
+                                      update_html=update_html)
+
+    else:
+        chapter = novel.scrap_chapter(chapter_url=chapter_url,
+                                      update_html=update_html)
+
     if not chapter:
-        click.
-
+        raise click.ClickException('Chapter not found or scrap failed.')
+
     click.echo(chapter)
     click.echo('Content:')
     click.echo(chapter.chapter_content)
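The new validation makes `--chapter-url` and `--chapter-num` mutually exclusive and requires exactly one of them. As a minimal sketch of the same pattern in isolation (the `demo` command and option help texts are illustrative, not the package's actual CLI):

```python
import click


@click.command()
@click.option('--chapter-url', type=str, help='URL of the chapter to scrape.')
@click.option('--chapter-num', type=int, help='1-based chapter number.')
def demo(chapter_url, chapter_num):
    # Exactly one of the two options must be provided, mirroring the check
    # added to scrap_chapter in this release.
    if (chapter_url is None and chapter_num is None) or (chapter_url and chapter_num):
        raise click.UsageError('You must set exactly one: --chapter-url or --chapter-num.')
    click.echo(f'url={chapter_url} num={chapter_num}')


if __name__ == '__main__':
    demo()
```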
web_novel_scraper/custom_processor/custom_processor.py
ADDED
@@ -0,0 +1,25 @@
+from abc import ABC, abstractmethod
+from typing import Any, Dict
+
+class CustomProcessor(ABC):
+    @abstractmethod
+    def process(self, html: str) -> Any:
+        """Process the HTML content using custom logic"""
+        pass
+
+class ProcessorRegistry:
+    _processors: Dict[str, Dict[str, CustomProcessor]] = {}
+
+    @classmethod
+    def register(cls, host: str, content_type: str, processor: CustomProcessor):
+        if host not in cls._processors:
+            cls._processors[host] = {}
+        cls._processors[host][content_type] = processor
+
+    @classmethod
+    def get_processor(cls, host: str, content_type: str) -> CustomProcessor:
+        return cls._processors.get(host, {}).get(content_type)
+
+    @classmethod
+    def has_processor(cls, host: str, content_type: str) -> bool:
+        return bool(cls.get_processor(host, content_type))
web_novel_scraper/custom_processor/sites/genesis.py
ADDED
@@ -0,0 +1,46 @@
+import re
+import json
+from typing import List, Optional
+from ..custom_processor import CustomProcessor, ProcessorRegistry
+
+GENESIS_STUDIO_VIEWER_URL = 'https://genesistudio.com/viewer'
+
+class GenesisChaptersProcessor(CustomProcessor):
+    def process(self, html: str) -> Optional[List[dict]]:
+        pattern = r',chapters:\s*{\s*free:\s*(\[.*?"}}])'
+        match = re.search(pattern, html, re.DOTALL)
+
+        if not match:
+            if not match:
+                return None
+
+        try:
+            chapters_json = match.group(1).strip()
+            replaces = {
+                "chapter_title:": '"chapter_tile":',
+                "id:": '"id":',
+                "nsfw:": '"nsfw":',
+                "required_tier:": '"required_tier":',
+                "date_created:": '"date_created":',
+                "spoiler_title:": '"spoiler_title":',
+                "chapter_number:": '"chapter_number":',
+                "novel:": '"novel":',
+            }
+            # Ensure the JSON string ends properly
+            if not chapters_json.endswith(']'):
+                chapters_json += ']'
+            for old_key, new_key in replaces.items():
+                chapters_json = chapters_json.replace(old_key, new_key)
+            # print(f"Extracted JSON: {chapters_json[12200:12300]}") # Debug print
+            chapters = json.loads(chapters_json)
+            chapters_url = []
+            for chapter in chapters:
+                chapters_url.append(f"{GENESIS_STUDIO_VIEWER_URL}/{chapter['id']}")
+            print(chapters)
+            return chapters_url
+
+        except (json.JSONDecodeError, IndexError) as e:
+            print(f"Error processing JSON: {str(e)}")
+            return None
+
+ProcessorRegistry.register('genesistudio.com', 'index', GenesisChaptersProcessor())
web_novel_scraper/custom_processor/sites/royalroad.py
ADDED
@@ -0,0 +1,22 @@
+import re
+import json
+from typing import List, Optional
+from ..custom_processor import CustomProcessor, ProcessorRegistry
+
+class RoyalRoadChaptersProcessor(CustomProcessor):
+    def process(self, html: str) -> Optional[List[dict]]:
+        pattern = r'window\.chapters\s*=\s*(\[.*?\]);'
+        match = re.search(pattern, html, re.DOTALL)
+
+        if not match:
+            return None
+
+        try:
+            chapters_json = match.group(1)
+            chapters = json.loads(chapters_json)
+            chapters = [chapter['url'] for chapter in chapters if 'url' in chapter]
+            return chapters
+        except (json.JSONDecodeError, IndexError):
+            return None
+
+ProcessorRegistry.register('www.royalroad.com', 'index', RoyalRoadChaptersProcessor())
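To see what this processor expects, here is a small self-contained check (illustrative only, not part of the package); the sample HTML mimics the `window.chapters = [...]` blob the regex above looks for:

```python
import json
import re

# Minimal stand-in for a Royal Road fiction page containing the chapter blob.
sample_html = """
<script>
    window.chapters = [{"id": 1, "url": "/fiction/1/chapter/1"},
                       {"id": 2, "url": "/fiction/1/chapter/2"}];
</script>
"""

# Same pattern and extraction steps as RoyalRoadChaptersProcessor.process above.
match = re.search(r'window\.chapters\s*=\s*(\[.*?\]);', sample_html, re.DOTALL)
chapter_urls = [c['url'] for c in json.loads(match.group(1)) if 'url' in c]
print(chapter_urls)  # ['/fiction/1/chapter/1', '/fiction/1/chapter/2']
```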
web_novel_scraper/decode.py
CHANGED
@@ -1,8 +1,10 @@
 import os
 import json
 from pathlib import Path
+from typing import Optional
 
 from . import logger_manager
+from .custom_processor.custom_processor import ProcessorRegistry
 
 from bs4 import BeautifulSoup
 
@@ -41,17 +43,92 @@ class Decoder:
         self.decode_guide = self._get_element_by_key(
             DECODE_GUIDE, 'host', host)
 
-    def
+    def get_chapter_urls(self, html: str) -> list[str]:
+        logger.debug('Obtaining chapter URLs...')
+        chapter_urls = self.decode_html(html, 'index')
+
+        if chapter_urls is None:
+            logger.critical(f"Failed to obtain chapter URLs for {self.host}")
+            raise ValueError(f"Failed to obtain chapter URLs for {self.host}")
+
+        if isinstance(chapter_urls, str):
+            logger.warning('When obtaining chapter urls, obtained a String but expected a List')
+            logger.warning('Check decode config')
+            chapter_urls = [chapter_urls]
+
+        return chapter_urls
+
+    def get_toc_next_page_url(self, html: str) -> Optional[str]:
+        logger.debug('Obtaining toc next page URL...')
+        toc_next_page_url = self.decode_html(html, 'next_page')
+        if toc_next_page_url is None:
+            logger.debug('No next page URL found, assuming last page...')
+            return None
+        return toc_next_page_url
+
+    def get_chapter_title(self, html: str) -> Optional[str]:
+        logger.debug('Obtaining chapter title...')
+        chapter_title = self.decode_html(html, 'title')
+        if chapter_title is None:
+            logger.debug(f'No chapter_title found.')
+        return chapter_title
+
+    def get_chapter_content(self, html: str, save_title_to_content: bool, chapter_title: str) -> str:
+        logger.debug('Obtaining chapter content...')
+        full_chapter_content = ""
+        chapter_content = self.decode_html(html, 'content')
+
+        if chapter_content is None:
+            logger.critical('No content found on chapter')
+            raise ValueError('No content found on chapter')
+
+        if save_title_to_content:
+            logger.debug('Saving chapter title to content...')
+            full_chapter_content += f'<h4>{chapter_title}</h4>'
+
+        if isinstance(chapter_content, list):
+            logger.debug(f'{len(chapter_content)} paragraphs found in chapter')
+            logger.debug('Converting list of paragraphs to a single string')
+            for paragraph in chapter_content:
+                full_chapter_content += str(paragraph)
+        else:
+            logger.debug('Chapter content is not a list, no conversion made')
+            full_chapter_content += str(chapter_content)
+        return full_chapter_content
+
+    def decode_html(self, html: str, content_type: str) -> str | list[str] | None:
+        logger.debug(f'Decoding HTML...')
+        logger.debug(f'Content type: {content_type}')
+        logger.debug(f'Decode guide: {DECODE_GUIDE_FILE}')
+        logger.debug(f'Host: {self.host}')
         if not content_type in self.decode_guide:
-            logger.
-
-
-
+            logger.critical(f'{content_type} key does not exists on decode guide {DECODE_GUIDE_FILE}'
+                            f'for host {self.host}')
+            raise ValueError(f'{content_type} key does not exists on decode guide {DECODE_GUIDE_FILE}'
+                             f'for host {self.host}')
+
+        if ProcessorRegistry.has_processor(self.host, content_type):
+            logger.debug(f'Host {self.host} will use a custom processor')
+            processor = ProcessorRegistry.get_processor(self.host, content_type)
+            return processor.process(html)
+
+        logger.debug('Starting HTML parsing...')
+        try:
+            soup = BeautifulSoup(html, 'html.parser')
+        except Exception as e:
+            logger.critical(f'Error parsing HTML with BeautifulSoup: {e}')
+            raise ValueError(f'Error parsing HTML with BeautifulSoup: {e}')
+
         decoder = self.decode_guide[content_type]
         elements = self._find_elements(soup, decoder)
         if not elements:
-            logger.warning(f'{content_type} not found on html using {
-
+            logger.warning(f'{content_type} not found on html using {DECODE_GUIDE_FILE} '
+                           f'for host {self.host}')
+
+        # Investigate this conditional
+        if content_type == 'title' and isinstance(elements, list):
+            logger.debug('Joining titles...')
+            return ' '.join(elements)
         return elements
 
     def has_pagination(self, host: str = None):
@@ -81,8 +158,11 @@ class Decoder:
 
         return "\n".join([line.strip() for line in str(soup).splitlines() if line.strip()])
 
-
+    @staticmethod
+    def _find_elements(soup: BeautifulSoup, decoder: dict):
+        logger.debug('Finding elements...')
         selector = decoder.get('selector')
+        elements = []
         if selector is None:
             selector = ''
         element = decoder.get('element')
@@ -91,32 +171,46 @@ class Decoder:
         attributes = decoder.get('attributes')
 
         if element:
+            logger.debug(f'Using element "{element}"')
             selector += element
         if _id:
+            logger.debug(f'Using id "{_id}"')
             selector += f'#{_id}'
         if _class:
+            logger.debug(f'Using class "{_class}"')
             selector += f'.{_class}'
         if attributes:
            for attr, value in attributes.items():
-
+                logger.debug(f'Using attribute "{attr}"')
+                if value is not None:
+                    logger.debug(f'With value "{value}"')
+                    selector += f'[{attr}="{value}"]'
+                else:
+                    selector += f'[{attr}]'
            selectors = [selector]
         else:
+            logger.debug(f'Using selector "{selector}"')
            if XOR_SEPARATOR in selector:
+                logger.debug(f'Found XOR_OPERATOR "{XOR_SEPARATOR}" in selector')
+                logger.debug('Splitting selectors...')
                selectors = selector.split(XOR_SEPARATOR)
            else:
                selectors = [selector]
 
         for selector in selectors:
-            logger.debug(f'
+            logger.debug(f'Searching using selector "{selector}"...')
             elements = soup.select(selector)
             if elements:
-                logger.debug(f'{len(elements)} found using selector {selector}')
+                logger.debug(f'{len(elements)} found using selector "{selector}"')
                 break
+            logger.debug(f'No elements found using selector "{selector}"')
 
         extract = decoder.get('extract')
         if extract:
+            logger.debug(f'Extracting from elements...')
             if extract["type"] == "attr":
                 attr_key = extract["key"]
+                logger.debug(f'Extracting value from attribute "{attr_key}"...')
                 elements_aux = elements
                 elements = []
                 for element in elements_aux:
@@ -125,15 +219,34 @@ class Decoder:
                     if attr:
                         elements.append(attr)
                 except KeyError:
+                    logger.debug(f'Attribute "{attr_key}" not found')
+                    logger.debug('Ignoring...')
                     pass
+            logger.debug(f'{len(elements)} elements found using attribute "{attr_key}"')
            if extract["type"] == "text":
+                logger.debug('Extracting text from elements...')
                elements = [element.string for element in elements]
+
+        if not elements:
+            logger.error('No elements found, returning "None"')
+            return None
+
         inverted = decoder.get('inverted')
         if inverted:
+            logger.debug('Inverted option activate')
+            logger.debug('Inverting elements order...')
             elements = elements[::-1]
-        return elements if decoder.get('array') else elements[0] if elements else None
 
-
+        if decoder.get('array'):
+            logger.debug('Array option activated')
+            logger.debug('Returning elements a list')
+            return elements
+        logger.debug('Array option not activated')
+        logger.debug('Returning only first element...')
+        return elements[0]
+
+    @staticmethod
+    def _get_element_by_key(json_data, key, value):
        for item in json_data:
            if item[key] == value:
                return item
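Taken together, the new `get_chapter_urls`, `get_toc_next_page_url`, `get_chapter_title` and `get_chapter_content` helpers let callers avoid direct `decode_html` calls. A rough usage sketch, assuming a decode-guide entry exists for the host and that the HTML files were fetched earlier; the host string, file names and variable names are illustrative:

```python
from web_novel_scraper.decode import Decoder

# Hypothetical: build a decoder for a host that has an entry in decode_guide.json.
decoder = Decoder('www.royalroad.com')

toc_html = open('toc.html', encoding='utf-8').read()  # a previously saved TOC page

# Chapter links for this TOC page; raises ValueError if nothing can be decoded.
chapter_urls = decoder.get_chapter_urls(toc_html)

# Pagination: returns None once the last TOC page is reached.
if decoder.has_pagination():
    print(decoder.get_toc_next_page_url(toc_html))

chapter_html = open('chapter_1.html', encoding='utf-8').read()
title = decoder.get_chapter_title(chapter_html)
content = decoder.get_chapter_content(chapter_html,
                                       save_title_to_content=True,
                                       chapter_title=title or 'Chapter 1')
```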
web_novel_scraper/decode_guide/decode_guide.json
CHANGED
@@ -24,7 +24,11 @@
       "class": null,
       "selector": null,
       "attributes": null,
-      "array": true
+      "array": true,
+      "extract": {
+        "type": "attr",
+        "key": "href"
+      }
     },
     "next_page": {
       "element": "p",
@@ -60,7 +64,11 @@
       "class": null,
       "selector": "div.m-newest2 ul li a",
       "attributes": null,
-      "array": true
+      "array": true,
+      "extract": {
+        "type": "attr",
+        "key": "href"
+      }
     },
     "next_page": {
       "element": null,
@@ -72,7 +80,7 @@
     }
   },
   {
-    "host": "royalroad.com",
+    "host": "www.royalroad.com",
     "has_pagination": false,
     "title": {
       "element": null,
@@ -95,12 +103,7 @@
       "array": true
     },
     "index": {
-      "
-      "id": null,
-      "class": null,
-      "selector": "tr.chapter-row td a",
-      "attributes": null,
-      "array": true
+      "use_custom_processor": true
     },
     "next_page": {
       "element": null,
@@ -140,7 +143,11 @@
       "class": null,
       "selector": "ul.list-chapter li a",
       "attributes": null,
-      "array": true
+      "array": true,
+      "extract": {
+        "type": "attr",
+        "key": "href"
+      }
     },
     "next_page": {
       "element": null,
@@ -167,10 +174,10 @@
     }
   },
   {
-      "element":
+      "element": null,
       "id": null,
       "class": null,
-      "selector":
+      "selector": "div#chr-content p",
       "attributes": null,
       "array": true
     },
@@ -180,7 +187,11 @@
       "class": null,
       "selector": "ul.list-chapter li a",
       "attributes": null,
-      "array": true
+      "array": true,
+      "extract": {
+        "type": "attr",
+        "key": "href"
+      }
     },
     "next_page": {
       "element": null,
@@ -207,7 +218,82 @@
     "index": {
       "element": "ul.main li a",
       "array": true,
-      "inverted": true
+      "inverted": true,
+      "extract": {
+        "type": "attr",
+        "key": "href"
+      }
+    }
+  },
+  {
+    "host": "genesistudio.com",
+    "has_pagination": false,
+    "title": {
+      "element": null,
+      "id": null,
+      "class": null,
+      "selector": "p.leading-none span",
+      "attributes": null,
+      "array": true,
+      "extract": {
+        "type": "text",
+        "key": "text"
+      }
+    },
+    "content": {
+      "element": "p",
+      "id": null,
+      "class": "narration",
+      "selector": null,
+      "attributes": null,
+      "array": true,
+      "extract": {
+        "type": "text",
+        "key": "text"
+      }
+    },
+    "index": {
+      "use_custom_processor": true
+    },
+    "next_page": {
+      "element": null,
+      "id": null,
+      "class": null,
+      "selector": null,
+      "attributes": null,
+      "array": true
+    }
+  },
+  {
+    "host": "hostednovel.com",
+    "has_pagination": true,
+    "title": {
+      "selector": "span#chapter-title",
+      "extract": {
+        "type": "text"
+      }
+    },
+    "content": {
+      "element": "div",
+      "id": "chapter-content",
+      "array": true
+    },
+    "index": {
+      "selector": "li ul li.flow-root a",
+      "array": true,
+      "inverted": false,
+      "extract": {
+        "type": "attr",
+        "key": "href"
+      }
+    },
+    "next_page": {
+      "selector": "a:has(span:contains('Next'))",
+      "array": false,
+      "extract": {
+        "type": "attr",
+        "key": "href"
+      }
     }
   }
 ]
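The recurring change in this file is the new `extract` block on index entries, which tells the decoder to return each matched element's `href` attribute instead of the tag itself. Roughly what that amounts to inside `_find_elements`, shown standalone with BeautifulSoup (the sample HTML and selector are illustrative):

```python
from bs4 import BeautifulSoup

sample_toc = """
<ul class="list-chapter">
  <li><a href="/novel/example/chapter-1">Chapter 1</a></li>
  <li><a href="/novel/example/chapter-2">Chapter 2</a></li>
</ul>
"""

decoder_entry = {              # shape of a decode_guide.json "index" entry
    "selector": "ul.list-chapter li a",
    "array": True,
    "extract": {"type": "attr", "key": "href"},
}

soup = BeautifulSoup(sample_toc, 'html.parser')
elements = soup.select(decoder_entry["selector"])
# With "extract": {"type": "attr", "key": "href"}, only the attribute survives.
hrefs = [el[decoder_entry["extract"]["key"]] for el in elements]
print(hrefs)  # ['/novel/example/chapter-1', '/novel/example/chapter-2']
```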
web_novel_scraper/file_manager.py
CHANGED
@@ -7,6 +7,7 @@ from pathlib import Path
 import shutil
 from dotenv import load_dotenv
 from ebooklib import epub
+import unicodedata
 
 from . import logger_manager
 
@@ -77,6 +78,16 @@ class FileManager:
     def save_chapter_html(self, filename: str, content: str):
         full_path = self.novel_chapters_dir / filename
         logger.debug(f'Saving chapter to {full_path}')
+        content = unicodedata.normalize('NFKC', content)
+        char_replacements = {
+            "â": "'",  # Reemplazar â con apóstrofe
+            "\u2018": "'",  # Comillda simple izquierda Unicode
+            "\u2019": "'",  # Comilla simple derecha Unicode
+            "\u201C": '"',  # Comilla doble izquierda Unicode
+            "\u201D": '"',  # Comilla doble derecha Unicode
+        }
+        for old_char, new_char in char_replacements.items():
+            content = content.replace(old_char, new_char)
         _save_content_to_file(full_path, content)
 
     def load_chapter_html(self, filename: str):
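For context, the new `save_chapter_html` step normalizes the chapter text and swaps curly quotes for plain ASCII ones before writing the file. A standalone illustration of the same two steps (the sample string is made up):

```python
import unicodedata

raw = "It\u2019s the \u201Cfirst\u201D chapter"  # sample text with curly quotes

# Step 1: Unicode normalization, as in FileManager.save_chapter_html.
text = unicodedata.normalize('NFKC', raw)

# Step 2: replace smart quotes with plain ASCII equivalents.
for old_char, new_char in {"\u2018": "'", "\u2019": "'", "\u201C": '"', "\u201D": '"'}.items():
    text = text.replace(old_char, new_char)

print(text)  # It's the "first" chapter
```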
web_novel_scraper/novel_scraper.py
CHANGED
@@ -220,6 +220,7 @@ class Novel:
             self.decoder = Decoder(self.host)
         elif update_host:
             self.decoder = Decoder(utils.obtain_host(self.toc_main_url))
+            self.save_novel()
 
     def add_toc_html(self, html: str, host: str = None) -> None:
         if self.toc_main_url:
@@ -248,7 +249,7 @@ class Novel:
         toc_not_exists = not all_tocs_content and self.toc_main_url is None
         if toc_not_exists:
             logger.critical(
-                'There is no toc html and no toc url
+                'There is no toc html and no toc url set, unable to get toc.')
             return False
 
         reload_files = reload_files and self.toc_main_url is not None
@@ -259,18 +260,16 @@ class Novel:
             toc_content = self._add_toc(self.toc_main_url)
             all_tocs_content.append(toc_content)
             if self.decoder.has_pagination():
-                next_page = self.
+                next_page = self.decoder.get_toc_next_page_url(toc_content)
                 while next_page:
                     toc_content = self._add_toc(next_page)
-                    next_page = self.
-                        toc_content)
+                    next_page = self.decoder.get_toc_next_page_url(toc_content)
                     all_tocs_content.append(toc_content)
 
         # Now we get the links from the toc content
         self.chapters_url_list = []
         for toc_content in all_tocs_content:
-            chapters_url_from_toc_content = self.
-                toc_content)
+            chapters_url_from_toc_content = self.decoder.get_chapter_urls(toc_content)
             if chapters_url_from_toc_content is None:
                 logger.error('Chapters url not found on toc_content')
                 return False
@@ -307,35 +306,39 @@ class Novel:
         return chapter_list
 
     def scrap_chapter(self, chapter_url: str = None, chapter_idx: int = None, update_html: bool = False) -> Chapter:
+        logger.info('Scraping Chapter...')
+        chapter = None
         if not utils.check_exclusive_params(chapter_url, chapter_idx):
-
-                'chapter_url and chapter_id, only one needs to be setted')
-            return
+            raise ValueError("chapter_url and chapter_id, only one needs to be set")
 
         if chapter_url is not None:
+            logger.debug(f'Using chapter url: {chapter_url}')
             chapter = self._get_chapter_by_url(chapter_url=chapter_url)
             if chapter is None:
+                logger.warning(f'Chapter with url "{chapter_url}" does not exist, generating one...')
                 chapter = Chapter(chapter_url=chapter_url)
 
         if chapter_idx is not None:
+            logger.debug(f'Using chapter index: {chapter_idx}')
             if chapter_idx < 0 or chapter_idx >= len(self.chapters):
-                logger.
-
-            chapter = self.chapters[chapter_idx]
+                logger.critical(f'Could not find chapter with idx {chapter_idx}')
+                raise ValueError(f'Could not find chapter with idx {chapter_idx}')
 
+            chapter = self.chapters[chapter_idx]
+        if update_html:
+            logger.debug('HTML will be updated...')
             chapter = self._get_chapter(chapter,
                                         reload=update_html)
 
         if not chapter.chapter_html or not chapter.chapter_html_filename:
-            logger.
-
-
+            logger.critical(f'Failed to create chapter on link: "{chapter_url}" '
+                            f'on path "{chapter.chapter_html_filename}"')
+            raise ValueError(f'Failed to create chapter on link: "{chapter_url}" '
+                             f'on path "{chapter.chapter_html_filename}"')
 
-        # We get the title and content
+        # We get the chapter title and content
+        # We pass an index so we can autogenerate a Title
         chapter = self._decode_chapter(chapter=chapter, idx_for_chapter_name=chapter_idx)
-        if not chapter.chapter_content:
-            logger.error('Content not found')
-            return
 
         logger.info(f'Chapter scrapped from link: {chapter_url}')
         return chapter
@@ -506,22 +509,6 @@ class Novel:
         self.file_manager.add_toc(content)
         return content
 
-    def _get_chapter_urls_from_toc_content(self, toc_content: str) -> list[str]:
-        toc_elements = self.decoder.decode_html(toc_content, 'index')
-        try:
-            toc_urls = [toc_element['href'] for toc_element in toc_elements]
-        except KeyError as e:
-            logger.error(f'{e} not found on the Tag elements decoded from TOC')
-            return
-        if toc_urls:
-            return toc_urls
-        logger.warning('No chapter links found on toc content')
-
-    def _get_next_page_from_toc_content(self, toc_content: str) -> str:
-        next_page = self.decoder.decode_html(toc_content, 'next_page')
-        if next_page:
-            return next_page[0]['href']
-
     def _add_or_update_chapter_data(self, chapter: Chapter, link_idx: int = None, save_in_file: bool = True) -> None:
         if link_idx:
             chapter_idx = link_idx
@@ -579,35 +566,28 @@ class Novel:
         self.save_novel()
 
     def _decode_chapter(self, chapter: Chapter, idx_for_chapter_name: str = None) -> Chapter:
-
-
+        logger.debug('Decoding chapter...')
         if chapter.chapter_html is None:
+            logger.debug(f'No HTML content found, requesting HTML content...')
             chapter = self._get_chapter(chapter)
 
         if not chapter.chapter_html:
-
-
-            return None
-
-        paragraphs = self.decoder.decode_html(chapter.chapter_html, 'content')
-
-        if not paragraphs:
-            if chapter:
-                logger.warning(f'No paragraphs found in chapter link {
-                    chapter.chapter_url} on file {chapter.chapter_html_filename}')
+            raise ValueError(f'Chapter HTML could not be obtained for chapter link "{chapter.chapter_url}" '
+                             f'on file "{chapter.chapter_html_filename}"')
 
-
+        logger.debug('Obtaining chapter title...')
+        chapter_title = self.decoder.get_chapter_title(chapter.chapter_html)
         if not chapter_title:
-
-
+            logger.debug('No chapter title found, generating one...')
+            chapter_title = f'{self.metadata.novel_title} Chapter {idx_for_chapter_name}'
         chapter.chapter_title = str(chapter_title)
+        logger.debug(f'Chapter title: "{chapter_title}"')
 
-
-
-
-
-
-            chapter.chapter_content += str(paragraph)
+        logger.debug('Obtaining chapter content...')
+        chapter.chapter_content = self.decoder.get_chapter_content(chapter.chapter_html,
+                                                                   self.scraper_behavior.save_title_to_content,
+                                                                   chapter.chapter_title)
+        logger.debug('Chapter successfully decoded')
 
         return chapter
 
@@ -631,7 +611,7 @@ class Novel:
         if self.metadata.start_date:
             date_metadata += self.metadata.start_date
         # Calibre specification doesn't use end_date.
-        # For now we use a custom metadata
+        # For now, we use a custom metadata
         # https://idpf.org/epub/31/spec/epub-packages.html#sec-opf-dcdate
         # if self.metadata.end_date:
         #     date_metadata += f'/{self.metadata.end_date}'
web_novel_scraper/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.0.2"
+__version__ = "1.0.4"
{web_novel_scraper-1.0.2.dist-info → web_novel_scraper-1.0.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: web-novel-scraper
-Version: 1.0.2
+Version: 1.0.4
 Summary: Python tool that allows you to scrape web novels from various sources and save them to more readable formats like EPUB.
 Project-URL: Homepage, https://github.com/ImagineBrkr/web-novel-scraper
 Project-URL: Documentation, https://web-novel-scraper.readthedocs.io
@@ -138,7 +138,7 @@ In the example above:
 The following commands are available in the Web Novel Scraping CLI:
 
 ```bash
-Usage:
+Usage: web-novel-scraper [OPTIONS] COMMAND [ARGS]...
 
 CLI Tool for web novel scraping.
 
@@ -175,32 +175,32 @@ Here are some basic examples:
 
 ### Example 1: Creating a Novel using a main URL
 ```bash
-
+web-novel-scraper create-novel --title 'Novel 1' --author 'ImagineBrkr' --toc-main-url 'https://page.me/Novel-1/toc' --cover 'cover.jpg'
 ```
 Some pages have too much JavaScript, so you can just copy the HTML manually to a file and create the novel from it:
 ```bash
-
+web-novel-scraper create-novel --title 'Novel 1' --author 'ImagineBrkr' --toc-html 'toc.html' --host 'page.me' --cover 'cover.jpg'
 ```
 If there is more than one page for the TOC, you can add them:
 ```bash
-
+web-novel-scraper add-toc-html --title 'Novel 1' --toc-html 'toc2.html'
 ```
 You can create the chapters from this TOC, or synchronize if they were already created but there are new chapters.
 ```bash
-
+web-novel-scraper sync-toc --title 'Novel 1'
 ```
 The default directory will be %APPDATA%/ImagineBrkr/web-novel-scraper for Windows, all the files will be saved there, but you can change it.
 
 ### Example 2: Requesting files
 We can now download all the chapters
 ```bash
-
+web-novel-scraper request-all-chapters --title 'Novel 1'
 ```
 
 ### Example 3: Saving to EPUB
 With
 ```bash
-
+web-novel-scraper save-novel-to-epub --title 'Novel 1'
 ```
 
 For more detailed usage and options, use --help for each command.
web_novel_scraper-1.0.4.dist-info/RECORD
ADDED
@@ -0,0 +1,18 @@
+web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+web_novel_scraper/__main__.py,sha256=OQQVX5CttmAkUwdrnjBSjKPaoh_boUI2ysHi3rLGOSs,17769
+web_novel_scraper/decode.py,sha256=0RMHx1buR01KhuXiVQwdSpCGN960Xh-iPw1eYHxLeDg,10181
+web_novel_scraper/file_manager.py,sha256=Q3DH-c8fWz9sziMps7A3p_sQoDMEpqBket07Agh-__Q,11898
+web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
+web_novel_scraper/novel_scraper.py,sha256=Notk0O94HZrO-MVKDGCBL0VopApFchn13FO2_N3ZfRM,28418
+web_novel_scraper/request_manager.py,sha256=VtGpRi5b_Dp3h8viCdt7yMCb9M21Lk7oLC_Q_0EkXH8,6448
+web_novel_scraper/utils.py,sha256=vq5ROuPv04k3MhbksTe0ni_yP0i_a7T_33mkBB1DUbQ,2076
+web_novel_scraper/version.py,sha256=acuR_XSJzp4OrQ5T8-Ac5gYe48mUwObuwjRmisFmZ7k,22
+web_novel_scraper/custom_processor/__init__.py,sha256=iy4tjivMjshSzc52--aa-jK53qu9VwdK-6p4vuQc6oc,103
+web_novel_scraper/custom_processor/custom_processor.py,sha256=h1MPl6JU_C2Mc7SqK70LsNQHpDzSL6QyraMIQ87HcMM,870
+web_novel_scraper/custom_processor/sites/genesis.py,sha256=xV0eybI0ieHR5gn4yWXI74l99Eayhqs16PIYs-BrPjE,1843
+web_novel_scraper/custom_processor/sites/royalroad.py,sha256=_2PsFC_w3RJCUkAPoRn-7R2jlzl3XsG4WYtRaQkp6lg,787
+web_novel_scraper/decode_guide/decode_guide.json,sha256=IBBzbSSVO-yQ5PCY7o8ralnaonMwBpEZW1v1TStiVqc,7582
+web_novel_scraper-1.0.4.dist-info/METADATA,sha256=IhvDqK_Gz1POjzbH2cQVUYql1dhZJvdHnM9R--le0uc,8423
+web_novel_scraper-1.0.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+web_novel_scraper-1.0.4.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
+web_novel_scraper-1.0.4.dist-info/RECORD,,
web_novel_scraper-1.0.2.dist-info/RECORD
DELETED
@@ -1,14 +0,0 @@
-web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-web_novel_scraper/__main__.py,sha256=PBIAG1vshnRdvYwyeD-RxlHS5mNEby-d4puV9kEyfpA,17615
-web_novel_scraper/decode.py,sha256=dqGv_8nFSKwO6GBj3jhaO9SQeLHeBjDzoV1C_YcN40k,5085
-web_novel_scraper/file_manager.py,sha256=PJu8kKeng49DTNQBbbMekFtIcTZOkeCEjFYqYJarv9M,11363
-web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
-web_novel_scraper/novel_scraper.py,sha256=eiic2i3AdK9lcFK9aNb4d8ptnKv9ua1B_9kcUY8_liM,28660
-web_novel_scraper/request_manager.py,sha256=0M_ekBuDCMRGZIWxDbZ_yAwPOxJr2mBpP-Yj8zsE13o,6449
-web_novel_scraper/utils.py,sha256=vq5ROuPv04k3MhbksTe0ni_yP0i_a7T_33mkBB1DUbQ,2076
-web_novel_scraper/version.py,sha256=Y3LSfRioSl2xch70pq_ULlvyECXyEtN3krVaWeGyaxk,22
-web_novel_scraper/decode_guide/decode_guide.json,sha256=Q4v-OZh_1MwdrFxDDVvj8T3evW3zzbSapRaGwFCdnX8,5425
-web_novel_scraper-1.0.2.dist-info/METADATA,sha256=OBhkSUWS02JIFh4qbsRdS_s_UL15_gAOWAqsQu-Day4,8419
-web_novel_scraper-1.0.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-web_novel_scraper-1.0.2.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
-web_novel_scraper-1.0.2.dist-info/RECORD,,
{web_novel_scraper-1.0.2.dist-info → web_novel_scraper-1.0.4.dist-info}/WHEEL
File without changes
{web_novel_scraper-1.0.2.dist-info → web_novel_scraper-1.0.4.dist-info}/entry_points.txt
File without changes