web-novel-scraper 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,135 @@
+ import requests
+ import os
+ from . import logger_manager
+ from dotenv import load_dotenv
+ import json
+ import time
+
+ load_dotenv()
+
+ FLARESOLVER_URL = os.getenv('SCRAPER_FLARESOLVER_URL', 'http://localhost:8191/v1')
+ FLARE_HEADERS = {'Content-Type': 'application/json'}
+ FORCE_FLARESOLVER = os.getenv('FORCE_FLARESOLVER', '0') == '1'
+
+ logger = logger_manager.create_logger('GET HTML CONTENT')
+
+
+ def get_request(url: str,
+                 timeout: int = 20,
+                 retries: int = 3,
+                 time_between_retries: int = 1) -> requests.Response | None:
+     logger.debug(f'Starting get_request for {url} with timeout={timeout}, retries={retries}, time_between_retries={time_between_retries}')
+     for attempt in range(retries):
+         logger.debug(f'Attempt {attempt + 1} for {url}')
+         try:
+             response = requests.get(url, timeout=timeout)
+             response.raise_for_status()
+             logger.debug(f'Successful response for {url} on attempt {attempt + 1}')
+             return response
+         except requests.exceptions.ConnectionError as e:
+             logger.error(f'Connection error ({attempt + 1}/{retries}): {e}')
+         except requests.exceptions.Timeout as e:
+             logger.error(f'Request timed out ({attempt + 1}/{retries}): {e}')
+         except requests.exceptions.HTTPError as e:
+             logger.error(f'HTTP error ({attempt + 1}/{retries}): {e}')
+         except requests.exceptions.InvalidSchema as e:
+             logger.error(f'Invalid URL schema for "{url}": {e}')
+             break  # Don't retry on invalid schema
+         except requests.exceptions.RequestException as e:
+             logger.error(f'Request failed ({attempt + 1}/{retries}): {e}')
+
+         if attempt < retries - 1:
+             logger.debug(f'Waiting {time_between_retries} seconds before retrying')
+             time.sleep(time_between_retries)  # Wait before retrying
+     logger.debug(f'Failed to get a successful response for {url} after {retries} attempts')
+     return None
+
+
+ def get_request_flaresolver(url: str,
+                             timeout: int = 20,
+                             flaresolver_url: str = FLARESOLVER_URL,
+                             retries: int = 3,
+                             time_between_retries: int = 1) -> requests.Response | None:
+     logger.debug(f'Starting get_request_flaresolver for {url} with timeout={timeout}, retries={retries}, time_between_retries={time_between_retries}')
+     for attempt in range(retries):
+         logger.debug(f'Attempt {attempt + 1} for {url} using FlareSolver')
+         try:
+             response = requests.post(
+                 flaresolver_url,
+                 headers=FLARE_HEADERS,
+                 json={
+                     'cmd': 'request.get',
+                     'url': url,
+                     'maxTimeout': timeout * 1000
+                 },
+                 timeout=timeout
+             )
+             response.raise_for_status()
+             logger.debug(f'Successful response for {url} on attempt {attempt + 1} using FlareSolver')
+             return response
+
+         except requests.exceptions.ConnectionError as e:
+             logger.error(f'Connection error ({attempt + 1}/{retries}), check FlareSolver host: {flaresolver_url}: {e}')
+         except requests.exceptions.Timeout as e:
+             logger.error(f'Request timed out ({attempt + 1}/{retries}): {e}')
+         except requests.exceptions.InvalidSchema as e:
+             logger.error(f'Invalid FlareSolver URL schema "{flaresolver_url}": {e}')
+             break  # Don't retry on invalid schema
+         except requests.exceptions.HTTPError as e:
+             logger.error(f'HTTP error ({attempt + 1}/{retries}): {e}')
+         except requests.exceptions.RequestException as e:
+             logger.error(f'Request failed ({attempt + 1}/{retries}): {e}')
+         except json.JSONDecodeError as e:
+             logger.error(f'Invalid JSON response ({attempt + 1}/{retries}): {e}')
+
+         if attempt < retries - 1:
+             logger.debug(f'Waiting {time_between_retries} seconds before retrying')
+             time.sleep(time_between_retries)  # Wait before retrying
+     logger.debug(f'Failed to get a successful response for {url} using FlareSolver after {retries} attempts')
+     return None
+
+
+ def get_html_content(url: str,
+                      retries: int = 5,
+                      flaresolver: bool = True,
+                      flaresolver_url: str = FLARESOLVER_URL,
+                      time_between_retries: int = 1,
+                      force_flaresolver: bool = FORCE_FLARESOLVER) -> str | None:
+     logger.debug(f'Starting get_html_content for {url} with retries={retries}, flaresolver={flaresolver}, flaresolver_url={flaresolver_url}, time_between_retries={time_between_retries}, force_flaresolver={force_flaresolver}')
+     # First try with a common HTTP request
+     if not force_flaresolver:
+         response = get_request(
+             url, timeout=20, retries=retries, time_between_retries=time_between_retries)
+         if not response:
+             logger.warning(f'Failed to get response from {url} using common HTTP request')
+         elif not response.ok:
+             logger.warning(f'Response with errors from {url} using common HTTP request')
+         else:
+             logger.debug(f'Successfully retrieved HTML content from {url} using common HTTP request')
+             return response.text
+
+     # If FlareSolver is disabled, return None
+     if not flaresolver:
+         logger.debug(f'Flaresolver is disabled, returning None for {url}')
+         return None
+
+     # Fall back to FlareSolver, forwarding the same retry budget
+     logger.debug(f'Trying with Flaresolver for {url}')
+     response = get_request_flaresolver(
+         url, timeout=20, flaresolver_url=flaresolver_url, retries=retries, time_between_retries=time_between_retries)
+     if not response:
+         logger.critical(f'Failed to get response from {url} using FlareSolver')
+         return None
+     if not response.ok:
+         logger.critical(f'Response with errors from {url} using FlareSolver')
+         return None
+
+     response_json = response.json()
+     if 'solution' not in response_json:
+         logger.critical(f'No solution found in FlareSolver response for {url}')
+         return None
+     if 'response' not in response_json['solution']:
+         logger.critical(f'No response found in FlareSolver solution for {url}')
+         return None
+     logger.debug(f'Successfully retrieved HTML content from {url} using FlareSolver')
+     return response_json['solution']['response']
@@ -0,0 +1,66 @@
+ from .file_manager import FileManager
+ from . import request_manager
+ from typing import Any
+ import hashlib
+ from urllib.parse import urlparse
+ import re
+ import unicodedata
+
+
+ def generate_file_name_from_url(url: str) -> str:
+     # Parse the URL
+     parsed_url = urlparse(url)
+     # Strip leading and trailing slashes from the path
+     path = parsed_url.path.strip('/')
+     path_parts = path.split('/')
+     last_two_parts = path_parts[-2:] if len(path_parts) >= 2 else path_parts
+     base_name = '_'.join(last_two_parts) if last_two_parts else 'index'
+
+     # Replace disallowed characters
+     safe_base_name = re.sub(r'[^a-zA-Z0-9_\-]', '_', base_name)
+     # Limit the name length
+     if len(safe_base_name) > 50:
+         safe_base_name = safe_base_name[:50]
+     # Append a short hash of the full URL so different URLs never collide
+     url_hash = hashlib.md5(url.encode('utf-8')).hexdigest()[:8]
+     filename = f"{safe_base_name}_{url_hash}.html"
+     return filename
+
+
+ def generate_epub_file_name_from_title(title: str) -> str:
+     normalized_title = unicodedata.normalize(
+         'NFKD', title).encode('ASCII', 'ignore').decode('ASCII')
+     normalized_title = normalized_title.lower()
+     normalized_title = re.sub(r'[\s\-]+', '_', normalized_title)
+     sanitized_title = re.sub(r'[^a-zA-Z0-9_]', '', normalized_title)
+     title_hash = hashlib.md5(sanitized_title.encode('utf-8')).hexdigest()[:8]
+
+     max_length = 50
+     if len(sanitized_title) > max_length:
+         sanitized_title = sanitized_title[:max_length]
+     if not sanitized_title:
+         sanitized_title = 'chapter'
+
+     filename = f"{sanitized_title}_{title_hash}.xhtml"
+     return filename
+
+ def delete_duplicates(str_list: list[str]) -> list[str]:
+     # dict.fromkeys keeps the first occurrence of each string and preserves order
+     return list(dict.fromkeys(str_list))
+
+ def obtain_host(url: str) -> str:
+     # Take everything after the scheme, drop leading slashes, then keep only the host part
+     host = url.split(':')[1]
+     while host.startswith('/'):
+         host = host[1:]
+
+     host = host.split('/')[0].replace('www.', '')
+
+     return host
+
+ def check_exclusive_params(param1: Any, param2: Any) -> bool:
+     # True when exactly one of the two parameters is set
+     return (param1 is None) != (param2 is None)
+
+ def create_volume_id(n: int) -> str:
+     return f'v{n:02}'
@@ -0,0 +1 @@
+ __version__ = "1.0.2"
@@ -0,0 +1,231 @@
+ Metadata-Version: 2.4
+ Name: web-novel-scraper
+ Version: 1.0.2
+ Summary: Python tool that allows you to scrape web novels from various sources and save them to more readable formats like EPUB.
+ Project-URL: Homepage, https://github.com/ImagineBrkr/web-novel-scraper
+ Project-URL: Documentation, https://web-novel-scraper.readthedocs.io
+ Project-URL: Repository, https://github.com/ImagineBrkr/web-novel-scraper.git
+ Author-email: ImagineBrkr <salvattore_25@hotmail.com>
+ Keywords: Novel Downloader,Scraper,Web Novel,Web Novel Downloader,Web Novel Scraper
+ Requires-Python: >=3.10
+ Requires-Dist: bs4>=0.0.2
+ Requires-Dist: click<9,>=8.0
+ Requires-Dist: dataclasses-json<1,>=0.6.7
+ Requires-Dist: ebooklib<1,>=0.18
+ Requires-Dist: platformdirs
+ Requires-Dist: python-dotenv
+ Requires-Dist: requests
+ Description-Content-Type: text/markdown
+
+ # Web Novel Scraper CLI
+
+ ## Table of Contents
+ - [Introduction](#introduction)
+ - [Installation](#installation)
+ - [Basic Concepts](#basic-concepts)
+ - [Commands](#commands)
+ - [Basic Examples](#basic-examples)
+
+
+ ## Introduction
+ This tool allows you to scrape web novels from various sources. I made it because my hands hurt from scrolling too much.
+
+ ## Installation
+ To install the Web Novel Scraping CLI, you can use pip:
+
+ ```bash
+ pip install web-novel-scraper
+ ```
+ Or you can install it manually:
+
+ 1. Clone the repository:
+ ```bash
+ git clone https://github.com/ImagineBrkr/web-novel-scraper.git
+ ```
+ 2. Navigate to the project directory:
+ ```bash
+ cd web-novel-scraper
+ ```
+ 3. Install the project:
+ ```bash
+ python -m pip install .
+ ```
+ 4. Run the CLI tool:
+ ```bash
+ web-novel-scraper
+ ```
+
+ ## Basic Concepts
+ ### Novel
+ Refers to a novel, which has at least one Table of Contents (there can be more) and chapters.
+ It also has metadata that can be saved, such as author, language, tags, and creation or end date.
+
+ ### Table of Contents (TOC)
+ The source of truth for all the chapters the novel will have. It can come from a main URL (which is requested and saved; if there is more than one page, the additional pages are requested and saved as well), or the HTML can be added directly from local files. All chapters are generated automatically from this TOC.
+
+ ### Chapters
+ A chapter comes from a URL; it is requested and saved as a file on your local machine. Once the file is saved, it does not need to be requested again.
+ From this chapter you can get the title and the chapter content.
+
+ ### Decoder
+ A set of rules used to extract information from a chapter, such as links, content, and title.
+ The host is used to identify which set of rules to apply. It can be set manually or derived from a TOC URL.
+ Example:
+ ```json
+ {
+     "host": "novelbin.me",
+     "has_pagination": false,
+     "title": {
+         "element": "h2 a.chr-title",
+         "id": null,
+         "class": null,
+         "selector": null,
+         "attributes": null,
+         "array": false,
+         "extract": {
+             "type": "attr",
+             "key": "title"
+         }
+     },
+     "content": {
+         "element": "div#chr-content",
+         "id": null,
+         "class": null,
+         "selector": null,
+         "attributes": null,
+         "array": true
+     },
+     "index": {
+         "element": null,
+         "id": null,
+         "class": null,
+         "selector": "ul.list-chapter li a",
+         "attributes": null,
+         "array": true
+     },
+     "next_page": {
+         "element": null,
+         "id": null,
+         "class": null,
+         "selector": null,
+         "attributes": null,
+         "array": true
+     }
+ }
+ ```
+ The decoder uses BeautifulSoup selectors for flexibility: you can specify the element, id, class, CSS selector, and whether multiple tags should be collected (`array`).
+
+ - `has_pagination`: Used if there is a `toc_main_url`, to find the URL of the next page via `next_page`.
+ - `index`: Gets the `href` of all tags found when searching the TOC.
+ - `title` and `content`: The title and content of the chapter, respectively.
+
+ In the example above:
+ - The title is in an `a` tag within an `h2` tag with class `chr-title`, extracting the `title` attribute:
+ ```html
+ <h2><a class="chr-title" href="https://url-of-chapter" title="Chapter 1"><span class="chr-text">Chapter 1</span></a></h2>
+ ```
+ - The content is in a `div` with id `chr-content`:
+ ```html
+ <div id="chr-content" class="chr-c" style="font-family: Arial, sans-serif, serif; font-size: 18px; line-height: 160%; margin-top: 15px;">Content...</div>
+ ```
+ - The URL of each chapter is in the `href` of an `a` tag within an `li` tag, which is within a `ul` tag with class `list-chapter`:
+ ```html
+ <ul class="list-chapter">
+   <li><span class="glyphicon glyphicon-certificate"></span>&nbsp;<a href="https://url-of-chapter-1" title="Chapter 1"><span class="nchr-text chapter-title">Chapter 1</span></a></li>
+ </ul>
+ ```
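+
+ For illustration, here is a minimal sketch of how rules like the ones above map onto BeautifulSoup calls; this is not the package's own `decode.py`, just the idea behind it:
+ ```python
+ from bs4 import BeautifulSoup
+
+ chapter_html = '''
+ <h2><a class="chr-title" href="https://url-of-chapter" title="Chapter 1">Chapter 1</a></h2>
+ <div id="chr-content">Content...</div>
+ '''
+ soup = BeautifulSoup(chapter_html, 'html.parser')
+
+ # "title" rule: element "h2 a.chr-title", extract -> the "title" attribute
+ title_tag = soup.select_one('h2 a.chr-title')
+ title = title_tag['title'] if title_tag else None
+
+ # "content" rule: element "div#chr-content", array -> keep every match
+ content = [tag.get_text() for tag in soup.select('div#chr-content')]
+
+ print(title, content)  # Chapter 1 ['Content...']
+ ```
+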
+ ## Commands
+ The following commands are available in the Web Novel Scraping CLI:
+
+ ```bash
+ Usage: main.py [OPTIONS] COMMAND [ARGS]...
+
+   CLI Tool for web novel scraping.
+
+ Options:
+   --help  Show this message and exit.
+
+ Commands:
+   add-tags               Add tags to a novel.
+   add-toc-html           Add TOC HTML to a novel.
+   clean-files            Clean files of a novel.
+   create-novel           Create a new novel.
+   delete-toc             Delete the TOC of a novel.
+   remove-tags            Remove tags from a novel.
+   request-all-chapters   Request all chapters of a novel.
+   save-novel-to-epub     Save the novel to EPUB format.
+   scrap-chapter          Scrap a chapter of a novel.
+   set-cover-image        Set the cover image for a novel.
+   set-host               Set the host for a novel.
+   set-metadata           Set metadata for a novel.
+   set-scraper-behavior   Set scraper behavior for a novel.
+   set-toc-main-url       Set the main URL for the TOC of a novel.
+   show-chapters          Show chapters of a novel.
+   show-metadata          Show metadata of a novel.
+   show-novel-info        Show information about a novel.
+   show-scraper-behavior  Show scraper behavior of a novel.
+   show-tags              Show tags of a novel.
+   show-toc               Show the TOC of a novel.
+   sync-toc               Sync the TOC of a novel.
+   version                Show program version.
+ ```
+
+ ## Basic Examples
+ Here are some basic examples:
+
+ ### Example 1: Creating a Novel using a main URL
+ ```bash
+ python src/main.py create-novel --title 'Novel 1' --author 'ImagineBrkr' --toc-main-url 'https://page.me/Novel-1/toc' --cover 'cover.jpg'
+ ```
+ Some pages have too much JavaScript, so you can simply copy the HTML to a file manually and create the novel from it:
+ ```bash
+ python src/main.py create-novel --title 'Novel 1' --author 'ImagineBrkr' --toc-html 'toc.html' --host 'page.me' --cover 'cover.jpg'
+ ```
+ If the TOC spans more than one page, you can add the extra pages:
+ ```bash
+ python src/main.py add-toc-html --title 'Novel 1' --toc-html 'toc2.html'
+ ```
+ You can create the chapters from this TOC, or synchronize them if they already exist but new chapters have been added:
+ ```bash
+ python src/main.py sync-toc --title 'Novel 1'
+ ```
+ On Windows, the default data directory is %APPDATA%/ImagineBrkr/web-novel-scraper. All files are saved there, but you can change the location (see the Configuration section below).
+
+ ### Example 2: Requesting files
+ We can now download all the chapters:
+ ```bash
+ python src/main.py request-all-chapters --title 'Novel 1'
+ ```
+
+ ### Example 3: Saving to EPUB
+ Finally, save the novel as an EPUB file:
+ ```bash
+ python src/main.py save-novel-to-epub --title 'Novel 1'
+ ```
+
+ For more detailed usage and options, use `--help` with each command.
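+ For example, to see every option of the `create-novel` command:
+ ```bash
+ python src/main.py create-novel --help
+ ```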
+
+ ## Configuration
+ ### Environment Variables
+
+ The Web Novel Scraping CLI uses the following environment variables for configuration:
+
+ - `SCRAPER_LOGGING_LEVEL`: Sets the logging level for the application. By default no logs are written; it accepts the following levels: DEBUG, INFO, WARNING, ERROR, CRITICAL.
+ ```bash
+ export SCRAPER_LOGGING_LEVEL=INFO
+ ```
+
+ - `SCRAPER_LOGGING_FILE`: Specifies the file where logs will be written. By default, logs go to the terminal.
+ ```bash
+ export SCRAPER_LOGGING_FILE=/path/to/logfile.log
+ ```
+
+ - `SCRAPER_BASE_DATA_DIR`: Defines the base directory for storing novel data. Default is the user data directory.
+ ```bash
+ export SCRAPER_BASE_DATA_DIR=/path/to/data/dir
+ ```
+
+ - `SCRAPER_FLARESOLVER_URL`: URL for the FlareSolverr service. Default is `http://localhost:8191/v1`.
+ ```bash
+ export SCRAPER_FLARESOLVER_URL=http://localhost:8191/v1
+ ```
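+
+ - `FORCE_FLARESOLVER`: Read by `request_manager.py` in this release; when set to `1`, the scraper skips the plain HTTP request and goes straight to FlareSolverr.
+ ```bash
+ export FORCE_FLARESOLVER=1
+ ```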
@@ -0,0 +1,14 @@
+ web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ web_novel_scraper/__main__.py,sha256=PBIAG1vshnRdvYwyeD-RxlHS5mNEby-d4puV9kEyfpA,17615
+ web_novel_scraper/decode.py,sha256=dqGv_8nFSKwO6GBj3jhaO9SQeLHeBjDzoV1C_YcN40k,5085
+ web_novel_scraper/file_manager.py,sha256=PJu8kKeng49DTNQBbbMekFtIcTZOkeCEjFYqYJarv9M,11363
+ web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
+ web_novel_scraper/novel_scraper.py,sha256=eiic2i3AdK9lcFK9aNb4d8ptnKv9ua1B_9kcUY8_liM,28660
+ web_novel_scraper/request_manager.py,sha256=0M_ekBuDCMRGZIWxDbZ_yAwPOxJr2mBpP-Yj8zsE13o,6449
+ web_novel_scraper/utils.py,sha256=vq5ROuPv04k3MhbksTe0ni_yP0i_a7T_33mkBB1DUbQ,2076
+ web_novel_scraper/version.py,sha256=Y3LSfRioSl2xch70pq_ULlvyECXyEtN3krVaWeGyaxk,22
+ web_novel_scraper/decode_guide/decode_guide.json,sha256=Q4v-OZh_1MwdrFxDDVvj8T3evW3zzbSapRaGwFCdnX8,5425
+ web_novel_scraper-1.0.2.dist-info/METADATA,sha256=OBhkSUWS02JIFh4qbsRdS_s_UL15_gAOWAqsQu-Day4,8419
+ web_novel_scraper-1.0.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ web_novel_scraper-1.0.2.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
+ web_novel_scraper-1.0.2.dist-info/RECORD,,
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.27.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
+ [console_scripts]
+ web-novel-scraper = web_novel_scraper.__main__:cli