web_novel_scraper-1.0.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- web_novel_scraper-1.0.2/.github/scripts/update_version.py +65 -0
- web_novel_scraper-1.0.2/.github/workflows/build.yaml +31 -0
- web_novel_scraper-1.0.2/.github/workflows/publish.yaml +88 -0
- web_novel_scraper-1.0.2/.github/workflows/test.yaml +21 -0
- web_novel_scraper-1.0.2/.gitignore +15 -0
- web_novel_scraper-1.0.2/.readthedocs.yaml +25 -0
- web_novel_scraper-1.0.2/PKG-INFO +231 -0
- web_novel_scraper-1.0.2/README.md +212 -0
- web_novel_scraper-1.0.2/docs/Makefile +20 -0
- web_novel_scraper-1.0.2/docs/make.bat +35 -0
- web_novel_scraper-1.0.2/docs/requirements.txt +3 -0
- web_novel_scraper-1.0.2/docs/source/README.rst +255 -0
- web_novel_scraper-1.0.2/docs/source/_static/README.md +1 -0
- web_novel_scraper-1.0.2/docs/source/commands/chapters_commands.rst +17 -0
- web_novel_scraper-1.0.2/docs/source/commands/creation_commands.rst +41 -0
- web_novel_scraper-1.0.2/docs/source/commands/index.rst +14 -0
- web_novel_scraper-1.0.2/docs/source/commands/output_commands.rst +8 -0
- web_novel_scraper-1.0.2/docs/source/commands/toc_commands.rst +25 -0
- web_novel_scraper-1.0.2/docs/source/commands/utils_commands.rst +16 -0
- web_novel_scraper-1.0.2/docs/source/concepts.rst +178 -0
- web_novel_scraper-1.0.2/docs/source/conf.py +41 -0
- web_novel_scraper-1.0.2/docs/source/config_options.rst +33 -0
- web_novel_scraper-1.0.2/docs/source/index.rst +23 -0
- web_novel_scraper-1.0.2/docs/source/tutorial.rst +115 -0
- web_novel_scraper-1.0.2/pyproject.toml +44 -0
- web_novel_scraper-1.0.2/requirements.txt +7 -0
- web_novel_scraper-1.0.2/web_novel_scraper/__init__.py +0 -0
- web_novel_scraper-1.0.2/web_novel_scraper/__main__.py +430 -0
- web_novel_scraper-1.0.2/web_novel_scraper/decode.py +141 -0
- web_novel_scraper-1.0.2/web_novel_scraper/decode_guide/decode_guide.json +213 -0
- web_novel_scraper-1.0.2/web_novel_scraper/file_manager.py +292 -0
- web_novel_scraper-1.0.2/web_novel_scraper/logger_manager.py +72 -0
- web_novel_scraper-1.0.2/web_novel_scraper/novel_scraper.py +723 -0
- web_novel_scraper-1.0.2/web_novel_scraper/request_manager.py +135 -0
- web_novel_scraper-1.0.2/web_novel_scraper/utils.py +66 -0
- web_novel_scraper-1.0.2/web_novel_scraper/version.py +1 -0
@@ -0,0 +1,65 @@
import sys
import re

def update_version(file_path, increment):
    """
    Updates the version in the specified file.

    Args:
        file_path (str): Path to the file containing the version.
        increment (str): The type of increment: "patch", "minor", or "major".
    """
    try:
        # Read the file
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

        # Find the current version using regex
        match = re.search(r'__version__ = "(\d+)\.(\d+)\.(\d+)"', content)
        if not match:
            raise ValueError("Version not found in the file.")

        # Extract MAJOR, MINOR, and PATCH values
        major, minor, patch = map(int, match.groups())

        # Increment the appropriate part
        if increment == "major":
            major += 1
            minor = 0
            patch = 0
        elif increment == "minor":
            minor += 1
            patch = 0
        elif increment == "patch":
            patch += 1
        else:
            raise ValueError("Increment type must be 'major', 'minor', or 'patch'.")

        # Generate the new version
        new_version = f'{major}.{minor}.{patch}'

        # Update the file content with the new version
        updated_content = re.sub(r'__version__ = "(\d+)\.(\d+)\.(\d+)"',
                                 f'__version__ = "{new_version}"',
                                 content)

        # Write the updated content back to the file
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(updated_content)

        print(new_version)  # Print the new version for the workflow

    except Exception as e:
        print(f"Error updating the version: {e}", file=sys.stderr)  # stderr, so $(...) capture stays clean
        sys.exit(1)


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python update_version.py <file_path> <increment>")
        print("Example: python update_version.py web_novel_scraper/version.py patch")
        sys.exit(1)

    file_path = sys.argv[1]
    increment = sys.argv[2].lower()
    update_version(file_path, increment)
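The bump logic can be exercised on its own. A minimal sketch of the regex-based increment (the `content` value below is a stand-in for web_novel_scraper/version.py):

```python
import re

# Stand-in for the contents of web_novel_scraper/version.py.
content = '__version__ = "1.0.1"'

major, minor, patch = map(int, re.search(
    r'__version__ = "(\d+)\.(\d+)\.(\d+)"', content).groups())
patch += 1  # a "patch" increment bumps only the last component

print(re.sub(r'__version__ = "(\d+)\.(\d+)\.(\d+)"',
             f'__version__ = "{major}.{minor}.{patch}"', content))
# -> __version__ = "1.0.2"
```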
@@ -0,0 +1,31 @@
name: Build Artifacts
on:
  [workflow_call, workflow_dispatch]
permissions:
  contents: read

jobs:
  release-build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
        with:
          ref: main
      - uses: actions/setup-python@v5
        with:
          python-version: "3.x"

      - name: Build release distributions
        run: |
          python -m pip install build hatchling
          python -m build

      - name: Verify build artifacts
        run: test -n "$(ls -A dist)" || exit 1

      - name: Upload distributions
        uses: actions/upload-artifact@v4
        with:
          name: release-dists
          path: dist/
@@ -0,0 +1,88 @@
name: Test, Build and Publish

on:
  workflow_dispatch:
    inputs:
      increment:
        description: "Version increment type (PATCH, MINOR, MAJOR)"
        required: false
        default: "PATCH"
        type: choice
        options:
          - "PATCH"
          - "MINOR"
          - "MAJOR"
      publish_to_pypi:
        description: "Publish to PyPI (yes or no)"
        required: false
        default: true
        type: boolean

permissions:
  contents: write
  id-token: write

jobs:
  run-tests:
    uses: ./.github/workflows/test.yaml

  bump-version:
    needs: run-tests
    runs-on: ubuntu-latest
    permissions:
      contents: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.x"

      - name: Bump version
        run: |
          NEW_VERSION=$(python .github/scripts/update_version.py web_novel_scraper/version.py ${{ inputs.increment }})
          echo "NEW_VERSION=$NEW_VERSION" >> $GITHUB_ENV

      - name: Commit updated version
        run: |
          git config --global user.name "github-actions[bot]"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          git add web_novel_scraper/version.py
          git commit -m "Bump version to ${{ env.NEW_VERSION }}"
          git push

      - name: Tag the new version
        run: |
          git tag -a "${{ env.NEW_VERSION }}" -m "Version ${{ env.NEW_VERSION }}"
          git push origin "${{ env.NEW_VERSION }}"

  build:
    needs: bump-version
    uses: ./.github/workflows/build.yaml
    permissions:
      contents: read
      packages: write
      actions: write

  pypi-publish:
    needs: build
    if: ${{ inputs.publish_to_pypi }}
    runs-on: ubuntu-latest
    permissions:
      id-token: write
    environment:
      name: pypi
      url: https://pypi.org/p/web-novel-scraper
    steps:
      - name: Retrieve release distributions
        uses: actions/download-artifact@v4
        with:
          name: release-dists
          path: dist/

      - name: Publish release distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          packages-dir: dist/
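The `Bump version` step depends on update_version.py printing only the new version on stdout. To reproduce that capture locally, something like the following should work (a sketch, assuming it runs from the repository root):

```python
import subprocess

# Mirrors NEW_VERSION=$(python .github/scripts/update_version.py ... patch)
result = subprocess.run(
    ["python", ".github/scripts/update_version.py",
     "web_novel_scraper/version.py", "patch"],
    capture_output=True, text=True, check=True)

new_version = result.stdout.strip()  # e.g. "1.0.3"
print(new_version)
```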
@@ -0,0 +1,21 @@
name: Run tests
on:
  [workflow_call, workflow_dispatch]
permissions:
  contents: read

jobs:
  run-tests:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.x"

      - name: Install package and run smoke test
        run: |
          python -m pip install .
          web-novel-scraper --help
@@ -0,0 +1,25 @@
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Set the OS, Python version, and other tools you might need
build:
  os: ubuntu-24.04
  tools:
    python: "3.13"

# Build documentation in the "docs/source" directory with Sphinx
sphinx:
  configuration: docs/source/conf.py

# Optional, but recommended:
# declare the Python requirements required to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
  install:
    - requirements: docs/requirements.txt
    # We need to install the requirements to autogenerate the click docs
    - requirements: requirements.txt
@@ -0,0 +1,231 @@
Metadata-Version: 2.4
Name: web-novel-scraper
Version: 1.0.2
Summary: Python tool that allows you to scrape web novels from various sources and save them to more readable formats like EPUB.
Project-URL: Homepage, https://github.com/ImagineBrkr/web-novel-scraper
Project-URL: Documentation, https://web-novel-scraper.readthedocs.io
Project-URL: Repository, https://github.com/ImagineBrkr/web-novel-scraper.git
Author-email: ImagineBrkr <salvattore_25@hotmail.com>
Keywords: Novel Downloader,Scraper,Web Novel,Web Novel Downloader,Web Novel Scraper
Requires-Python: >=3.10
Requires-Dist: bs4>=0.0.2
Requires-Dist: click<9,>=8.0
Requires-Dist: dataclasses-json<1,>=0.6.7
Requires-Dist: ebooklib<1,>=0.18
Requires-Dist: platformdirs
Requires-Dist: python-dotenv
Requires-Dist: requests
Description-Content-Type: text/markdown

# Web Novel Scraper CLI

## Table of Contents
- [Introduction](#introduction)
- [Installation](#installation)
- [Basic Concepts](#basic-concepts)
- [Commands](#commands)
- [Basic Examples](#basic-examples)

## Introduction
This tool allows you to scrape web novels from various sources. I made it because my hands hurt from scrolling too much.

## Installation
To install the Web Novel Scraper CLI, you can use pip:

```bash
pip install web-novel-scraper
```
Or you can install it manually:

1. Clone the repository:
   ```bash
   git clone https://github.com/ImagineBrkr/web-novel-scraper.git
   ```
2. Navigate to the project directory:
   ```bash
   cd web-novel-scraper
   ```
3. Install the project:
   ```bash
   python -m pip install .
   ```
4. Run the CLI tool:
   ```bash
   web-novel-scraper
   ```

## Basic Concepts
### Novel
A novel has, at minimum, a Table of Contents (there can be more than one) and chapters.
It can also store metadata such as the author, language, tags, and creation or end date.

### Table of Contents (TOC)
The source of truth for all the chapters the novel will have. It can come from a main URL (which is requested and saved; if there is more than one page, those pages are also requested and saved), or the HTML can be added directly from files. All chapters are autogenerated from this TOC.

### Chapters
A chapter comes from a URL; it is requested once and saved as a file on your local machine, so it never needs to be requested again.
From this file you can get the title and the chapter content.

### Decoder
A set of rules used to extract information from a chapter, such as links, content, and title.
The host identifies which set of rules to use; it can be added manually or generated from a TOC URL.
Example:
```json
{
  "host": "novelbin.me",
  "has_pagination": false,
  "title": {
    "element": "h2 a.chr-title",
    "id": null,
    "class": null,
    "selector": null,
    "attributes": null,
    "array": false,
    "extract": {
      "type": "attr",
      "key": "title"
    }
  },
  "content": {
    "element": "div#chr-content",
    "id": null,
    "class": null,
    "selector": null,
    "attributes": null,
    "array": true
  },
  "index": {
    "element": null,
    "id": null,
    "class": null,
    "selector": "ul.list-chapter li a",
    "attributes": null,
    "array": true
  },
  "next_page": {
    "element": null,
    "id": null,
    "class": null,
    "selector": null,
    "attributes": null,
    "array": true
  }
}
```
The decoder uses BeautifulSoup selectors for flexibility. You can specify the element, id, class, CSS selector, and whether multiple tags should be matched.

- `has_pagination`: Used when there is a `toc_main_url`, to find the URL of the next page via `next_page`.
- `index`: Gets the `href` of every tag found when parsing the TOC.
- `title` and `content`: The title and content of the chapter, respectively.

In the example above:
- The title is in an `a` tag with class `chr-title` inside an `h2` tag, and the `title` attribute is extracted:
```html
<h2><a class="chr-title" href="https://url-of-chapter" title="Chapter 1"><span class="chr-text">Chapter 1</span></a></h2>
```
- The content is in a `div` with id `chr-content`:
```html
<div id="chr-content" class="chr-c" style="font-family: Arial, sans-serif, serif; font-size: 18px; line-height: 160%; margin-top: 15px;">Content...</div>
```
- The URL of each chapter is in the `href` of an `a` tag within an `li` tag, inside a `ul` tag with class `list-chapter`:
```html
<ul class="list-chapter">
  <li><span class="glyphicon glyphicon-certificate"></span> <a href="https://url-of-chapter-1" title="Chapter 1"><span class="nchr-text chapter-title">Chapter 1</span></a></li>
</ul>
```
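To make the rule semantics concrete, here is a minimal sketch of how the `title` rule above could be applied with BeautifulSoup (bs4 is a declared dependency; the actual logic lives in web_novel_scraper/decode.py and may differ):

```python
from bs4 import BeautifulSoup

html = ('<h2><a class="chr-title" href="https://url-of-chapter" '
        'title="Chapter 1">Chapter 1</a></h2>')
soup = BeautifulSoup(html, "html.parser")

# "element": "h2 a.chr-title" acts as a CSS selector;
# "array": false means only the first match is used.
tag = soup.select_one("h2 a.chr-title")

# "extract": {"type": "attr", "key": "title"} reads the "title" attribute.
print(tag["title"])  # -> Chapter 1
```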
## Commands
The following commands are available in the Web Novel Scraper CLI:

```bash
Usage: web-novel-scraper [OPTIONS] COMMAND [ARGS]...

  CLI Tool for web novel scraping.

Options:
  --help  Show this message and exit.

Commands:
  add-tags               Add tags to a novel.
  add-toc-html           Add TOC HTML to a novel.
  clean-files            Clean files of a novel.
  create-novel           Create a new novel.
  delete-toc             Delete the TOC of a novel.
  remove-tags            Remove tags from a novel.
  request-all-chapters   Request all chapters of a novel.
  save-novel-to-epub     Save the novel to EPUB format.
  scrap-chapter          Scrap a chapter of a novel.
  set-cover-image        Set the cover image for a novel.
  set-host               Set the host for a novel.
  set-metadata           Set metadata for a novel.
  set-scraper-behavior   Set scraper behavior for a novel.
  set-toc-main-url       Set the main URL for the TOC of a novel.
  show-chapters          Show chapters of a novel.
  show-metadata          Show metadata of a novel.
  show-novel-info        Show information about a novel.
  show-scraper-behavior  Show scraper behavior of a novel.
  show-tags              Show tags of a novel.
  show-toc               Show the TOC of a novel.
  sync-toc               Sync the TOC of a novel.
  version                Show program version.
```

## Basic Examples
Here are some basic examples:

### Example 1: Creating a novel using a main URL
```bash
web-novel-scraper create-novel --title 'Novel 1' --author 'ImagineBrkr' --toc-main-url 'https://page.me/Novel-1/toc' --cover 'cover.jpg'
```
Some pages rely heavily on JavaScript, so you can copy the HTML to a file manually and create the novel from it:
```bash
web-novel-scraper create-novel --title 'Novel 1' --author 'ImagineBrkr' --toc-html 'toc.html' --host 'page.me' --cover 'cover.jpg'
```
If the TOC spans more than one page, you can add the extra pages:
```bash
web-novel-scraper add-toc-html --title 'Novel 1' --toc-html 'toc2.html'
```
You can create the chapters from this TOC, or synchronize it if the chapters were already created and new ones have appeared:
```bash
web-novel-scraper sync-toc --title 'Novel 1'
```
On Windows, the default data directory is %APPDATA%/ImagineBrkr/web-novel-scraper; all files are saved there, but you can change it.

### Example 2: Requesting files
We can now download all the chapters:
```bash
web-novel-scraper request-all-chapters --title 'Novel 1'
```

### Example 3: Saving to EPUB
Once the chapters are downloaded, save the novel to EPUB:
```bash
web-novel-scraper save-novel-to-epub --title 'Novel 1'
```

For more detailed usage and options, run each command with `--help`.

## Configuration
### Environment Variables

The Web Novel Scraper CLI uses the following environment variables for configuration:

- `SCRAPER_LOGGING_LEVEL`: Sets the logging level for the application. No logs are written by default; accepted levels are DEBUG, INFO, WARNING, ERROR, and CRITICAL.
```bash
export SCRAPER_LOGGING_LEVEL=INFO
```

- `SCRAPER_LOGGING_FILE`: Specifies the file where logs will be written. By default, logs go to the terminal.
```bash
export SCRAPER_LOGGING_FILE=/path/to/logfile.log
```

- `SCRAPER_BASE_DATA_DIR`: Defines the base directory for storing novel data. Defaults to the user data directory.
```bash
export SCRAPER_BASE_DATA_DIR=/path/to/data/dir
```

- `SCRAPER_FLARESOLVER_URL`: URL for the FlareSolverr service. Defaults to `http://localhost:8191/v1`.
```bash
export SCRAPER_FLARESOLVER_URL=http://localhost:8191/v1
```
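Since python-dotenv is a declared dependency, these variables can presumably also be set in a local .env file. A minimal sketch of reading them the way a client might (the defaults follow the documentation above; whether the CLI loads .env automatically is an assumption here):

```python
import os
from dotenv import load_dotenv  # python-dotenv is a declared dependency

load_dotenv()  # pick up SCRAPER_* settings from a local .env file, if any

log_level = os.environ.get("SCRAPER_LOGGING_LEVEL")  # unset -> no logs written
flaresolver_url = os.environ.get("SCRAPER_FLARESOLVER_URL",
                                 "http://localhost:8191/v1")
print(log_level, flaresolver_url)
```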
@@ -0,0 +1,212 @@
*(README.md is a verbatim copy of the Markdown description embedded in PKG-INFO above.)*