web-novel-scraper 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,135 @@
+ import requests
+ import os
+ from . import logger_manager
+ from dotenv import load_dotenv
+ import json
+ import time
+
+ load_dotenv()
+
+ FLARESOLVER_URL = os.getenv('SCRAPER_FLARESOLVER_URL', 'http://localhost:8191/v1')
+ FLARE_HEADERS = {'Content-Type': 'application/json'}
+ FORCE_FLARESOLVER = os.getenv('FORCE_FLARESOLVER', '0') == '1'
+
+ logger = logger_manager.create_logger('GET HTML CONTENT')
+
+
+ def get_request(url: str,
+                 timeout: int = 20,
+                 retries: int = 3,
+                 time_between_retries: int = 1) -> requests.Response | None:
+     logger.debug(f'Starting get_request for {url} with timeout={timeout}, retries={retries}, time_between_retries={time_between_retries}')
+     for attempt in range(retries):
+         logger.debug(f'Attempt {attempt + 1} for {url}')
+         try:
+             response = requests.get(url, timeout=timeout)
+             response.raise_for_status()
+             logger.debug(f'Successful response for {url} on attempt {attempt + 1}')
+             return response
+         except requests.exceptions.ConnectionError as e:
+             logger.error(f'Connection error ({attempt + 1}/{retries}): {e}')
+         except requests.exceptions.Timeout as e:
+             logger.error(f'Request timed out ({attempt + 1}/{retries}): {e}')
+         except requests.exceptions.HTTPError as e:
+             logger.error(f'HTTP error ({attempt + 1}/{retries}): {e}')
+         except requests.exceptions.InvalidSchema as e:
+             logger.error(f'Invalid URL schema for "{url}": {e}')
+             break  # Don't retry on invalid schema
+         except requests.exceptions.RequestException as e:
+             logger.error(f'Request failed ({attempt + 1}/{retries}): {e}')
+
+         if attempt < retries - 1:
+             logger.debug(f'Waiting {time_between_retries} seconds before retrying')
+             time.sleep(time_between_retries)  # Wait before retrying
+     logger.debug(f'Failed to get a successful response for {url} after {retries} attempts')
+     return None
+
+
+ def get_request_flaresolver(url: str,
+                             timeout: int = 20,
+                             flaresolver_url: str = FLARESOLVER_URL,
+                             retries: int = 3,
+                             time_between_retries: int = 1) -> requests.Response | None:
+     logger.debug(f'Starting get_request_flaresolver for {url} with timeout={timeout}, retries={retries}, time_between_retries={time_between_retries}')
+     for attempt in range(retries):
+         logger.debug(f'Attempt {attempt + 1} for {url} using FlareSolver')
+         try:
+             response = requests.post(
+                 flaresolver_url,
+                 headers=FLARE_HEADERS,
+                 json={
+                     'cmd': 'request.get',
+                     'url': url,
+                     'maxTimeout': timeout * 1000
+                 },
+                 timeout=timeout
+             )
+             response.raise_for_status()
+             logger.debug(f'Successful response for {url} on attempt {attempt + 1} using FlareSolver')
+             return response
+
+         except requests.exceptions.ConnectionError as e:
+             logger.error(f'Connection error ({attempt + 1}/{retries}), check FlareSolver host: {flaresolver_url}: {e}')
+         except requests.exceptions.Timeout as e:
+             logger.error(f'Request timed out ({attempt + 1}/{retries}): {e}')
+         except requests.exceptions.InvalidSchema as e:
+             logger.error(f'Invalid FlareSolver URL schema "{flaresolver_url}": {e}')
+             break  # Don't retry on invalid schema
+         except requests.exceptions.HTTPError as e:
+             logger.error(f'HTTP error ({attempt + 1}/{retries}): {e}')
+         except requests.exceptions.RequestException as e:
+             logger.error(f'Request failed ({attempt + 1}/{retries}): {e}')
+         except json.JSONDecodeError as e:
+             logger.error(f'Invalid JSON response ({attempt + 1}/{retries}): {e}')
+
+         if attempt < retries - 1:
+             logger.debug(f'Waiting {time_between_retries} seconds before retrying')
+             time.sleep(time_between_retries)  # Wait before retrying
+     logger.debug(f'Failed to get a successful response for {url} using FlareSolver after {retries} attempts')
+     return None
+
+
+ def get_html_content(url: str,
+                      retries: int = 5,
+                      flaresolver: bool = True,
+                      flaresolver_url: str = FLARESOLVER_URL,
+                      time_between_retries: int = 1,
+                      force_flaresolver: bool = FORCE_FLARESOLVER) -> str | None:
+     logger.debug(f'Starting get_html_content for {url} with retries={retries}, flaresolver={flaresolver}, flaresolver_url={flaresolver_url}, time_between_retries={time_between_retries}, force_flaresolver={force_flaresolver}')
+     # First try with a common HTTP request
+     if not force_flaresolver:
+         response = get_request(
+             url, timeout=20, retries=retries, time_between_retries=time_between_retries)
+         if not response:
+             logger.warning(f'Failed to get response from {url} using common HTTP request')
+         elif not response.ok:
+             logger.warning(f'Response with errors from {url} using common HTTP request')
+         else:
+             logger.debug(f'Successfully retrieved HTML content from {url} using common HTTP request')
+             return response.text
+
+     # If FlareSolver is disabled, return None
+     if not flaresolver:
+         logger.debug(f'Flaresolver is disabled, returning None for {url}')
+         return None
+
+     # Fall back to FlareSolver, forwarding the same retry budget
+     logger.debug(f'Trying with Flaresolver for {url}')
+     response = get_request_flaresolver(
+         url, timeout=20, flaresolver_url=flaresolver_url, retries=retries, time_between_retries=time_between_retries)
+     if not response:
+         logger.critical(f'Failed to get response from {url} using FlareSolver')
+         return None
+     if not response.ok:
+         logger.critical(f'Response with errors from {url} using FlareSolver')
+         return None
+
+     response_json = response.json()
+     if 'solution' not in response_json:
+         logger.critical(f'No solution found in FlareSolver response for {url}')
+         return None
+     if 'response' not in response_json['solution']:
+         logger.critical(f'No response found in FlareSolver solution for {url}')
+         return None
+     logger.debug(f'Successfully retrieved HTML content from {url} using FlareSolver')
+     return response_json['solution']['response']
@@ -0,0 +1,66 @@
+ from .file_manager import FileManager
+ from . import request_manager
+ from typing import Any
+ import hashlib
+ from urllib.parse import urlparse
+ import re
+ import unicodedata
+
+
+ def generate_file_name_from_url(url: str) -> str:
+     # Parse the URL
+     parsed_url = urlparse(url)
+     # Strip leading and trailing slashes from the path
+     path = parsed_url.path.strip('/')
+     path_parts = path.split('/')
+     last_two_parts = path_parts[-2:] if len(path_parts) >= 2 else path_parts
+     base_name = '_'.join(last_two_parts) if last_two_parts else 'index'
+
+     # Replace disallowed characters
+     safe_base_name = re.sub(r'[^a-zA-Z0-9_\-]', '_', base_name)
+     # Limit the name length
+     if len(safe_base_name) > 50:
+         safe_base_name = safe_base_name[:50]
+     # Append a short hash of the full URL so different URLs never collide
+     url_hash = hashlib.md5(url.encode('utf-8')).hexdigest()[:8]
+     filename = f"{safe_base_name}_{url_hash}.html"
+     return filename
+
+
+ def generate_epub_file_name_from_title(title: str) -> str:
+     normalized_title = unicodedata.normalize(
+         'NFKD', title).encode('ASCII', 'ignore').decode('ASCII')
+     normalized_title = normalized_title.lower()
+     normalized_title = re.sub(r'[\s\-]+', '_', normalized_title)
+     sanitized_title = re.sub(r'[^a-zA-Z0-9_]', '', normalized_title)
+     title_hash = hashlib.md5(sanitized_title.encode('utf-8')).hexdigest()[:8]
+
+     max_length = 50
+     if len(sanitized_title) > max_length:
+         sanitized_title = sanitized_title[:max_length]
+     if not sanitized_title:
+         sanitized_title = 'chapter'
+
+     filename = f"{sanitized_title}_{title_hash}.xhtml"
+     return filename
+
+ def delete_duplicates(str_list: list[str]) -> list[str]:
+     # dict.fromkeys keeps the first occurrence of each string and preserves order
+     return list(dict.fromkeys(str_list))
+
+ def obtain_host(url: str) -> str:
+     # Take everything after the scheme, drop leading slashes, then keep only the host part
+     host = url.split(':')[1]
+     while host.startswith('/'):
+         host = host[1:]
+
+     host = host.split('/')[0].replace('www.', '')
+
+     return host
+
+ def check_exclusive_params(param1: Any, param2: Any) -> bool:
+     # True when exactly one of the two parameters is set
+     return (param1 is None) != (param2 is None)
+
+ def create_volume_id(n: int) -> str:
+     return f'v{n:02}'
@@ -0,0 +1 @@
+ __version__ = "1.0.2"
@@ -0,0 +1,231 @@
+ Metadata-Version: 2.4
+ Name: web-novel-scraper
+ Version: 1.0.2
+ Summary: Python tool that allows you to scrape web novels from various sources and save them to more readable formats like EPUB.
+ Project-URL: Homepage, https://github.com/ImagineBrkr/web-novel-scraper
+ Project-URL: Documentation, https://web-novel-scraper.readthedocs.io
+ Project-URL: Repository, https://github.com/ImagineBrkr/web-novel-scraper.git
+ Author-email: ImagineBrkr <salvattore_25@hotmail.com>
+ Keywords: Novel Downloader,Scraper,Web Novel,Web Novel Downloader,Web Novel Scraper
+ Requires-Python: >=3.10
+ Requires-Dist: bs4>=0.0.2
+ Requires-Dist: click<9,>=8.0
+ Requires-Dist: dataclasses-json<1,>=0.6.7
+ Requires-Dist: ebooklib<1,>=0.18
+ Requires-Dist: platformdirs
+ Requires-Dist: python-dotenv
+ Requires-Dist: requests
+ Description-Content-Type: text/markdown
+
+ # Web Novel Scraper CLI
+
+ ## Table of Contents
+ - [Introduction](#introduction)
+ - [Installation](#installation)
+ - [Basic Concepts](#basic-concepts)
+ - [Commands](#commands)
+ - [Basic Examples](#basic-examples)
+
+
+ ## Introduction
+ This tool allows you to scrape web novels from various sources. I made it because my hands hurt from scrolling too much.
+
+ ## Installation
+ To install the Web Novel Scraping CLI, you can use pip:
+
+ ```bash
+ pip install web-novel-scraper
+ ```
+ Or you can install it manually:
+
+ 1. Clone the repository:
+ ```bash
+ git clone https://github.com/ImagineBrkr/web-novel-scraper.git
+ ```
+ 2. Navigate to the project directory:
+ ```bash
+ cd web-novel-scraper
+ ```
+ 3. Install the project:
+ ```bash
+ python -m pip install .
+ ```
+ 4. Run the CLI tool:
+ ```bash
+ web-novel-scraper
+ ```
+
+ ## Basic Concepts
+ ### Novel
+ Refers to a novel, which has at least one Table of Contents (there can be more) and chapters.
+ It also has metadata that can be saved, such as author, language, tags, and creation or end date.
+
+ ### Table of Contents (TOC)
+ The source of truth for all the chapters the novel will have. It can come from a main URL (which is requested and saved; if there is more than one page, the additional pages are requested and saved as well), or the HTML can be added directly from local files. All chapters are generated automatically from this TOC.
+
+ ### Chapters
+ A chapter comes from a URL; it is requested and saved as a file on your local machine. Once the file is saved, it does not need to be requested again.
+ From this chapter you can get the title and the chapter content.
+
+ ### Decoder
+ A set of rules used to extract information from a chapter, such as links, content, and title.
+ The host is used to identify which set of rules to apply. It can be set manually or derived from a TOC URL.
+ Example:
+ ```json
+ {
+     "host": "novelbin.me",
+     "has_pagination": false,
+     "title": {
+         "element": "h2 a.chr-title",
+         "id": null,
+         "class": null,
+         "selector": null,
+         "attributes": null,
+         "array": false,
+         "extract": {
+             "type": "attr",
+             "key": "title"
+         }
+     },
+     "content": {
+         "element": "div#chr-content",
+         "id": null,
+         "class": null,
+         "selector": null,
+         "attributes": null,
+         "array": true
+     },
+     "index": {
+         "element": null,
+         "id": null,
+         "class": null,
+         "selector": "ul.list-chapter li a",
+         "attributes": null,
+         "array": true
+     },
+     "next_page": {
+         "element": null,
+         "id": null,
+         "class": null,
+         "selector": null,
+         "attributes": null,
+         "array": true
+     }
+ }
+ ```
+ The decoder uses BeautifulSoup selectors for flexibility: you can specify the element, id, class, CSS selector, and whether multiple tags should be collected (`array`).
+
+ - `has_pagination`: Used if there is a `toc_main_url`, to find the URL of the next page via `next_page`.
+ - `index`: Gets the `href` of all tags found when searching the TOC.
+ - `title` and `content`: The title and content of the chapter, respectively.
+
+ In the example above:
+ - The title is in an `a` tag within an `h2` tag with class `chr-title`, extracting the `title` attribute:
+ ```html
+ <h2><a class="chr-title" href="https://url-of-chapter" title="Chapter 1"><span class="chr-text">Chapter 1</span></a></h2>
+ ```
+ - The content is in a `div` with id `chr-content`:
+ ```html
+ <div id="chr-content" class="chr-c" style="font-family: Arial, sans-serif, serif; font-size: 18px; line-height: 160%; margin-top: 15px;">Content...</div>
+ ```
+ - The URL of each chapter is in the `href` of an `a` tag within an `li` tag, which is within a `ul` tag with class `list-chapter`:
+ ```html
+ <ul class="list-chapter">
+   <li><span class="glyphicon glyphicon-certificate"></span>&nbsp;<a href="https://url-of-chapter-1" title="Chapter 1"><span class="nchr-text chapter-title">Chapter 1</span></a></li>
+ </ul>
+ ```
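+
+ For illustration, here is a minimal sketch of how rules like the ones above map onto BeautifulSoup calls; this is not the package's own `decode.py`, just the idea behind it:
+ ```python
+ from bs4 import BeautifulSoup
+
+ chapter_html = '''
+ <h2><a class="chr-title" href="https://url-of-chapter" title="Chapter 1">Chapter 1</a></h2>
+ <div id="chr-content">Content...</div>
+ '''
+ soup = BeautifulSoup(chapter_html, 'html.parser')
+
+ # "title" rule: element "h2 a.chr-title", extract -> the "title" attribute
+ title_tag = soup.select_one('h2 a.chr-title')
+ title = title_tag['title'] if title_tag else None
+
+ # "content" rule: element "div#chr-content", array -> keep every match
+ content = [tag.get_text() for tag in soup.select('div#chr-content')]
+
+ print(title, content)  # Chapter 1 ['Content...']
+ ```
+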
+ ## Commands
+ The following commands are available in the Web Novel Scraping CLI:
+
+ ```bash
+ Usage: main.py [OPTIONS] COMMAND [ARGS]...
+
+   CLI Tool for web novel scraping.
+
+ Options:
+   --help  Show this message and exit.
+
+ Commands:
+   add-tags               Add tags to a novel.
+   add-toc-html           Add TOC HTML to a novel.
+   clean-files            Clean files of a novel.
+   create-novel           Create a new novel.
+   delete-toc             Delete the TOC of a novel.
+   remove-tags            Remove tags from a novel.
+   request-all-chapters   Request all chapters of a novel.
+   save-novel-to-epub     Save the novel to EPUB format.
+   scrap-chapter          Scrap a chapter of a novel.
+   set-cover-image        Set the cover image for a novel.
+   set-host               Set the host for a novel.
+   set-metadata           Set metadata for a novel.
+   set-scraper-behavior   Set scraper behavior for a novel.
+   set-toc-main-url       Set the main URL for the TOC of a novel.
+   show-chapters          Show chapters of a novel.
+   show-metadata          Show metadata of a novel.
+   show-novel-info        Show information about a novel.
+   show-scraper-behavior  Show scraper behavior of a novel.
+   show-tags              Show tags of a novel.
+   show-toc               Show the TOC of a novel.
+   sync-toc               Sync the TOC of a novel.
+   version                Show program version.
+ ```
+
+ ## Basic Examples
+ Here are some basic examples:
+
+ ### Example 1: Creating a Novel using a main URL
+ ```bash
+ python src/main.py create-novel --title 'Novel 1' --author 'ImagineBrkr' --toc-main-url 'https://page.me/Novel-1/toc' --cover 'cover.jpg'
+ ```
+ Some pages have too much JavaScript, so you can simply copy the HTML to a file manually and create the novel from it:
+ ```bash
+ python src/main.py create-novel --title 'Novel 1' --author 'ImagineBrkr' --toc-html 'toc.html' --host 'page.me' --cover 'cover.jpg'
+ ```
+ If the TOC spans more than one page, you can add the extra pages:
+ ```bash
+ python src/main.py add-toc-html --title 'Novel 1' --toc-html 'toc2.html'
+ ```
+ You can create the chapters from this TOC, or synchronize them if they already exist but new chapters have been added:
+ ```bash
+ python src/main.py sync-toc --title 'Novel 1'
+ ```
+ On Windows, the default data directory is %APPDATA%/ImagineBrkr/web-novel-scraper. All files are saved there, but you can change the location (see the Configuration section below).
+
+ ### Example 2: Requesting files
+ We can now download all the chapters:
+ ```bash
+ python src/main.py request-all-chapters --title 'Novel 1'
+ ```
+
+ ### Example 3: Saving to EPUB
+ Finally, save the novel as an EPUB file:
+ ```bash
+ python src/main.py save-novel-to-epub --title 'Novel 1'
+ ```
+
+ For more detailed usage and options, use `--help` with each command.
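+ For example, to see every option of the `create-novel` command:
+ ```bash
+ python src/main.py create-novel --help
+ ```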
+
+ ## Configuration
+ ### Environment Variables
+
+ The Web Novel Scraping CLI uses the following environment variables for configuration:
+
+ - `SCRAPER_LOGGING_LEVEL`: Sets the logging level for the application. By default no logs are written; it accepts the following levels: DEBUG, INFO, WARNING, ERROR, CRITICAL.
+ ```bash
+ export SCRAPER_LOGGING_LEVEL=INFO
+ ```
+
+ - `SCRAPER_LOGGING_FILE`: Specifies the file where logs will be written. By default, logs go to the terminal.
+ ```bash
+ export SCRAPER_LOGGING_FILE=/path/to/logfile.log
+ ```
+
+ - `SCRAPER_BASE_DATA_DIR`: Defines the base directory for storing novel data. Default is the user data directory.
+ ```bash
+ export SCRAPER_BASE_DATA_DIR=/path/to/data/dir
+ ```
+
+ - `SCRAPER_FLARESOLVER_URL`: URL for the FlareSolverr service. Default is `http://localhost:8191/v1`.
+ ```bash
+ export SCRAPER_FLARESOLVER_URL=http://localhost:8191/v1
+ ```
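+
+ - `FORCE_FLARESOLVER`: Read by `request_manager.py` in this release; when set to `1`, the scraper skips the plain HTTP request and goes straight to FlareSolverr.
+ ```bash
+ export FORCE_FLARESOLVER=1
+ ```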
@@ -0,0 +1,14 @@
+ web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ web_novel_scraper/__main__.py,sha256=PBIAG1vshnRdvYwyeD-RxlHS5mNEby-d4puV9kEyfpA,17615
+ web_novel_scraper/decode.py,sha256=dqGv_8nFSKwO6GBj3jhaO9SQeLHeBjDzoV1C_YcN40k,5085
+ web_novel_scraper/file_manager.py,sha256=PJu8kKeng49DTNQBbbMekFtIcTZOkeCEjFYqYJarv9M,11363
+ web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
+ web_novel_scraper/novel_scraper.py,sha256=eiic2i3AdK9lcFK9aNb4d8ptnKv9ua1B_9kcUY8_liM,28660
+ web_novel_scraper/request_manager.py,sha256=0M_ekBuDCMRGZIWxDbZ_yAwPOxJr2mBpP-Yj8zsE13o,6449
+ web_novel_scraper/utils.py,sha256=vq5ROuPv04k3MhbksTe0ni_yP0i_a7T_33mkBB1DUbQ,2076
+ web_novel_scraper/version.py,sha256=Y3LSfRioSl2xch70pq_ULlvyECXyEtN3krVaWeGyaxk,22
+ web_novel_scraper/decode_guide/decode_guide.json,sha256=Q4v-OZh_1MwdrFxDDVvj8T3evW3zzbSapRaGwFCdnX8,5425
+ web_novel_scraper-1.0.2.dist-info/METADATA,sha256=OBhkSUWS02JIFh4qbsRdS_s_UL15_gAOWAqsQu-Day4,8419
+ web_novel_scraper-1.0.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ web_novel_scraper-1.0.2.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
+ web_novel_scraper-1.0.2.dist-info/RECORD,,
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.27.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
+ [console_scripts]
+ web-novel-scraper = web_novel_scraper.__main__:cli