web-novel-scraper 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- web_novel_scraper/__init__.py +0 -0
- web_novel_scraper/__main__.py +430 -0
- web_novel_scraper/decode.py +141 -0
- web_novel_scraper/decode_guide/decode_guide.json +213 -0
- web_novel_scraper/file_manager.py +292 -0
- web_novel_scraper/logger_manager.py +72 -0
- web_novel_scraper/novel_scraper.py +723 -0
- web_novel_scraper/request_manager.py +135 -0
- web_novel_scraper/utils.py +66 -0
- web_novel_scraper/version.py +1 -0
- web_novel_scraper-1.0.2.dist-info/METADATA +231 -0
- web_novel_scraper-1.0.2.dist-info/RECORD +14 -0
- web_novel_scraper-1.0.2.dist-info/WHEEL +4 -0
- web_novel_scraper-1.0.2.dist-info/entry_points.txt +2 -0
File without changes
|
@@ -0,0 +1,430 @@
|
|
1
|
+
import json
|
2
|
+
from pathlib import Path
|
3
|
+
import sys
|
4
|
+
from datetime import datetime
|
5
|
+
|
6
|
+
import click
|
7
|
+
|
8
|
+
from .file_manager import FileManager
|
9
|
+
from .novel_scraper import Novel
|
10
|
+
from .version import __version__
|
11
|
+
|
12
|
+
CURRENT_DIR = Path(__file__).resolve().parent
|
13
|
+
|
14
|
+
def obtain_novel(novel_title: str, novel_base_dir: str = None, allow_not_exists: bool = False) -> Novel:
    """Load a novel instance from the file system by its title.

    Args:
        novel_title: Title that identifies the novel on disk.
        novel_base_dir: Optional alternative base directory for the novel files.
        allow_not_exists: When True, return None instead of exiting if the
            novel data is missing or unusable.

    Returns:
        The loaded Novel, or None when allow_not_exists is True and no usable
        data was found.

    Exits:
        Terminates the process with status 1 when the novel cannot be loaded
        and allow_not_exists is False.
    """
    file_manager = FileManager(
        novel_title=novel_title, novel_base_dir=novel_base_dir, read_only=True)
    novel_json = file_manager.load_novel_json()
    if novel_json:
        try:
            return Novel.from_json(novel_json)
        except KeyError:
            click.echo(
                'JSON file seems to be manipulated, please check it.', err=True)
        except json.decoder.JSONDecodeError:
            click.echo(
                'JSON file seems to be corrupted, please check it.', err=True)
    if allow_not_exists:
        # Callers such as create-novel treat "missing or unusable" as
        # "does not exist" and handle it themselves.
        return None
    if not novel_json:
        click.echo(
            'Novel with that title does not exist or the main data file was deleted.', err=True)
    # BUG FIX: on a manipulated/corrupted JSON the original fell off the end
    # and implicitly returned None, so every caller then crashed with an
    # AttributeError. Exit explicitly after reporting the error instead.
    sys.exit(1)
|
35
|
+
|
36
|
+
def validate_date(ctx, param, value):
    """Click callback: accept a date string in YYYY, YYYY-MM or YYYY-MM-DD form.

    Returns the value unchanged when it parses (or is empty/None); raises
    click.BadParameter otherwise.
    """
    if not value:
        return value
    # Map the string length onto the only strptime format it could match.
    formats_by_length = {4: '%Y', 7: '%Y-%m', 10: '%Y-%m-%d'}
    date_format = formats_by_length.get(len(value))
    try:
        if date_format is None:
            raise ValueError
        datetime.strptime(value, date_format)
    except ValueError as exc:
        raise click.BadParameter(
            'Date should be a valid date and must be in the format YYYY-MM-DD, YYYY-MM or YYYY') from exc
    return value
|
52
|
+
|
53
|
+
# COMMON ARGUMENTS
# Reusable click options shared by (almost) every command.
title_option = click.option(
    # FIX: help text typo "this server as" -> "this serves as".
    '-t', '--title', type=str, required=True, help='Title of the novel, this serves as the identifier.')
novel_base_dir_option = click.option(
    '-nb', '--novel-base-dir', type=str, help='Alternative base directory for the novel files.')
|
58
|
+
|
59
|
+
@click.group()
def cli():
    """CLI Tool for web novel scraping."""

# Metadata:
# Options that feed Novel.set_metadata().
metadata_author_option = click.option(
    '--author', type=str, help='Name of the novel author.')
metadata_language_option = click.option(
    '--language', type=str, help='Language of the novel.')
metadata_description_option = click.option(
    '--description', type=str, help='Description of the novel.')
metadata_start_date_option = click.option(
    '--start-date', callback=validate_date, type=str, help='Start date of the novel, should be in the format YYYY-MM-DD, YYYY-MM or YYYY.')
metadata_end_date_option = click.option(
    '--end-date', callback=validate_date, type=str, help='End date of the novel, should be in the format YYYY-MM-DD, YYYY-MM or YYYY.')

# TOC options
toc_main_url_option = click.option(
    '--toc-main-url', type=str, help='Main URL of the TOC, required if not loading from file.')
sync_toc_option = click.option('--sync-toc', is_flag=True, default=False, show_default=True,
                               help='Reload the TOC before requesting chapters.')

def create_toc_html_option(required: bool = False):
    """Build the --toc-html option; the help text depends on `required`."""
    if required:
        help_text = 'Novel TOC HTML loaded from file.'
    else:
        help_text = 'Novel TOC HTML loaded from file (required if not loading from URL)'
    return click.option(
        '--toc-html',
        type=click.File(encoding='utf-8'),
        required=required,
        help=help_text
    )

host_option = click.option(
    '--host', type=str, help='Host used for decoding, optional if toc-main-url is provided.')

# Scraper behavior options
save_title_to_content_option = click.option('--save-title-to-content', is_flag=True, show_default=True,
                                            default=False, help='Add the chapter title to the content.')
auto_add_host_option = click.option('--auto-add-host', is_flag=True, show_default=True,
                                    default=False, help='Automatically add the host to chapter URLs.')
force_flaresolver_option = click.option('--force-flaresolver', is_flag=True, show_default=True,
                                        default=False, help='Force the use of FlareSolver for requests.')
|
99
|
+
|
100
|
+
# Novel creation and data management commands

@cli.command()
@title_option
@novel_base_dir_option
@toc_main_url_option
@create_toc_html_option()
@host_option
@metadata_author_option
@metadata_start_date_option
@metadata_end_date_option
@metadata_language_option
@metadata_description_option
@click.option('--tag', 'tags', type=str, help='Novel tag, you can add multiple of them.', multiple=True)
@click.option('--cover', type=str, help='Path of the image to be used as cover.')
@save_title_to_content_option
@auto_add_host_option
@force_flaresolver_option
def create_novel(title, novel_base_dir, toc_main_url, toc_html, host, author, start_date, end_date, language, description, tags, cover, save_title_to_content, auto_add_host, force_flaresolver):
    """Creates a new novel and saves it."""
    # If a novel with this title already exists, ask before overwriting it.
    existing = obtain_novel(title, novel_base_dir, allow_not_exists=True)
    if existing:
        click.confirm(f'A novel with the title {title} already exists, do you want to replace it?', abort=True)
        existing.delete_toc()
    # Exactly one TOC source (URL or HTML file) must be supplied.
    if toc_main_url and toc_html:
        click.echo(
            'You must provide either a TOC URL or a TOC HTML file, not both.', err=True)
        return
    if not toc_main_url and not toc_html:
        click.echo(
            'You must provide either a TOC URL or a TOC HTML file.', err=True)
        return
    # The decoding host can be derived from the TOC URL; otherwise it is mandatory.
    if not host and not toc_main_url:
        click.echo(
            'You must provide a host if you are not providing a TOC URL.', err=True)
        return
    toc_html_content = toc_html.read() if toc_html else None

    novel = Novel(title, toc_main_url=toc_main_url,
                  toc_html=toc_html_content, host=host, novel_base_dir=novel_base_dir)
    novel.set_metadata(author=author, start_date=start_date,
                       end_date=end_date, language=language, description=description)
    novel.set_scraper_behavior(save_title_to_content=save_title_to_content,
                               auto_add_host=auto_add_host, force_flaresolver=force_flaresolver)
    for tag in tags or ():
        novel.add_tag(tag)
    if cover and not novel.set_cover_image(cover):
        click.echo('Error saving the novel cover image.', err=True)
    click.echo('Novel saved successfully.')
|
155
|
+
|
156
|
+
@cli.command()
@title_option
@novel_base_dir_option
def show_novel_info(title, novel_base_dir):
    """Show information about a novel."""
    click.echo(obtain_novel(title, novel_base_dir))

@cli.command()
@title_option
@novel_base_dir_option
@metadata_author_option
@metadata_start_date_option
@metadata_end_date_option
@metadata_language_option
@metadata_description_option
def set_metadata(title, novel_base_dir, author, start_date, end_date, language, description):
    """Set metadata for a novel."""
    novel = obtain_novel(title, novel_base_dir)
    novel.set_metadata(author=author, start_date=start_date,
                       end_date=end_date, language=language, description=description)
    click.echo('Novel metadata saved successfully.')
    click.echo(novel.metadata)

@cli.command()
@title_option
@novel_base_dir_option
def show_metadata(title, novel_base_dir):
    """Show metadata of a novel."""
    click.echo(obtain_novel(title, novel_base_dir).metadata)

@cli.command()
@title_option
@novel_base_dir_option
@click.option('--tag', 'tags', type=str, help='Tag to be added', multiple=True)
def add_tags(title, novel_base_dir, tags):
    """Add tags to a novel."""
    novel = obtain_novel(title, novel_base_dir)
    for tag in tags:
        # add_tag returns False when the tag was already present.
        if not novel.add_tag(tag):
            click.echo(f'Tag {tag} already exists', err=True)
    click.echo(f'Tags: {", ".join(novel.metadata.tags)}')

@cli.command()
@title_option
@novel_base_dir_option
@click.option('--tag', 'tags', type=str, help='Tag to be removed.', multiple=True)
def remove_tags(title, novel_base_dir, tags):
    """Remove tags from a novel."""
    novel = obtain_novel(title, novel_base_dir)
    for tag in tags:
        # remove_tag returns False when the tag was not present.
        if not novel.remove_tag(tag):
            click.echo(f'Tag {tag} does not exist.', err=True)
    click.echo(f'Tags: {", ".join(novel.metadata.tags)}')

@cli.command()
@title_option
@novel_base_dir_option
def show_tags(title, novel_base_dir):
    """Show tags of a novel."""
    novel = obtain_novel(title, novel_base_dir)
    click.echo(f'Tags: {", ".join(novel.metadata.tags)}')
|
219
|
+
|
220
|
+
@cli.command()
@title_option
@novel_base_dir_option
@click.option('--cover-image', type=str, required=True, help='Filepath of the cover image.')
def set_cover_image(title, novel_base_dir, cover_image):
    """Set the cover image for a novel."""
    novel = obtain_novel(title, novel_base_dir)
    if novel.set_cover_image(cover_image):
        click.echo('New cover image set successfully.')
    else:
        click.echo('Error saving the cover image.', err=True)

@cli.command()
@title_option
@novel_base_dir_option
@click.option('--save-title-to-content', type=bool, help='Toggle the title of the chapter being added to the content (use true or false).')
@click.option('--auto-add-host', type=bool, help='Toggle automatic addition of the host to chapter URLs (use true or false).')
@click.option('--force-flaresolver', type=bool, help='Toggle forcing the use of FlareSolver (use true or false).')
@click.option('--hard-clean', type=bool, help='Toggle using a hard clean when cleaning HTML files (use true or false).')
def set_scraper_behavior(title, novel_base_dir, save_title_to_content, auto_add_host, force_flaresolver, hard_clean):
    """Set scraper behavior for a novel."""
    novel = obtain_novel(title, novel_base_dir)
    # Options left unset arrive as None and are ignored by the setter.
    novel.set_scraper_behavior(
        save_title_to_content=save_title_to_content,
        auto_add_host=auto_add_host,
        force_flaresolver=force_flaresolver,
        hard_clean=hard_clean
    )
    click.echo('New scraper behavior added successfully.')
|
249
|
+
|
250
|
+
@cli.command()
@title_option
@novel_base_dir_option
def show_scraper_behavior(title, novel_base_dir):
    """Show scraper behavior of a novel."""
    click.echo(obtain_novel(title, novel_base_dir).scraper_behavior)

@cli.command()
@title_option
@novel_base_dir_option
@host_option
def set_host(title, novel_base_dir, host):
    """Set the host for a novel."""
    obtain_novel(title, novel_base_dir).set_host(host)
    click.echo('New host set successfully.')
|
267
|
+
|
268
|
+
# TOC MANAGEMENT COMMANDS

@cli.command()
@title_option
@novel_base_dir_option
@click.option('--toc-main-url', type=str, required=True, help='New TOC main URL (Previous links will be deleted).')
def set_toc_main_url(title, novel_base_dir, toc_main_url):
    """Set the main URL for the TOC of a novel."""
    obtain_novel(title, novel_base_dir).set_toc_main_url(toc_main_url)

@cli.command()
@title_option
@novel_base_dir_option
@create_toc_html_option(required=True)
@host_option
def add_toc_html(title, novel_base_dir, toc_html, host):
    """Add TOC HTML to a novel."""
    novel = obtain_novel(title, novel_base_dir)
    novel.add_toc_html(toc_html.read(), host)

@cli.command()
@title_option
@novel_base_dir_option
@click.option('--reload-files', is_flag=True, required=False, default=False, show_default=True, help='Reload the TOC files before sync (only works if using a TOC URL).')
def sync_toc(title, novel_base_dir, reload_files):
    """Sync the TOC of a novel."""
    novel = obtain_novel(title, novel_base_dir)
    synced = novel.sync_toc(reload_files)
    if synced:
        click.echo(
            'Table of Contents synced with files, to see the new TOC use the command show-toc.')
    else:
        click.echo(
            'Error with the TOC syncing, please check the TOC files and decoding options.', err=True)

@cli.command()
@title_option
@novel_base_dir_option
@click.option('--auto-approve', is_flag=True, required=False, default=False, show_default=True, help='Auto approve.')
def delete_toc(title, novel_base_dir, auto_approve):
    """Delete the TOC of a novel."""
    novel = obtain_novel(title, novel_base_dir)
    if not auto_approve:
        # Deleting the TOC is destructive; make the user confirm it.
        click.confirm(f'Are you sure you want to delete the TOC for {title}?', abort=True)
    novel.delete_toc()

@cli.command()
@title_option
@novel_base_dir_option
def show_toc(title, novel_base_dir):
    """Show the TOC of a novel."""
    click.echo(obtain_novel(title, novel_base_dir).show_toc())
|
322
|
+
|
323
|
+
# CHAPTER MANAGEMENT COMMANDS

@cli.command()
@title_option
@novel_base_dir_option
@click.option('--chapter-url', type=str, required=False, help='Chapter URL to be scrapped.')
@click.option('--chapter-num', type=int, required=False, help='Chapter number to be scrapped.')
@click.option('--update-html', is_flag=True, default=False, show_default=True, help='If the chapter HTML is saved, it will be updated.')
def scrap_chapter(title, novel_base_dir, chapter_url, chapter_num, update_html):
    """Scrap a chapter of a novel."""
    novel = obtain_novel(title, novel_base_dir)
    # BUG FIX: the original echoed these errors but kept going, then evaluated
    # `chapter_num <= 0` with chapter_num=None (TypeError) whenever only a
    # chapter URL was given. Return early on the error cases and only validate
    # or convert chapter_num when it was actually provided.
    if not chapter_url and not chapter_num:
        click.echo('Chapter URL or chapter number should be set.', err=True)
        return
    if chapter_num and chapter_url:
        click.echo('It should be either chapter URL or chapter number.', err=True)
        return
    chapter_idx = None
    if chapter_num is not None:
        if chapter_num <= 0 or chapter_num > len(novel.chapters):
            raise click.BadParameter(
                'Chapter number should be positive and an existing chapter.', param_hint='--chapter-num')
        # Chapter numbers are 1-based on the CLI, 0-based internally.
        chapter_idx = chapter_num - 1
    # NOTE(review): assumes Novel.scrap_chapter accepts chapter_idx=None when
    # a chapter_url is supplied — confirm against novel_scraper.py.
    chapter = novel.scrap_chapter(
        chapter_url=chapter_url, chapter_idx=chapter_idx, update_html=update_html)
    if not chapter:
        click.echo('Chapter number or URL not found.', err=True)
        return
    click.echo(chapter)
    click.echo('Content:')
    click.echo(chapter.chapter_content)
|
349
|
+
|
350
|
+
@cli.command()
@title_option
@novel_base_dir_option
@sync_toc_option
@click.option('--update-html', is_flag=True, default=False, show_default=True, help='If the chapter HTML is saved, it will be updated.')
@click.option('--clean-chapters', is_flag=True, default=False, show_default=True, help='If the chapter HTML should be cleaned upon saving.')
def request_all_chapters(title, novel_base_dir, sync_toc, update_html, clean_chapters):
    """Request all chapters of a novel."""
    novel = obtain_novel(title, novel_base_dir)
    novel.request_all_chapters(sync_toc=sync_toc,
                               update_html=update_html,
                               clean_chapters=clean_chapters)
    click.echo('All chapters requested and saved.')

@cli.command()
@title_option
@novel_base_dir_option
def show_chapters(title, novel_base_dir):
    """Show chapters of a novel."""
    click.echo(obtain_novel(title, novel_base_dir).show_chapters())
|
370
|
+
|
371
|
+
@cli.command()
@title_option
@novel_base_dir_option
@sync_toc_option
@click.option('--start-chapter', type=int, default=1, show_default=True, help='The start chapter for the books (position in the TOC, may differ from the actual number).')
@click.option('--end-chapter', type=int, default=None, show_default=True, help='The end chapter for the books (if not defined, every chapter will be saved).')
@click.option('--chapters-by-book', type=int, default=100, show_default=True, help='The number of chapters each book will have.')
def save_novel_to_epub(title, novel_base_dir, sync_toc, start_chapter, end_chapter, chapters_by_book):
    """Save the novel to EPUB format."""
    # Validate the chapter range before touching the file system.
    if start_chapter <= 0:
        raise click.BadParameter(
            'Should be a positive number.', param_hint='--start-chapter')
    if end_chapter is not None and (end_chapter < start_chapter or end_chapter <= 0):
        raise click.BadParameter(
            'Should be a positive number and bigger than the start chapter.', param_hint='--end-chapter')
    if chapters_by_book is not None and chapters_by_book <= 0:
        raise click.BadParameter(
            'Should be a positive number.', param_hint='--chapters-by-book')

    novel = obtain_novel(title, novel_base_dir)
    saved = novel.save_novel_to_epub(sync_toc=sync_toc, start_chapter=start_chapter,
                                     end_chapter=end_chapter, chapters_by_book=chapters_by_book)
    if saved:
        click.echo('All books saved.')
    else:
        click.echo('Error saving EPUB.')
|
397
|
+
|
398
|
+
# UTILS

@cli.command()
@title_option
@novel_base_dir_option
@click.option('--clean-chapters', is_flag=True, default=False, show_default=True, help='If the chapters HTML files are cleaned.')
@click.option('--clean-toc', is_flag=True, default=False, show_default=True, help='If the TOC files are cleaned.')
@click.option('--hard-clean', is_flag=True, default=False, show_default=True, help='If the files are more deeply cleaned.')
def clean_files(title, novel_base_dir, clean_chapters, clean_toc, hard_clean):
    """Clean files of a novel."""
    # At least one target (chapters or TOC) must be selected.
    if not (clean_chapters or clean_toc):
        click.echo(
            'You must choose at least one of the options: --clean-chapters, --clean-toc.', err=True)
        return
    novel = obtain_novel(title, novel_base_dir)
    novel.clean_files(clean_chapters=clean_chapters,
                      clean_toc=clean_toc, hard_clean=hard_clean)

@cli.command()
@title_option
@novel_base_dir_option
def show_novel_dir(title, novel_base_dir):
    """Show the directory where the novel is saved."""
    click.echo(obtain_novel(title, novel_base_dir).show_novel_dir())

@cli.command()
def version():
    """Show program version."""
    click.echo(f'Version {__version__}')

if __name__ == '__main__':
    cli()
|
@@ -0,0 +1,141 @@
|
|
1
|
+
import os
import json
from pathlib import Path

from . import logger_manager

from bs4 import BeautifulSoup

logger = logger_manager.create_logger('DECODE HTML')

CURRENT_DIR = Path(__file__).resolve().parent

# Path to the per-host decode rules; overridable via environment variable.
# BUG FIX: the default was previously an f-string whose replacement field
# spanned two physical lines, which is a SyntaxError on every Python < 3.12
# (multi-line replacement fields were only allowed by PEP 701). Keep the
# f-string on a single line so the module imports on all supported versions.
DECODE_GUIDE_FILE = os.getenv(
    'DECODE_GUIDE_FILE', f'{CURRENT_DIR}/decode_guide/decode_guide.json')

# Separator used inside a guide 'selector' to express alternative selectors.
XOR_SEPARATOR = "XOR"

# Load the decode guide once at import time; any failure is fatal because the
# whole module is useless without it, so log and re-raise.
try:
    with open(DECODE_GUIDE_FILE, 'r', encoding='UTF-8') as f:
        DECODE_GUIDE = json.load(f)
except FileNotFoundError:
    logger.error(f"File {DECODE_GUIDE_FILE} not found.")
    raise
except PermissionError:
    logger.error(f"Permission error {DECODE_GUIDE_FILE}.")
    raise
except json.JSONDecodeError:
    logger.error(f"Json Decode error {DECODE_GUIDE_FILE}.")
    raise
except Exception as e:
    logger.error(f"Error {DECODE_GUIDE_FILE}: {e}")
    raise
|
33
|
+
|
34
|
+
|
35
|
+
class Decoder:
|
36
|
+
host: str
|
37
|
+
decode_guide: json
|
38
|
+
|
39
|
+
def __init__(self, host: str):
|
40
|
+
self.host = host
|
41
|
+
self.decode_guide = self._get_element_by_key(
|
42
|
+
DECODE_GUIDE, 'host', host)
|
43
|
+
|
44
|
+
def decode_html(self, html: str, content_type: str):
|
45
|
+
if not content_type in self.decode_guide:
|
46
|
+
logger.error(f'{content_type} key does not exists on decode guide {
|
47
|
+
DECODE_GUIDE_FILE} for host {self.host}')
|
48
|
+
return
|
49
|
+
soup = BeautifulSoup(html, 'html.parser')
|
50
|
+
decoder = self.decode_guide[content_type]
|
51
|
+
elements = self._find_elements(soup, decoder)
|
52
|
+
if not elements:
|
53
|
+
logger.warning(f'{content_type} not found on html using {
|
54
|
+
DECODE_GUIDE_FILE} for host {self.host}')
|
55
|
+
return elements
|
56
|
+
|
57
|
+
def has_pagination(self, host: str = None):
|
58
|
+
if host:
|
59
|
+
decode_guide = self._get_element_by_key(DECODE_GUIDE, 'host', host)
|
60
|
+
return decode_guide['has_pagination']
|
61
|
+
|
62
|
+
return self.decode_guide['has_pagination']
|
63
|
+
|
64
|
+
def clean_html(self, html: str, hard_clean: bool = False):
|
65
|
+
tags_for_soft_clean = ['script', 'style', 'link',
|
66
|
+
'form', 'meta', 'hr', 'noscript', 'button']
|
67
|
+
tags_for_hard_clean = ['header', 'footer', 'nav', 'aside', 'iframe', 'object', 'embed', 'svg', 'canvas', 'map', 'area',
|
68
|
+
'audio', 'video', 'track', 'source', 'applet', 'frame', 'frameset', 'noframes', 'noembed', 'blink', 'marquee']
|
69
|
+
|
70
|
+
tags_for_custom_clean = []
|
71
|
+
if 'clean' in self.decode_guide:
|
72
|
+
tags_for_custom_clean = self.decode_guide['clean']
|
73
|
+
|
74
|
+
tags_for_clean = tags_for_soft_clean + tags_for_custom_clean
|
75
|
+
if hard_clean:
|
76
|
+
tags_for_clean += tags_for_hard_clean
|
77
|
+
|
78
|
+
soup = BeautifulSoup(html, 'html.parser')
|
79
|
+
for unwanted_tags in soup(tags_for_clean):
|
80
|
+
unwanted_tags.decompose()
|
81
|
+
|
82
|
+
return "\n".join([line.strip() for line in str(soup).splitlines() if line.strip()])
|
83
|
+
|
84
|
+
def _find_elements(self, soup: BeautifulSoup, decoder: dict):
|
85
|
+
selector = decoder.get('selector')
|
86
|
+
if selector is None:
|
87
|
+
selector = ''
|
88
|
+
element = decoder.get('element')
|
89
|
+
_id = decoder.get('id')
|
90
|
+
_class = decoder.get('class')
|
91
|
+
attributes = decoder.get('attributes')
|
92
|
+
|
93
|
+
if element:
|
94
|
+
selector += element
|
95
|
+
if _id:
|
96
|
+
selector += f'#{_id}'
|
97
|
+
if _class:
|
98
|
+
selector += f'.{_class}'
|
99
|
+
if attributes:
|
100
|
+
for attr, value in attributes.items():
|
101
|
+
selector += f'[{attr}="{value}"]' if value else f'[{attr}]'
|
102
|
+
selectors = [selector]
|
103
|
+
else:
|
104
|
+
if XOR_SEPARATOR in selector:
|
105
|
+
selectors = selector.split(XOR_SEPARATOR)
|
106
|
+
else:
|
107
|
+
selectors = [selector]
|
108
|
+
|
109
|
+
for selector in selectors:
|
110
|
+
logger.debug(f'Attempt using selector {selector}')
|
111
|
+
elements = soup.select(selector)
|
112
|
+
if elements:
|
113
|
+
logger.debug(f'{len(elements)} found using selector {selector}')
|
114
|
+
break
|
115
|
+
|
116
|
+
extract = decoder.get('extract')
|
117
|
+
if extract:
|
118
|
+
if extract["type"] == "attr":
|
119
|
+
attr_key = extract["key"]
|
120
|
+
elements_aux = elements
|
121
|
+
elements = []
|
122
|
+
for element in elements_aux:
|
123
|
+
try:
|
124
|
+
attr = element[attr_key]
|
125
|
+
if attr:
|
126
|
+
elements.append(attr)
|
127
|
+
except KeyError:
|
128
|
+
pass
|
129
|
+
if extract["type"] == "text":
|
130
|
+
elements = [element.string for element in elements]
|
131
|
+
inverted = decoder.get('inverted')
|
132
|
+
if inverted:
|
133
|
+
elements = elements[::-1]
|
134
|
+
return elements if decoder.get('array') else elements[0] if elements else None
|
135
|
+
|
136
|
+
def _get_element_by_key(self, json_data, key, value):
|
137
|
+
for item in json_data:
|
138
|
+
if item[key] == value:
|
139
|
+
return item
|
140
|
+
logger.warning('Host not found, using default decoder.')
|
141
|
+
return json_data[0]
|