PyPI - web-novel-scraper - Versions diffs - 2.0.3__py3-none-any.whl → 2.1.1__py3-none-any.whl - Mend

web-novel-scraper 2.0.3__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

web_novel_scraper/__main__.py +123 -69
web_novel_scraper/config_manager.py +12 -12
web_novel_scraper/custom_processor/__init__.py +1 -1
web_novel_scraper/custom_processor/sites/fanmtl.py +15 -0
web_novel_scraper/decode.py +225 -80
web_novel_scraper/decode_guide/decode_guide.json +28 -0
web_novel_scraper/file_manager.py +292 -110
web_novel_scraper/models.py +76 -0
web_novel_scraper/novel_scraper.py +893 -424
web_novel_scraper/request_manager.py +50 -17
web_novel_scraper/utils.py +22 -1
web_novel_scraper/version.py +1 -1
{web_novel_scraper-2.0.3.dist-info → web_novel_scraper-2.1.1.dist-info}/METADATA +1 -1
web_novel_scraper-2.1.1.dist-info/RECORD +21 -0
web_novel_scraper-2.0.3.dist-info/RECORD +0 -19
{web_novel_scraper-2.0.3.dist-info → web_novel_scraper-2.1.1.dist-info}/WHEEL +0 -0
{web_novel_scraper-2.0.3.dist-info → web_novel_scraper-2.1.1.dist-info}/entry_points.txt +0 -0

web_novel_scraper/__main__.py CHANGED Viewed

@@ -1,27 +1,41 @@
 from pathlib import Path
 from datetime import datetime
-from typing import Optional
 import click
 from .config_manager import ScraperConfig
 from .novel_scraper import Novel
+from .models import Chapter
+from .utils import ValidationError, ScraperError, NetworkError, DecodeError, FileManagerError
 from .version import __version__
 CURRENT_DIR = Path(__file__).resolve().parent
 def global_options(f):
-    f = click.option('-nb', '--novel-base-dir', type=click.Path(), required=False, help="Alternative directory for this novel.")(f)
+    f = click.option('-nb', '--novel-base-dir', type=click.Path(), required=False,
+                     help="Alternative directory for this novel.")(f)
     f = click.option('--config-file', type=click.Path(), required=False, help="Path to config file.")(f)
-    f = click.option('--base-novels-dir', type=click.Path(), required=False, help="Alternative base directory for all novels.")(f)
-    f = click.option('--decode-guide-file', type=click.Path(), required=False, help="Path to alternative decode guide file.")(f)
+    f = click.option('--base-novels-dir', type=click.Path(), required=False,
+                     help="Alternative base directory for all novels.")(f)
+    f = click.option('--decode-guide-file', type=click.Path(), required=False,
+                     help="Path to alternative decode guide file.")(f)
     return f
+@click.group()
+@global_options
+@click.pass_context
+def cli(ctx, **kwargs):
+    """CLI Tool for web novel scraping."""
+    ctx.obj = kwargs
 def obtain_novel(title, ctx_opts, allow_missing=False):
-    cfg = ScraperConfig(ctx_opts.get("CONFIG_FILE"), ctx_opts.get("BASE_NOVELS_DIR"))
+    cfg = ScraperConfig(parameters=ctx_opts)
     try:
-        return Novel.load(title, cfg, ctx_opts.get("NOVEL_BASE_DIR"))
-    except ValueError:
+        return Novel.load(title, cfg, ctx_opts.get("novel_base_dir"))
+    except ValidationError:
         if allow_missing:
             return None
         click.echo("Novel not found.", err=True)
@@ -45,23 +59,14 @@ def validate_date(ctx, param, value):
                 'Date should be a valid date and must be in the format YYYY-MM-DD, YYYY-MM or YYYY') from exc
     return value
 # COMMON ARGUMENTS
 title_option = click.option(
-    '-t', '--title', type=str, required=True, envvar='SCRAPER_NOVEL_TITLE', help='Title of the novel, this server as the identifier.')
+    '-t', '--title', type=str, required=True, envvar='SCRAPER_NOVEL_TITLE',
+    help='Title of the novel, this server as the identifier.')
 novel_base_dir_option = click.option(
     '-nb', '--novel-base-dir', type=str, help='Alternative base directory for the novel files.')
-@click.group()
-@global_options
-@click.pass_context
-def cli(ctx, novel_base_dir, config_file, base_novels_dir, decode_guide_file):
-    """CLI Tool for web novel scraping."""
-    ctx.ensure_object(dict)
-    ctx.obj['NOVEL_BASE_DIR'] = novel_base_dir
-    ctx.obj['CONFIG_FILE'] = config_file
-    ctx.obj['BASE_NOVELS_DIR'] = base_novels_dir
-    ctx.obj['DECODE_GUIDE_FILE'] = decode_guide_file
 # Metadata:
 metadata_author_option = click.option(
     '--author', type=str, help='Name of the novel author.')
@@ -70,9 +75,11 @@ metadata_language_option = click.option(
 metadata_description_option = click.option(
     '--description', type=str, help='Description of the novel.')
 metadata_start_date_option = click.option(
-    '--start-date', callback=validate_date, type=str, help='Start date of the novel, should be in the format YYYY-MM-DD, YYYY-MM or YYYY.')
+    '--start-date', callback=validate_date, type=str,
+    help='Start date of the novel, should be in the format YYYY-MM-DD, YYYY-MM or YYYY.')
 metadata_end_date_option = click.option(
-    '--end-date', callback=validate_date, type=str, help='End date of the novel, should be in the format YYYY-MM-DD, YYYY-MM or YYYY.')
+    '--end-date', callback=validate_date, type=str,
+    help='End date of the novel, should be in the format YYYY-MM-DD, YYYY-MM or YYYY.')
 # TOC options
 toc_main_url_option = click.option(
@@ -80,14 +87,17 @@ toc_main_url_option = click.option(
 sync_toc_option = click.option('--sync-toc', is_flag=True, default=False, show_default=True,
                                help='Reload the TOC before requesting chapters.')
 def create_toc_html_option(required: bool = False):
     return click.option(
         '--toc-html',
         type=click.File(encoding='utf-8'),
         required=required,
-        help=('Novel TOC HTML loaded from file.' if required else 'Novel TOC HTML loaded from file (required if not loading from URL)')
+        help=(
+            'Novel TOC HTML loaded from file.' if required else 'Novel TOC HTML loaded from file (required if not loading from URL)')
     )
 host_option = click.option(
     '--host', type=str, help='Host used for decoding, optional if toc-main-url is provided.')
@@ -99,6 +109,7 @@ auto_add_host_option = click.option('--auto-add-host', is_flag=True, show_defaul
 force_flaresolver_option = click.option('--force-flaresolver', is_flag=True, show_default=True,
                                         default=False, help='Force the use of FlareSolver for requests.')
 # Novel creation and data management commands
 @cli.command()
@@ -117,7 +128,8 @@ force_flaresolver_option = click.option('--force-flaresolver', is_flag=True, sho
 @save_title_to_content_option
 @auto_add_host_option
 @force_flaresolver_option
-def create_novel(ctx, title, toc_main_url, toc_html, host, author, start_date, end_date, language, description, tags, cover, save_title_to_content, auto_add_host, force_flaresolver):
+def create_novel(ctx, title, toc_main_url, toc_html, host, author, start_date, end_date, language, description, tags,
+                 cover, save_title_to_content, auto_add_host, force_flaresolver):
     """Creates a new novel and saves it."""
     novel = obtain_novel(title, ctx.obj, allow_missing=True)
     if novel:
@@ -140,29 +152,35 @@ def create_novel(ctx, title, toc_main_url, toc_html, host, author, start_date, e
     toc_html_content = None
     if toc_html:
         toc_html_content = toc_html.read()
-    novel = Novel(title=title,
-                  toc_main_url=toc_main_url,
-                  toc_html=toc_html_content,
-                  host=host
-                  )
-    novel.set_config(config_file=ctx.obj.get('CONFIG_FILE'),
-                     base_novels_dir=ctx.obj.get('BASE_NOVELS_DIR'),
-                     novel_base_dir=ctx.obj.get('NOVEL_BASE_DIR'),
-                     decode_guide_file=ctx.obj.get('DECODE_GUIDE_FILE')
-                    )
-    novel.set_metadata(author=author, start_date=start_date,
-                       end_date=end_date, language=language, description=description)
+    config = ScraperConfig(parameters=ctx.obj)
+    novel = Novel.new(title=title,
+                      cfg=config,
+                      host=host,
+                      toc_main_url=toc_main_url,
+                      toc_html=toc_html_content)
+    novel.set_config(cfg=config,
+                     novel_base_dir=ctx.obj.get('novel_base_dir'))
+    novel.set_metadata(author=author,
+                       start_date=start_date,
+                       end_date=end_date,
+                       language=language,
+                       description=description)
     novel.set_scraper_behavior(save_title_to_content=save_title_to_content,
-                                auto_add_host=auto_add_host, force_flaresolver=force_flaresolver)
+                               auto_add_host=auto_add_host,
+                               force_flaresolver=force_flaresolver)
     if tags:
         for tag in tags:
             novel.add_tag(tag)
     if cover:
         if not novel.set_cover_image(cover):
             click.echo('Error saving the novel cover image.', err=True)
     novel.save_novel()
     click.echo('Novel saved successfully.')
 @cli.command()
 @click.pass_context
 @title_option
@@ -171,6 +189,7 @@ def show_novel_info(ctx, title):
     novel = obtain_novel(title, ctx.obj)
     click.echo(novel)
 @cli.command()
 @click.pass_context
 @title_option
@@ -188,6 +207,7 @@ def set_metadata(ctx, title, author, start_date, end_date, language, description
     click.echo('Novel metadata saved successfully.')
     click.echo(novel.metadata)
 @cli.command()
 @click.pass_context
 @title_option
@@ -196,6 +216,7 @@ def show_metadata(ctx, title):
     novel = obtain_novel(title, ctx.obj)
     click.echo(novel.metadata)
 @cli.command()
 @click.pass_context
 @title_option
@@ -209,6 +230,7 @@ def add_tags(ctx, title, tags):
     novel.save_novel()
     click.echo(f'Tags: {", ".join(novel.metadata.tags)}')
 @cli.command()
 @click.pass_context
 @title_option
@@ -222,6 +244,7 @@ def remove_tags(ctx, title, tags):
     novel.save_novel()
     click.echo(f'Tags: {", ".join(novel.metadata.tags)}')
 @cli.command()
 @click.pass_context
 @title_option
@@ -230,6 +253,7 @@ def show_tags(ctx, title):
     novel = obtain_novel(title, ctx.obj)
     click.echo(f'Tags: {", ".join(novel.metadata.tags)}')
 @cli.command()
 @click.pass_context
 @title_option
@@ -240,11 +264,14 @@ def set_cover_image(ctx, title, cover_image):
     novel.set_cover_image(cover_image)
     click.echo(f'Cover image saved successfully.')
 @cli.command()
 @click.pass_context
 @title_option
-@click.option('--save-title-to-content', type=bool, help='Toggle the title of the chapter being added to the content (use true or false).')
-@click.option('--auto-add-host', type=bool, help='Toggle automatic addition of the host to chapter URLs (use true or false).')
+@click.option('--save-title-to-content', type=bool,
+              help='Toggle the title of the chapter being added to the content (use true or false).')
+@click.option('--auto-add-host', type=bool,
+              help='Toggle automatic addition of the host to chapter URLs (use true or false).')
 @click.option('--force-flaresolver', type=bool, help='Toggle forcing the use of FlareSolver (use true or false).')
 @click.option('--hard-clean', type=bool, help='Toggle using a hard clean when cleaning HTML files (use true or false).')
 def set_scraper_behavior(ctx, title, save_title_to_content, auto_add_host, force_flaresolver, hard_clean):
@@ -259,6 +286,7 @@ def set_scraper_behavior(ctx, title, save_title_to_content, auto_add_host, force
     novel.save_novel()
     click.echo('New scraper behavior added successfully.')
 @cli.command()
 @click.pass_context
 @title_option
@@ -267,6 +295,7 @@ def show_scraper_behavior(ctx, title):
     novel = obtain_novel(title, ctx.obj)
     click.echo(novel.scraper_behavior)
 @cli.command()
 @click.pass_context
 @title_option
@@ -278,6 +307,7 @@ def set_host(ctx, title, host):
     novel.save_novel()
     click.echo('New host set successfully.')
 # TOC MANAGEMENT COMMANDS
 @cli.command()
@@ -290,6 +320,7 @@ def set_toc_main_url(ctx, title, toc_main_url):
     novel.set_toc_main_url(toc_main_url)
     novel.save_novel()
 @cli.command()
 @click.pass_context
 @title_option
@@ -302,10 +333,12 @@ def add_toc_html(ctx, title, toc_html, host):
     novel.add_toc_html(html_content, host)
     novel.save_novel()
 @cli.command()
 @click.pass_context
 @title_option
-@click.option('--reload-files', is_flag=True, required=False, default=False, show_default=True, help='Reload the TOC files before sync (only works if using a TOC URL).')
+@click.option('--reload-files', is_flag=True, required=False, default=False, show_default=True,
+              help='Reload the TOC files before sync (only works if using a TOC URL).')
 def sync_toc(ctx, title, reload_files):
     """Sync the TOC of a novel."""
     novel = obtain_novel(title, ctx.obj)
@@ -317,6 +350,7 @@ def sync_toc(ctx, title, reload_files):
             'Error with the TOC syncing, please check the TOC files and decoding options.', err=True)
     novel.save_novel()
 @cli.command()
 @click.pass_context
 @title_option
@@ -329,6 +363,7 @@ def delete_toc(ctx, title, auto_approve):
     novel.delete_toc()
     novel.save_novel()
 @cli.command()
 @click.pass_context
 @title_option
@@ -337,6 +372,7 @@ def show_toc(ctx, title):
     novel = obtain_novel(title, ctx.obj)
     click.echo(novel.show_toc())
 # CHAPTER MANAGEMENT COMMANDS
 @cli.command()
@@ -344,46 +380,55 @@ def show_toc(ctx, title):
 @title_option
 @click.option('--chapter-url', type=str, required=False, help='Chapter URL to be scrapped.')
 @click.option('--chapter-num', type=int, required=False, help='Chapter number to be scrapped.')
-@click.option('--update-html', is_flag=True, default=False, show_default=True, help='If the chapter HTML is saved, it will be updated.')
+@click.option('--update-html', is_flag=True, default=False, show_default=True,
+              help='If the chapter HTML is saved, it will be updated.')
 def scrap_chapter(ctx, title, chapter_url, chapter_num, update_html):
     """Scrap a chapter of a novel."""
-    if (chapter_url is None and chapter_num is None) or (chapter_url and chapter_num):
-        raise click.UsageError("You must set exactly one: --chapter-url o --chapter-num.")
     novel = obtain_novel(title, ctx.obj)
+    try:
+        if chapter_num is not None:
+            chapter_num = chapter_num - 1
+        chapter = novel.get_chapter(chapter_index=chapter_num,
+                                    chapter_url=chapter_url)
+    except ValidationError:
+        raise click.UsageError(
+            'You must set exactly one: --chapter-url o --chapter-num.')
+    except ValueError:
+        raise click.UsageError('--chapter-num must be a positive number.')
-    if chapter_num is not None:
-        if chapter_num <= 0 or chapter_num > len(novel.chapters):
-            raise click.BadParameter(
-                'Chapter number should be positive and an existing chapter.', param_hint='--chapter-num')
-        chapter = novel.scrap_chapter(chapter_idx=chapter_num - 1,
-                                      update_html=update_html)
-    else:
-        chapter = novel.scrap_chapter(chapter_url=chapter_url,
-                                      update_html=update_html)
-    if not chapter:
-        raise click.ClickException('Chapter not found or scrap failed.')
+    if chapter is None:
+        if chapter_url is not None:
+            click.echo('Chapter not found on novel TOC, will try anyways with chapter url')
+            chapter = Chapter(chapter_url=chapter_url)
+        else:
+            raise click.ClickException('Chapter not found.')
+    chapter = novel.scrap_chapter(chapter=chapter,
+                                  reload_file=update_html)
     click.echo(chapter)
     click.echo('Content:')
     click.echo(chapter.chapter_content)
 @cli.command()
 @click.pass_context
 @title_option
 @sync_toc_option
-@click.option('--update-html', is_flag=True, default=False, show_default=True, help='If the chapter HTML is saved, it will be updated.')
-@click.option('--clean-chapters', is_flag=True, default=False, show_default=True, help='If the chapter HTML should be cleaned upon saving.')
+@click.option('--update-html', is_flag=True, default=False, show_default=True,
+              help='If the chapter HTML is saved, it will be updated.')
+@click.option('--clean-chapters', is_flag=True, default=False, show_default=True,
+              help='If the chapter HTML should be cleaned upon saving.')
 def request_all_chapters(ctx, title, sync_toc, update_html, clean_chapters):
     """Request all chapters of a novel."""
     novel = obtain_novel(title, ctx.obj)
     novel.request_all_chapters(
-        sync_toc=sync_toc, update_html=update_html, clean_chapters=clean_chapters)
+        sync_toc=sync_toc,
+        reload_files=update_html,
+        clean_chapters=clean_chapters)
     novel.save_novel()
     click.echo('All chapters requested and saved.')
 @cli.command()
 @click.pass_context
 @title_option
@@ -398,9 +443,12 @@ def show_chapters(ctx, title):
 @click.pass_context
 @title_option
 @sync_toc_option
-@click.option('--start-chapter', type=int, default=1, show_default=True, help='The start chapter for the books (position in the TOC, may differ from the actual number).')
-@click.option('--end-chapter', type=int, default=None, show_default=True, help='The end chapter for the books (if not defined, every chapter will be saved).')
-@click.option('--chapters-by-book', type=int, default=100, show_default=True, help='The number of chapters each book will have.')
+@click.option('--start-chapter', type=int, default=1, show_default=True,
+              help='The start chapter for the books (position in the TOC, may differ from the actual number).')
+@click.option('--end-chapter', type=int, default=None, show_default=True,
+              help='The end chapter for the books (if not defined, every chapter will be saved).')
+@click.option('--chapters-by-book', type=int, default=100, show_default=True,
+              help='The number of chapters each book will have.')
 def save_novel_to_epub(ctx, title, sync_toc, start_chapter, end_chapter, chapters_by_book):
     """Save the novel to EPUB format."""
     if start_chapter <= 0:
@@ -416,19 +464,22 @@ def save_novel_to_epub(ctx, title, sync_toc, start_chapter, end_chapter, chapter
                 'Should be a positive number.', param_hint='--chapters-by-book')
     novel = obtain_novel(title, ctx.obj)
-    if novel.save_novel_to_epub(sync_toc=sync_toc, start_chapter=start_chapter, end_chapter=end_chapter, chapters_by_book=chapters_by_book):
-        click.echo('All books saved.')
-    else:
-        click.echo('Error saving EPUB.')
+    novel.save_novel_to_epub(sync_toc=sync_toc, start_chapter=start_chapter, end_chapter=end_chapter,
+                                chapters_by_book=chapters_by_book)
+    click.echo('All books saved.')
 # UTILS
 @cli.command()
 @click.pass_context
 @title_option
-@click.option('--clean-chapters', is_flag=True, default=False, show_default=True, help='If the chapters HTML files are cleaned.')
+@click.option('--clean-chapters', is_flag=True, default=False, show_default=True,
+              help='If the chapters HTML files are cleaned.')
 @click.option('--clean-toc', is_flag=True, default=False, show_default=True, help='If the TOC files are cleaned.')
-@click.option('--hard-clean', is_flag=True, default=False, show_default=True, help='If the files are more deeply cleaned.')
+@click.option('--hard-clean', is_flag=True, default=False, show_default=True,
+              help='If the files are more deeply cleaned.')
 def clean_files(ctx, title, clean_chapters, clean_toc, hard_clean):
     """Clean files of a novel."""
     if not clean_chapters and not clean_toc:
@@ -439,6 +490,7 @@ def clean_files(ctx, title, clean_chapters, clean_toc, hard_clean):
     novel.clean_files(clean_chapters=clean_chapters,
                       clean_toc=clean_toc, hard_clean=hard_clean)
 @cli.command()
 @click.pass_context
 @title_option
@@ -447,10 +499,12 @@ def show_novel_dir(ctx, title):
     novel = obtain_novel(title, ctx.obj)
     click.echo(novel.show_novel_dir())
 @cli.command()
 def version():
-    """Show program version."""
+    """Shows the program version."""
     click.echo(f'Version {__version__}')
 if __name__ == '__main__':
     cli()

web_novel_scraper/config_manager.py CHANGED Viewed

@@ -4,10 +4,10 @@ import json
 import platformdirs
 from dotenv import load_dotenv
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Any
 from .logger_manager import create_logger
-from .utils import FileOps
+from .utils import FileOps, ValidationError
 load_dotenv()
@@ -30,18 +30,18 @@ logger = create_logger("CONFIG MANAGER")
 ## 3. CONFIG FILE VALUE
 ## 4. DEFAULT VALUE
 class ScraperConfig:
-    base_novels_dir: str
-    decode_guide_file: str
+    base_novels_dir: Path
+    decode_guide_file: Path
     def __init__(self,
-                 config_file: str = None,
-                 base_novels_dir: str = None,
-                 decode_guide_file: str = None):
+                 parameters: dict[str, Any] | None = None):
+        if parameters is None:
+            parameters = {}
         ## LOADING CONFIGURATION
         config_file = self._get_config(default_value=SCRAPER_CONFIG_FILE,
                                        config_file_value=None,
                                        env_variable="SCRAPER_CONFIG_FILE",
-                                       parameter_value=config_file)
+                                       parameter_value=parameters.get('config_file'))
         config_file = Path(config_file)
         logger.debug(f'Obtaining configuration from file "{config_file}"')
@@ -54,15 +54,15 @@ class ScraperConfig:
         ## SETTING CONFIGURATION VALUES
-        self.base_novels_dir = self._get_config(default_value=SCRAPER_BASE_NOVELS_DIR,
+        self.base_novels_dir = Path(self._get_config(default_value=SCRAPER_BASE_NOVELS_DIR,
                                                 config_file_value=config.get("base_novels_dir"),
                                                 env_variable="SCRAPER_BASE_NOVELS_DIR",
-                                                parameter_value=base_novels_dir)
+                                                parameter_value=parameters.get('base_novels_dir')))
-        self.decode_guide_file = self._get_config(default_value=SCRAPER_DECODE_GUIDE_FILE,
+        self.decode_guide_file = Path(self._get_config(default_value=SCRAPER_DECODE_GUIDE_FILE,
                                                   config_file_value=config.get("decode_guide_file"),
                                                   env_variable="SCRAPER_DECODE_GUIDE_FILE",
-                                                  parameter_value=decode_guide_file)
+                                                  parameter_value=parameters.get('decode_guide_file')))
     @staticmethod
     def _get_config(default_value: str,

web_novel_scraper/custom_processor/__init__.py CHANGED Viewed

@@ -1,2 +1,2 @@
 from .custom_processor import CustomProcessor, ProcessorRegistry
-from .sites import royalroad, genesis
+from .sites import royalroad, genesis, fanmtl

web_novel_scraper/custom_processor/sites/fanmtl.py ADDED Viewed

@@ -0,0 +1,15 @@
+import re
+from typing import List, Optional
+from ..custom_processor import CustomProcessor, ProcessorRegistry
+class GenesisNextPageProcessor(CustomProcessor):
+    def process(self, html: str) -> Optional[str]:
+        pattern = r'href="([^"]+page=\d+[^"]*)">></a'
+        match = re.search(pattern, html)
+        if match is None:
+            return None
+        next_page = match.group(1)
+        next_page = next_page.replace('&amp;', '&')
+        return f'https://www.fanmtl.com{next_page}'
+ProcessorRegistry.register('fanmtl.com', 'next_page', GenesisNextPageProcessor())

web-novel-scraper 2.0.3__py3-none-any.whl → 2.1.1__py3-none-any.whl

web-novel-scraper 2.0.3__py3-none-any.whl → 2.1.1__py3-none-any.whl