web-novel-scraper 1.0.3__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -52,7 +52,7 @@ def validate_date(ctx, param, value):
52
52
 
53
53
  # COMMON ARGUMENTS
54
54
  title_option = click.option(
55
- '-t', '--title', type=str, required=True, help='Title of the novel, this serves as the identifier.')
55
+ '-t', '--title', type=str, required=True, envvar='SCRAPER_NOVEL_TITLE', help='Title of the novel, this serves as the identifier.')
56
56
  novel_base_dir_option = click.option(
57
57
  '-nb', '--novel-base-dir', type=str, help='Alternative base directory for the novel files.')
58
58
 
@@ -330,19 +330,25 @@ def show_toc(title, novel_base_dir):
330
330
  @click.option('--update-html', is_flag=True, default=False, show_default=True, help='If the chapter HTML is saved, it will be updated.')
331
331
  def scrap_chapter(title, novel_base_dir, chapter_url, chapter_num, update_html):
332
332
  """Scrap a chapter of a novel."""
333
+ if (chapter_url is None and chapter_num is None) or (chapter_url and chapter_num):
334
+ raise click.UsageError("You must set exactly one of --chapter-url or --chapter-num.")
335
+
333
336
  novel = obtain_novel(title, novel_base_dir)
334
- if not chapter_url and not chapter_num:
335
- click.echo('Chapter URL or chapter number should be set.', err=True)
336
- if chapter_num and chapter_url:
337
- click.echo('It should be either chapter URL or chapter number.', err=True)
338
- if chapter_num <= 0 or chapter_num > len(novel.chapters):
339
- raise click.BadParameter(
340
- 'Chapter number should be positive and an existing chapter.', param_hint='--chapter-num')
341
- chapter = novel.scrap_chapter(
342
- chapter_url=chapter_url, chapter_idx=chapter_num - 1, update_html=update_html)
337
+
338
+ if chapter_num is not None:
339
+ if chapter_num <= 0 or chapter_num > len(novel.chapters):
340
+ raise click.BadParameter(
341
+ 'Chapter number should be positive and an existing chapter.', param_hint='--chapter-num')
342
+ chapter = novel.scrap_chapter(chapter_idx=chapter_num - 1,
343
+ update_html=update_html)
344
+
345
+ else:
346
+ chapter = novel.scrap_chapter(chapter_url=chapter_url,
347
+ update_html=update_html)
348
+
343
349
  if not chapter:
344
- click.echo('Chapter number or URL not found.', err=True)
345
- return
350
+ raise click.ClickException('Chapter not found or scraping failed.')
351
+
346
352
  click.echo(chapter)
347
353
  click.echo('Content:')
348
354
  click.echo(chapter.chapter_content)
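With this change the command validates its inputs up front: exactly one of --chapter-url or --chapter-num must be given, and -t/--title can now be supplied through the SCRAPER_NOVEL_TITLE environment variable. A rough usage sketch, assuming the console script is installed as web-novel-scraper and Click derives the subcommand name scrap-chapter from the function name (the novel title and URL below are placeholders):

    # select the chapter by its 1-based number
    web-novel-scraper scrap-chapter -t "my-novel" --chapter-num 3

    # or select it by URL, taking the title from the environment
    SCRAPER_NOVEL_TITLE="my-novel" web-novel-scraper scrap-chapter --chapter-url "https://example.com/chapter-3"

    # passing both selectors, or neither, now fails with a UsageError instead of scraping anyway
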
@@ -0,0 +1,2 @@
1
+ from .custom_processor import CustomProcessor, ProcessorRegistry
2
+ from .sites import royalroad, genesis
@@ -0,0 +1,25 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any, Dict
3
+
4
+ class CustomProcessor(ABC):
5
+ @abstractmethod
6
+ def process(self, html: str) -> Any:
7
+ """Process the HTML content using custom logic"""
8
+ pass
9
+
10
+ class ProcessorRegistry:
11
+ _processors: Dict[str, Dict[str, CustomProcessor]] = {}
12
+
13
+ @classmethod
14
+ def register(cls, host: str, content_type: str, processor: CustomProcessor):
15
+ if host not in cls._processors:
16
+ cls._processors[host] = {}
17
+ cls._processors[host][content_type] = processor
18
+
19
+ @classmethod
20
+ def get_processor(cls, host: str, content_type: str) -> CustomProcessor:
21
+ return cls._processors.get(host, {}).get(content_type)
22
+
23
+ @classmethod
24
+ def has_processor(cls, host: str, content_type: str) -> bool:
25
+ return bool(cls.get_processor(host, content_type))
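ProcessorRegistry keys processors by host and content type; site modules register an instance at import time and the decoder checks the registry before falling back to the CSS-selector guide. A minimal sketch of how a new site could hook in (example.com and the hard-coded URLs are hypothetical, purely to illustrate the contract):

    from web_novel_scraper.custom_processor import CustomProcessor, ProcessorRegistry

    class ExampleIndexProcessor(CustomProcessor):
        def process(self, html: str):
            # Return whatever decode_html should produce for the 'index' content type,
            # here a fixed list of chapter URLs instead of real parsing.
            return ['https://example.com/chapter-1', 'https://example.com/chapter-2']

    ProcessorRegistry.register('example.com', 'index', ExampleIndexProcessor())
    assert ProcessorRegistry.has_processor('example.com', 'index')
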
@@ -0,0 +1,46 @@
1
+ import re
2
+ import json
3
+ from typing import List, Optional
4
+ from ..custom_processor import CustomProcessor, ProcessorRegistry
5
+
6
+ GENESIS_STUDIO_VIEWER_URL = 'https://genesistudio.com/viewer'
7
+
8
+ class GenesisChaptersProcessor(CustomProcessor):
9
+ def process(self, html: str) -> Optional[List[dict]]:
10
+ pattern = r',chapters:\s*{\s*free:\s*(\[.*?"}}])'
11
+ match = re.search(pattern, html, re.DOTALL)
12
+
13
+ if not match:
14
+ return None
16
+
17
+ try:
18
+ chapters_json = match.group(1).strip()
19
+ replaces = {
20
+ "chapter_title:": '"chapter_tile":',
21
+ "id:": '"id":',
22
+ "nsfw:": '"nsfw":',
23
+ "required_tier:": '"required_tier":',
24
+ "date_created:": '"date_created":',
25
+ "spoiler_title:": '"spoiler_title":',
26
+ "chapter_number:": '"chapter_number":',
27
+ "novel:": '"novel":',
28
+ }
29
+ # Ensure the JSON string ends properly
30
+ if not chapters_json.endswith(']'):
31
+ chapters_json += ']'
32
+ for old_key, new_key in replaces.items():
33
+ chapters_json = chapters_json.replace(old_key, new_key)
34
+ # print(f"Extracted JSON: {chapters_json[12200:12300]}") # Debug print
35
+ chapters = json.loads(chapters_json)
36
+ chapters_url = []
37
+ for chapter in chapters:
38
+ chapters_url.append(f"{GENESIS_STUDIO_VIEWER_URL}/{chapter['id']}")
39
+ # print(chapters)  # Debug print
40
+ return chapters_url
41
+
42
+ except (json.JSONDecodeError, IndexError) as e:
43
+ print(f"Error processing JSON: {str(e)}")
44
+ return None
45
+
46
+ ProcessorRegistry.register('genesistudio.com', 'index', GenesisChaptersProcessor())
@@ -0,0 +1,22 @@
1
+ import re
2
+ import json
3
+ from typing import List, Optional
4
+ from ..custom_processor import CustomProcessor, ProcessorRegistry
5
+
6
+ class RoyalRoadChaptersProcessor(CustomProcessor):
7
+ def process(self, html: str) -> Optional[List[dict]]:
8
+ pattern = r'window\.chapters\s*=\s*(\[.*?\]);'
9
+ match = re.search(pattern, html, re.DOTALL)
10
+
11
+ if not match:
12
+ return None
13
+
14
+ try:
15
+ chapters_json = match.group(1)
16
+ chapters = json.loads(chapters_json)
17
+ chapters = [chapter['url'] for chapter in chapters if 'url' in chapter]
18
+ return chapters
19
+ except (json.JSONDecodeError, IndexError):
20
+ return None
21
+
22
+ ProcessorRegistry.register('www.royalroad.com', 'index', RoyalRoadChaptersProcessor())
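This processor targets the window.chapters array that Royal Road embeds in its fiction pages. A tiny, hand-written sample of the kind of markup it matches (not copied from the site):

    sample_html = '<script>window.chapters = [{"id": 1, "url": "/fiction/1/demo/chapter/1/one"}];</script>'
    urls = RoyalRoadChaptersProcessor().process(sample_html)
    # urls == ['/fiction/1/demo/chapter/1/one']
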
@@ -1,8 +1,10 @@
1
1
  import os
2
2
  import json
3
3
  from pathlib import Path
4
+ from typing import Optional
4
5
 
5
6
  from . import logger_manager
7
+ from .custom_processor.custom_processor import ProcessorRegistry
6
8
 
7
9
  from bs4 import BeautifulSoup
8
10
 
@@ -10,8 +12,7 @@ logger = logger_manager.create_logger('DECODE HTML')
10
12
 
11
13
  CURRENT_DIR = Path(__file__).resolve().parent
12
14
 
13
- DECODE_GUIDE_FILE = os.getenv('DECODE_GUIDE_FILE', f'{
14
- CURRENT_DIR}/decode_guide/decode_guide.json')
15
+ DECODE_GUIDE_FILE = os.getenv('DECODE_GUIDE_FILE', f'{CURRENT_DIR}/decode_guide/decode_guide.json')
15
16
 
16
17
  XOR_SEPARATOR = "XOR"
17
18
 
@@ -41,17 +42,92 @@ class Decoder:
41
42
  self.decode_guide = self._get_element_by_key(
42
43
  DECODE_GUIDE, 'host', host)
43
44
 
44
- def decode_html(self, html: str, content_type: str):
45
+ def get_chapter_urls(self, html: str) -> list[str]:
46
+ logger.debug('Obtaining chapter URLs...')
47
+ chapter_urls = self.decode_html(html, 'index')
48
+
49
+ if chapter_urls is None:
50
+ logger.critical(f"Failed to obtain chapter URLs for {self.host}")
51
+ raise ValueError(f"Failed to obtain chapter URLs for {self.host}")
52
+
53
+ if isinstance(chapter_urls, str):
54
+ logger.warning('When obtaining chapter urls, obtained a String but expected a List')
55
+ logger.warning('Check decode config')
56
+ chapter_urls = [chapter_urls]
57
+
58
+ return chapter_urls
59
+
60
+ def get_toc_next_page_url(self, html: str) -> Optional[str]:
61
+ logger.debug('Obtaining toc next page URL...')
62
+ toc_next_page_url = self.decode_html(html, 'next_page')
63
+ if toc_next_page_url is None:
64
+ logger.debug('No next page URL found, assuming last page...')
65
+ return None
66
+ return toc_next_page_url
67
+
68
+ def get_chapter_title(self, html: str) -> Optional[str]:
69
+ logger.debug('Obtaining chapter title...')
70
+ chapter_title = self.decode_html(html, 'title')
71
+ if chapter_title is None:
72
+ logger.debug('No chapter title found.')
73
+ return chapter_title
74
+
75
+ def get_chapter_content(self, html: str, save_title_to_content: bool, chapter_title: str) -> str:
76
+ logger.debug('Obtaining chapter content...')
77
+ full_chapter_content = ""
78
+ chapter_content = self.decode_html(html, 'content')
79
+
80
+ if chapter_content is None:
81
+ logger.critical('No content found on chapter')
82
+ raise ValueError('No content found on chapter')
83
+
84
+ if save_title_to_content:
85
+ logger.debug('Saving chapter title to content...')
86
+ full_chapter_content += f'<h4>{chapter_title}</h4>'
87
+
88
+ if isinstance(chapter_content, list):
89
+ logger.debug(f'{len(chapter_content)} paragraphs found in chapter')
90
+ logger.debug('Converting list of paragraphs to a single string')
91
+ for paragraph in chapter_content:
92
+ full_chapter_content += str(paragraph)
93
+ else:
94
+ logger.debug('Chapter content is not a list, no conversion made')
95
+ full_chapter_content += str(chapter_content)
96
+ return full_chapter_content
97
+
98
+ def decode_html(self, html: str, content_type: str) -> str | list[str] | None:
99
+ logger.debug(f'Decoding HTML...')
100
+ logger.debug(f'Content type: {content_type}')
101
+ logger.debug(f'Decode guide: {DECODE_GUIDE_FILE}')
102
+ logger.debug(f'Host: {self.host}')
45
103
  if not content_type in self.decode_guide:
46
- logger.error(f'{content_type} key does not exists on decode guide {
47
- DECODE_GUIDE_FILE} for host {self.host}')
48
- return
49
- soup = BeautifulSoup(html, 'html.parser')
104
+ logger.critical(f'{content_type} key does not exist on decode guide {DECODE_GUIDE_FILE} '
105
+ f'for host {self.host}')
106
+ raise ValueError(f'{content_type} key does not exist on decode guide {DECODE_GUIDE_FILE} '
107
+ f'for host {self.host}')
108
+
109
+ if ProcessorRegistry.has_processor(self.host, content_type):
110
+ logger.debug(f'Host {self.host} will use a custom processor')
111
+ processor = ProcessorRegistry.get_processor(self.host, content_type)
112
+ return processor.process(html)
113
+
114
+ logger.debug('Starting HTML parsing...')
115
+ try:
116
+ soup = BeautifulSoup(html, 'html.parser')
117
+ except Exception as e:
118
+ logger.critical(f'Error parsing HTML with BeautifulSoup: {e}')
119
+ raise ValueError(f'Error parsing HTML with BeautifulSoup: {e}')
120
+
50
121
  decoder = self.decode_guide[content_type]
51
122
  elements = self._find_elements(soup, decoder)
52
123
  if not elements:
53
- logger.warning(f'{content_type} not found on html using {
54
- DECODE_GUIDE_FILE} for host {self.host}')
124
+ logger.warning(f'{content_type} not found on html using {DECODE_GUIDE_FILE} '
125
+ f'for host {self.host}')
126
+
127
+ # Investigate this conditional
128
+ if content_type == 'title' and isinstance(elements, list):
129
+ logger.debug('Joining titles...')
130
+ return ' '.join(elements)
55
131
  return elements
56
132
 
57
133
  def has_pagination(self, host: str = None):
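The Decoder now exposes purpose-built helpers that wrap decode_html and raise instead of silently returning None. A rough sketch of the intended call pattern (toc_html and chapter_html stand in for real page content):

    decoder = Decoder('www.royalroad.com')
    chapter_urls = decoder.get_chapter_urls(toc_html)          # raises ValueError if nothing is found
    next_page = decoder.get_toc_next_page_url(toc_html)        # None on the last TOC page
    title = decoder.get_chapter_title(chapter_html)            # may be None; the caller generates a fallback
    content = decoder.get_chapter_content(chapter_html,
                                          save_title_to_content=True,
                                          chapter_title=title or 'Chapter 1')
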
@@ -81,8 +157,11 @@ class Decoder:
81
157
 
82
158
  return "\n".join([line.strip() for line in str(soup).splitlines() if line.strip()])
83
159
 
84
- def _find_elements(self, soup: BeautifulSoup, decoder: dict):
160
+ @staticmethod
161
+ def _find_elements(soup: BeautifulSoup, decoder: dict):
162
+ logger.debug('Finding elements...')
85
163
  selector = decoder.get('selector')
164
+ elements = []
86
165
  if selector is None:
87
166
  selector = ''
88
167
  element = decoder.get('element')
@@ -91,32 +170,46 @@ class Decoder:
91
170
  attributes = decoder.get('attributes')
92
171
 
93
172
  if element:
173
+ logger.debug(f'Using element "{element}"')
94
174
  selector += element
95
175
  if _id:
176
+ logger.debug(f'Using id "{_id}"')
96
177
  selector += f'#{_id}'
97
178
  if _class:
179
+ logger.debug(f'Using class "{_class}"')
98
180
  selector += f'.{_class}'
99
181
  if attributes:
100
182
  for attr, value in attributes.items():
101
- selector += f'[{attr}="{value}"]' if value else f'[{attr}]'
183
+ logger.debug(f'Using attribute "{attr}"')
184
+ if value is not None:
185
+ logger.debug(f'With value "{value}"')
186
+ selector += f'[{attr}="{value}"]'
187
+ else:
188
+ selector += f'[{attr}]'
102
189
  selectors = [selector]
103
190
  else:
191
+ logger.debug(f'Using selector "{selector}"')
104
192
  if XOR_SEPARATOR in selector:
193
+ logger.debug(f'Found XOR_OPERATOR "{XOR_SEPARATOR}" in selector')
194
+ logger.debug('Splitting selectors...')
105
195
  selectors = selector.split(XOR_SEPARATOR)
106
196
  else:
107
197
  selectors = [selector]
108
198
 
109
199
  for selector in selectors:
110
- logger.debug(f'Attempt using selector {selector}')
200
+ logger.debug(f'Searching using selector "{selector}"...')
111
201
  elements = soup.select(selector)
112
202
  if elements:
113
- logger.debug(f'{len(elements)} found using selector {selector}')
203
+ logger.debug(f'{len(elements)} found using selector "{selector}"')
114
204
  break
205
+ logger.debug(f'No elements found using selector "{selector}"')
115
206
 
116
207
  extract = decoder.get('extract')
117
208
  if extract:
209
+ logger.debug(f'Extracting from elements...')
118
210
  if extract["type"] == "attr":
119
211
  attr_key = extract["key"]
212
+ logger.debug(f'Extracting value from attribute "{attr_key}"...')
120
213
  elements_aux = elements
121
214
  elements = []
122
215
  for element in elements_aux:
@@ -125,15 +218,34 @@ class Decoder:
125
218
  if attr:
126
219
  elements.append(attr)
127
220
  except KeyError:
221
+ logger.debug(f'Attribute "{attr_key}" not found')
222
+ logger.debug('Ignoring...')
128
223
  pass
224
+ logger.debug(f'{len(elements)} elements found using attribute "{attr_key}"')
129
225
  if extract["type"] == "text":
226
+ logger.debug('Extracting text from elements...')
130
227
  elements = [element.string for element in elements]
228
+
229
+ if not elements:
230
+ logger.error('No elements found, returning "None"')
231
+ return None
232
+
131
233
  inverted = decoder.get('inverted')
132
234
  if inverted:
235
+ logger.debug('Inverted option activated')
236
+ logger.debug('Inverting elements order...')
133
237
  elements = elements[::-1]
134
- return elements if decoder.get('array') else elements[0] if elements else None
135
238
 
136
- def _get_element_by_key(self, json_data, key, value):
239
+ if decoder.get('array'):
240
+ logger.debug('Array option activated')
241
+ logger.debug('Returning elements as a list')
242
+ return elements
243
+ logger.debug('Array option not activated')
244
+ logger.debug('Returning only first element...')
245
+ return elements[0]
246
+
247
+ @staticmethod
248
+ def _get_element_by_key(json_data, key, value):
137
249
  for item in json_data:
138
250
  if item[key] == value:
139
251
  return item
@@ -24,7 +24,11 @@
24
24
  "class": null,
25
25
  "selector": null,
26
26
  "attributes": null,
27
- "array": true
27
+ "array": true,
28
+ "extract": {
29
+ "type": "attr",
30
+ "key": "href"
31
+ }
28
32
  },
29
33
  "next_page": {
30
34
  "element": "p",
@@ -60,7 +64,11 @@
60
64
  "class": null,
61
65
  "selector": "div.m-newest2 ul li a",
62
66
  "attributes": null,
63
- "array": true
67
+ "array": true,
68
+ "extract": {
69
+ "type": "attr",
70
+ "key": "href"
71
+ }
64
72
  },
65
73
  "next_page": {
66
74
  "element": null,
@@ -72,7 +80,7 @@
72
80
  }
73
81
  },
74
82
  {
75
- "host": "royalroad.com",
83
+ "host": "www.royalroad.com",
76
84
  "has_pagination": false,
77
85
  "title": {
78
86
  "element": null,
@@ -95,12 +103,7 @@
95
103
  "array": true
96
104
  },
97
105
  "index": {
98
- "element": null,
99
- "id": null,
100
- "class": null,
101
- "selector": "tr.chapter-row td a",
102
- "attributes": null,
103
- "array": true
106
+ "use_custom_processor": true
104
107
  },
105
108
  "next_page": {
106
109
  "element": null,
@@ -127,10 +130,10 @@
127
130
  }
128
131
  },
129
132
  "content": {
130
- "element": "div#chr-content",
133
+ "element": null,
131
134
  "id": null,
132
135
  "class": null,
133
- "selector": null,
136
+ "selector": "div#chr-content p",
134
137
  "attributes": null,
135
138
  "array": true
136
139
  },
@@ -140,7 +143,11 @@
140
143
  "class": null,
141
144
  "selector": "ul.list-chapter li a",
142
145
  "attributes": null,
143
- "array": true
146
+ "array": true,
147
+ "extract": {
148
+ "type": "attr",
149
+ "key": "href"
150
+ }
144
151
  },
145
152
  "next_page": {
146
153
  "element": null,
@@ -167,10 +174,10 @@
167
174
  }
168
175
  },
169
176
  "content": {
170
- "element": "div#chr-content",
177
+ "element": null,
171
178
  "id": null,
172
179
  "class": null,
173
- "selector": null,
180
+ "selector": "div#chr-content p",
174
181
  "attributes": null,
175
182
  "array": true
176
183
  },
@@ -180,7 +187,11 @@
180
187
  "class": null,
181
188
  "selector": "ul.list-chapter li a",
182
189
  "attributes": null,
183
- "array": true
190
+ "array": true,
191
+ "extract": {
192
+ "type": "attr",
193
+ "key": "href"
194
+ }
184
195
  },
185
196
  "next_page": {
186
197
  "element": null,
@@ -207,7 +218,82 @@
207
218
  "index": {
208
219
  "element": "ul.main li a",
209
220
  "array": true,
210
- "inverted": true
221
+ "inverted": true,
222
+ "extract": {
223
+ "type": "attr",
224
+ "key": "href"
225
+ }
226
+ }
227
+ },
228
+ {
229
+ "host": "genesistudio.com",
230
+ "has_pagination": false,
231
+ "title": {
232
+ "element": null,
233
+ "id": null,
234
+ "class": null,
235
+ "selector": "p.leading-none span",
236
+ "attributes": null,
237
+ "array": true,
238
+ "extract": {
239
+ "type": "text",
240
+ "key": "text"
241
+ }
242
+ },
243
+ "content": {
244
+ "element": "p",
245
+ "id": null,
246
+ "class": "narration",
247
+ "selector": null,
248
+ "attributes": null,
249
+ "array": true,
250
+ "extract": {
251
+ "type": "text",
252
+ "key": "text"
253
+ }
254
+ },
255
+ "index": {
256
+ "use_custom_processor": true
257
+ },
258
+ "next_page": {
259
+ "element": null,
260
+ "id": null,
261
+ "class": null,
262
+ "selector": null,
263
+ "attributes": null,
264
+ "array": true
265
+ }
266
+ },
267
+ {
268
+ "host": "hostednovel.com",
269
+ "has_pagination": true,
270
+ "title": {
271
+ "selector": "span#chapter-title",
272
+ "extract": {
273
+ "type": "text"
274
+ }
275
+ },
276
+ "content": {
277
+ "element": "div",
278
+ "id": "chapter-content",
279
+ "array": true
280
+ },
281
+ "index": {
282
+ "selector": "li ul li.flow-root a",
283
+ "array": true,
284
+ "inverted": false,
285
+ "extract": {
286
+ "type": "attr",
287
+ "key": "href"
288
+ }
289
+ },
290
+ "next_page": {
291
+ "selector": "a:has(span:contains('Next'))",
292
+ "array": false,
293
+ "extract": {
294
+ "type": "attr",
295
+ "key": "href"
296
+ }
211
297
  }
212
298
  }
213
299
  ]
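The index entries now carry an extract block, so _find_elements returns the href strings themselves rather than bs4 Tag objects, which is what Decoder.get_chapter_urls expects. A small illustration with a toy guide entry (not one of the shipped hosts), calling the internal helper directly just to show the mechanism:

    from bs4 import BeautifulSoup
    from web_novel_scraper.decode import Decoder

    html = '<ul class="main"><li><a href="/ch/1">1</a></li><li><a href="/ch/2">2</a></li></ul>'
    guide_entry = {'selector': 'ul.main li a', 'array': True,
                   'extract': {'type': 'attr', 'key': 'href'}}
    soup = BeautifulSoup(html, 'html.parser')
    print(Decoder._find_elements(soup, guide_entry))   # ['/ch/1', '/ch/2']
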
@@ -7,6 +7,7 @@ from pathlib import Path
7
7
  import shutil
8
8
  from dotenv import load_dotenv
9
9
  from ebooklib import epub
10
+ import unicodedata
10
11
 
11
12
  from . import logger_manager
12
13
 
@@ -44,10 +45,10 @@ class FileManager:
44
45
  novel_config_dir: str = None,
45
46
  read_only: bool = False):
46
47
  logger.debug(f'Initializing FileManager for novel: {novel_title}, read_only: {read_only}')
47
- novel_base_dir = novel_base_dir if novel_base_dir else f'{
48
- SCRAPER_BASE_DATA_DIR}/{novel_title}'
49
- novel_config_dir = novel_config_dir if novel_config_dir else f'{
50
- SCRAPER_BASE_CONFIG_DIR}/{novel_title}'
48
+ novel_base_dir = novel_base_dir if novel_base_dir else \
49
+ f'{SCRAPER_BASE_DATA_DIR}/{novel_title}'
50
+ novel_config_dir = novel_config_dir if novel_config_dir else \
51
+ f'{SCRAPER_BASE_CONFIG_DIR}/{novel_title}'
51
52
 
52
53
  logger.debug(f'Using base dir: {novel_base_dir}, config dir: {novel_config_dir}')
53
54
 
@@ -77,6 +78,16 @@ class FileManager:
77
78
  def save_chapter_html(self, filename: str, content: str):
78
79
  full_path = self.novel_chapters_dir / filename
79
80
  logger.debug(f'Saving chapter to {full_path}')
81
+ content = unicodedata.normalize('NFKC', content)
82
+ char_replacements = {
83
+ "â": "'", # Reemplazar â con apóstrofe
84
+ "\u2018": "'", # Comillda simple izquierda Unicode
85
+ "\u2019": "'", # Comilla simple derecha Unicode
86
+ "\u201C": '"', # Comilla doble izquierda Unicode
87
+ "\u201D": '"', # Comilla doble derecha Unicode
88
+ }
89
+ for old_char, new_char in char_replacements.items():
90
+ content = content.replace(old_char, new_char)
80
91
  _save_content_to_file(full_path, content)
81
92
 
82
93
  def load_chapter_html(self, filename: str):
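save_chapter_html now normalizes the chapter HTML to NFKC and flattens typographic quotes before writing, so saved files use plain ASCII quoting. A quick illustration of the effect on a made-up string:

    import unicodedata

    content = 'It\u2019s a \u201ctest\u201d'
    content = unicodedata.normalize('NFKC', content)
    for old_char, new_char in {'\u2019': "'", '\u201C': '"', '\u201D': '"'}.items():
        content = content.replace(old_char, new_char)
    print(content)   # It's a "test"
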
@@ -232,8 +243,7 @@ def _save_content_to_file(filepath: Path, content: str | dict, is_json: bool = F
232
243
  except (OSError, IOError) as e:
233
244
  logger.error(f'Error saving file "{filepath}": {e}')
234
245
  except Exception as e:
235
- logger.error(f'Unexpected error saving file "{
236
- filepath}": {e}', exc_info=True)
246
+ logger.error(f'Unexpected error saving file "{filepath}": {e}', exc_info=True)
237
247
 
238
248
 
239
249
  def _read_content_from_file(filepath: Path, bytes: bool = False) -> str:
@@ -252,8 +262,7 @@ def _read_content_from_file(filepath: Path, bytes: bool = False) -> str:
252
262
  logger.error(f'Error reading file "{filepath}": {e}')
253
263
  except Exception as e:
254
264
  # Log for unexpected errors
255
- logger.error(f'Unexpected error reading file "{
256
- filepath}": {e}', exc_info=True)
265
+ logger.error(f'Unexpected error reading file "{filepath}": {e}', exc_info=True)
257
266
 
258
267
 
259
268
  def _delete_file(filepath: Path) -> None:
@@ -269,8 +278,7 @@ def _delete_file(filepath: Path) -> None:
269
278
  logger.error(f'Error deleting file "{filepath}": {e}')
270
279
  except Exception as e:
271
280
  # Log any unexpected errors
272
- logger.error(f'Unexpected error deleting file "{
273
- filepath}": {e}', exc_info=True)
281
+ logger.error(f'Unexpected error deleting file "{filepath}": {e}', exc_info=True)
274
282
 
275
283
 
276
284
  def _copy_file(source: Path, destination: Path) -> bool:
@@ -39,9 +39,11 @@ class Metadata:
39
39
  """
40
40
  Dynamic string representation of the configuration.
41
41
  """
42
- attributes = [f"{field.name}={
43
- getattr(self, field.name)}" for field in fields(self)]
44
- return f"Metadata: \n{'\n'.join(attributes)}"
42
+ attributes = [(f"{field.name}="
43
+ f"{getattr(self, field.name)}") for field in fields(self)]
44
+ attributes_str = '\n'.join(attributes)
45
+ return (f"Metadata: \n"
46
+ f"{attributes_str}")
45
47
 
46
48
 
47
49
  @dataclass_json
@@ -70,9 +72,11 @@ class ScraperBehavior:
70
72
  """
71
73
  Dynamic string representation of the configuration.
72
74
  """
73
- attributes = [f"{field.name}={
74
- getattr(self, field.name)}" for field in fields(self)]
75
- return f"Scraper Behavior: \n{'\n'.join(attributes)}"
75
+ attributes = [(f"{field.name}="
76
+ f"{getattr(self, field.name)}") for field in fields(self)]
77
+ attributes_str = '\n'.join(attributes)
78
+ return (f"Scraper Behavior: \n"
79
+ f"{attributes_str}")
76
80
 
77
81
 
78
82
  @dataclass_json(undefined=Undefined.EXCLUDE)
@@ -169,7 +173,9 @@ class Novel:
169
173
  f"TOC Info: {toc_info}",
170
174
  f"Host: {self.host}"
171
175
  ]
172
- return f"Novel Info: \n{'\n'.join(attributes)}"
176
+ attributes_str = '\n'.join(attributes)
177
+ return (f"Novel Info: \n"
178
+ f"{attributes_str}")
173
179
 
174
180
  # NOVEL PARAMETERS MANAGEMENT
175
181
 
@@ -186,8 +192,7 @@ class Novel:
186
192
  self.metadata.tags.append(tag)
187
193
  self.save_novel()
188
194
  return True
189
- logger.warning(f'Tag "{tag}" already exists on novel {
190
- self.metadata.novel_title}')
195
+ logger.warning(f'Tag "{tag}" already exists on novel {self.metadata.novel_title}')
191
196
  return False
192
197
 
193
198
  def remove_tag(self, tag: str) -> bool:
@@ -195,8 +200,7 @@ class Novel:
195
200
  self.metadata.tags.remove(tag)
196
201
  self.save_novel()
197
202
  return True
198
- logger.warning(f'Tag "{tag}" doesn\'t exist on novel {
199
- self.metadata.novel_title}')
203
+ logger.warning(f'Tag "{tag}" doesn\'t exist on novel {self.metadata.novel_title}')
200
204
  return False
201
205
 
202
206
  def set_cover_image(self, cover_image_path: str) -> bool:
@@ -220,6 +224,7 @@ class Novel:
220
224
  self.decoder = Decoder(self.host)
221
225
  elif update_host:
222
226
  self.decoder = Decoder(utils.obtain_host(self.toc_main_url))
227
+ self.save_novel()
223
228
 
224
229
  def add_toc_html(self, html: str, host: str = None) -> None:
225
230
  if self.toc_main_url:
@@ -248,7 +253,7 @@ class Novel:
248
253
  toc_not_exists = not all_tocs_content and self.toc_main_url is None
249
254
  if toc_not_exists:
250
255
  logger.critical(
251
- 'There is no toc html and no toc url setted, unable to get toc.')
256
+ 'There is no toc html and no toc url set, unable to get toc.')
252
257
  return False
253
258
 
254
259
  reload_files = reload_files and self.toc_main_url is not None
@@ -259,18 +264,16 @@ class Novel:
259
264
  toc_content = self._add_toc(self.toc_main_url)
260
265
  all_tocs_content.append(toc_content)
261
266
  if self.decoder.has_pagination():
262
- next_page = self._get_next_page_from_toc_content(toc_content)
267
+ next_page = self.decoder.get_toc_next_page_url(toc_content)
263
268
  while next_page:
264
269
  toc_content = self._add_toc(next_page)
265
- next_page = self._get_next_page_from_toc_content(
266
- toc_content)
270
+ next_page = self.decoder.get_toc_next_page_url(toc_content)
267
271
  all_tocs_content.append(toc_content)
268
272
 
269
273
  # Now we get the links from the toc content
270
274
  self.chapters_url_list = []
271
275
  for toc_content in all_tocs_content:
272
- chapters_url_from_toc_content = self._get_chapter_urls_from_toc_content(
273
- toc_content)
276
+ chapters_url_from_toc_content = self.decoder.get_chapter_urls(toc_content)
274
277
  if chapters_url_from_toc_content is None:
275
278
  logger.error('Chapters url not found on toc_content')
276
279
  return False
@@ -299,43 +302,45 @@ class Novel:
299
302
  chapter_list = "Chapters List:\n"
300
303
  for i, chapter in enumerate(self.chapters):
301
304
  chapter_list += f"Chapter {i + 1}:\n"
302
- chapter_list += f" Title: {
303
- chapter.chapter_title if chapter.chapter_title else 'Title not yet scrapped'}\n"
305
+ chapter_list += f" Title: {chapter.chapter_title if chapter.chapter_title else 'Title not yet scrapped'}\n"
304
306
  chapter_list += f" URL: {chapter.chapter_url}\n"
305
- chapter_list += f" Filename: {
306
- chapter.chapter_html_filename if chapter.chapter_html_filename else 'File not yet requested'}\n"
307
+ chapter_list += f" Filename: {chapter.chapter_html_filename if chapter.chapter_html_filename else 'File not yet requested'}\n"
307
308
  return chapter_list
308
309
 
309
310
  def scrap_chapter(self, chapter_url: str = None, chapter_idx: int = None, update_html: bool = False) -> Chapter:
311
+ logger.info('Scraping Chapter...')
312
+ chapter = None
310
313
  if not utils.check_exclusive_params(chapter_url, chapter_idx):
311
- logger.error(
312
- 'chapter_url and chapter_id, only one needs to be setted')
313
- return
314
+ raise ValueError("chapter_url and chapter_id, only one needs to be set")
314
315
 
315
316
  if chapter_url is not None:
317
+ logger.debug(f'Using chapter url: {chapter_url}')
316
318
  chapter = self._get_chapter_by_url(chapter_url=chapter_url)
317
319
  if chapter is None:
320
+ logger.warning(f'Chapter with url "{chapter_url}" does not exist, generating one...')
318
321
  chapter = Chapter(chapter_url=chapter_url)
319
322
 
320
323
  if chapter_idx is not None:
324
+ logger.debug(f'Using chapter index: {chapter_idx}')
321
325
  if chapter_idx < 0 or chapter_idx >= len(self.chapters):
322
- logger.error(f'Could not find chapter with idx {chapter_idx}')
323
- return
324
- chapter = self.chapters[chapter_idx]
326
+ logger.critical(f'Could not find chapter with idx {chapter_idx}')
327
+ raise ValueError(f'Could not find chapter with idx {chapter_idx}')
325
328
 
329
+ chapter = self.chapters[chapter_idx]
330
+ if update_html:
331
+ logger.debug('HTML will be updated...')
326
332
  chapter = self._get_chapter(chapter,
327
333
  reload=update_html)
328
334
 
329
335
  if not chapter.chapter_html or not chapter.chapter_html_filename:
330
- logger.warning(f'Failed to create chapter on link: "{
331
- chapter_url}" on path "{chapter.chapter_html_filename}"')
332
- return
336
+ logger.critical(f'Failed to create chapter on link: "{chapter_url}" '
337
+ f'on path "{chapter.chapter_html_filename}"')
338
+ raise ValueError(f'Failed to create chapter on link: "{chapter_url}" '
339
+ f'on path "{chapter.chapter_html_filename}"')
333
340
 
334
- # We get the title and content, if there's no title, we autogenerate one.
341
+ # We get the chapter title and content
342
+ # We pass an index so we can autogenerate a Title
335
343
  chapter = self._decode_chapter(chapter=chapter, idx_for_chapter_name=chapter_idx)
336
- if not chapter.chapter_content:
337
- logger.error('Content not found')
338
- return
339
344
 
340
345
  logger.info(f'Chapter scrapped from link: {chapter_url}')
341
346
  return chapter
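scrap_chapter now raises on failure instead of returning None, so callers can rely on getting a Chapter back. A hedged sketch of the call pattern, assuming novel is an already-loaded Novel instance and the URL is a placeholder:

    # exactly one of chapter_url / chapter_idx may be set (chapter_idx is 0-based)
    chapter = novel.scrap_chapter(chapter_idx=0)
    print(chapter.chapter_title)

    # or scrape by URL, re-downloading the HTML first
    chapter = novel.scrap_chapter(chapter_url='https://example.com/chapter-1', update_html=True)
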
@@ -376,8 +381,7 @@ class Novel:
376
381
  chapter = self._get_chapter(
377
382
  chapter=chapter, reload=update_html)
378
383
  if not chapter.chapter_html_filename:
379
- logger.critical(f'Error requesting chapter {
380
- i} with url {chapter.chapter_url}')
384
+ logger.critical(f'Error requesting chapter {i} with url {chapter.chapter_url}')
381
385
  return False
382
386
 
383
387
  self._add_or_update_chapter_data(chapter=chapter, link_idx=i,
@@ -399,16 +403,15 @@ class Novel:
399
403
  self.sync_toc()
400
404
 
401
405
  if start_chapter > len(self.chapters):
402
- logger.info(f'The start chapter is bigger than the number of chapters saved ({
403
- len(self.chapters)})')
406
+ logger.info(f'The start chapter is bigger than the number of chapters saved ({len(self.chapters)})')
404
407
  return
405
408
 
406
409
  if not end_chapter:
407
410
  end_chapter = len(self.chapters)
408
411
  elif end_chapter > len(self.chapters):
409
412
  end_chapter = len(self.chapters)
410
- logger.info(f'The end chapter is bigger than the number of chapters, automatically setting it to {
411
- end_chapter}.')
413
+ logger.info(f'The end chapter is bigger than the number of chapters, '
414
+ f'automatically setting it to {end_chapter}.')
412
415
 
413
416
  idx = 1
414
417
  start = start_chapter
@@ -418,8 +421,8 @@ class Novel:
418
421
  end_chapter=end,
419
422
  collection_idx=idx)
420
423
  if not result:
421
- logger.critical(f'Error with saving novel to epub, with start chapter: {
422
- start_chapter} and end chapter: {end_chapter}')
424
+ logger.critical(f'Error with saving novel to epub, with start chapter: '
425
+ f'{start_chapter} and end chapter: {end_chapter}')
423
426
  return False
424
427
  start = start + chapters_by_book
425
428
  idx = idx + 1
@@ -506,22 +509,6 @@ class Novel:
506
509
  self.file_manager.add_toc(content)
507
510
  return content
508
511
 
509
- def _get_chapter_urls_from_toc_content(self, toc_content: str) -> list[str]:
510
- toc_elements = self.decoder.decode_html(toc_content, 'index')
511
- try:
512
- toc_urls = [toc_element['href'] for toc_element in toc_elements]
513
- except KeyError as e:
514
- logger.error(f'{e} not found on the Tag elements decoded from TOC')
515
- return
516
- if toc_urls:
517
- return toc_urls
518
- logger.warning('No chapter links found on toc content')
519
-
520
- def _get_next_page_from_toc_content(self, toc_content: str) -> str:
521
- next_page = self.decoder.decode_html(toc_content, 'next_page')
522
- if next_page:
523
- return next_page[0]['href']
524
-
525
512
  def _add_or_update_chapter_data(self, chapter: Chapter, link_idx: int = None, save_in_file: bool = True) -> None:
526
513
  if link_idx:
527
514
  chapter_idx = link_idx
@@ -579,35 +566,28 @@ class Novel:
579
566
  self.save_novel()
580
567
 
581
568
  def _decode_chapter(self, chapter: Chapter, idx_for_chapter_name: str = None) -> Chapter:
582
- chapter_title = None
583
-
569
+ logger.debug('Decoding chapter...')
584
570
  if chapter.chapter_html is None:
571
+ logger.debug(f'No HTML content found, requesting HTML content...')
585
572
  chapter = self._get_chapter(chapter)
586
573
 
587
574
  if not chapter.chapter_html:
588
- logger.error(f'No chapter content found for chapter link {
589
- chapter.chapter_url} on file {chapter.chapter_html_filename}')
590
- return None
591
-
592
- paragraphs = self.decoder.decode_html(chapter.chapter_html, 'content')
593
-
594
- if not paragraphs:
595
- if chapter:
596
- logger.warning(f'No paragraphs found in chapter link {
597
- chapter.chapter_url} on file {chapter.chapter_html_filename}')
575
+ raise ValueError(f'Chapter HTML could not be obtained for chapter link "{chapter.chapter_url}" '
576
+ f'on file "{chapter.chapter_html_filename}"')
598
577
 
599
- chapter_title = self.decoder.decode_html(chapter.chapter_html, 'title')
578
+ logger.debug('Obtaining chapter title...')
579
+ chapter_title = self.decoder.get_chapter_title(chapter.chapter_html)
600
580
  if not chapter_title:
601
- chapter_title = f'{self.metadata.novel_title} Chapter {
602
- idx_for_chapter_name}'
581
+ logger.debug('No chapter title found, generating one...')
582
+ chapter_title = f'{self.metadata.novel_title} Chapter {idx_for_chapter_name}'
603
583
  chapter.chapter_title = str(chapter_title)
584
+ logger.debug(f'Chapter title: "{chapter_title}"')
604
585
 
605
- chapter.chapter_content = ""
606
- if self.scraper_behavior.save_title_to_content:
607
- chapter.chapter_content += f'<h4>{chapter_title}</h4>'
608
- logger.info(f'{len(paragraphs)} paragraphs found in chapter')
609
- for paragraph in paragraphs:
610
- chapter.chapter_content += str(paragraph)
586
+ logger.debug('Obtaining chapter content...')
587
+ chapter.chapter_content = self.decoder.get_chapter_content(chapter.chapter_html,
588
+ self.scraper_behavior.save_title_to_content,
589
+ chapter.chapter_title)
590
+ logger.debug('Chapter successfully decoded')
611
591
 
612
592
  return chapter
613
593
 
@@ -631,7 +611,7 @@ class Novel:
631
611
  if self.metadata.start_date:
632
612
  date_metadata += self.metadata.start_date
633
613
  # Calibre specification doesn't use end_date.
634
- # For now we use a custom metadata
614
+ # For now, we use a custom metadata
635
615
  # https://idpf.org/epub/31/spec/epub-packages.html#sec-opf-dcdate
636
616
  # if self.metadata.end_date:
637
617
  # date_metadata += f'/{self.metadata.end_date}'
@@ -699,8 +679,7 @@ class Novel:
699
679
  idx_start = start_chapter - 1
700
680
  idx_end = end_chapter
701
681
  # We create the epub book
702
- book_title = f'{self.metadata.novel_title} Chapters {
703
- start_chapter} - {end_chapter}'
682
+ book_title = f'{self.metadata.novel_title} Chapters {start_chapter} - {end_chapter}'
704
683
  calibre_collection = None
705
684
  # If collection_idx is set, we create a calibre collection
706
685
  if collection_idx:
@@ -712,8 +691,7 @@ class Novel:
712
691
  book = self._add_chapter_to_epub_book(chapter=chapter,
713
692
  book=book)
714
693
  if book is None:
715
- logger.critical(f'Error saving epub {book_title}, could not decode chapter {
716
- chapter} using host {self.host}')
694
+ logger.critical(f'Error saving epub {book_title}, could not decode chapter {chapter} using host {self.host}')
717
695
  return False
718
696
 
719
697
  book.add_item(epub.EpubNcx())
@@ -1 +1 @@
1
- __version__ = "1.0.3"
1
+ __version__ = "1.1.0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: web-novel-scraper
3
- Version: 1.0.3
3
+ Version: 1.1.0
4
4
  Summary: Python tool that allows you to scrape web novels from various sources and save them to more readable formats like EPUB.
5
5
  Project-URL: Homepage, https://github.com/ImagineBrkr/web-novel-scraper
6
6
  Project-URL: Documentation, https://web-novel-scraper.readthedocs.io
@@ -0,0 +1,18 @@
1
+ web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ web_novel_scraper/__main__.py,sha256=OQQVX5CttmAkUwdrnjBSjKPaoh_boUI2ysHi3rLGOSs,17769
3
+ web_novel_scraper/decode.py,sha256=QxPjoYI1t4bf0zAf_7uLRrpsboi8DwsD1BNZUiHO4gc,10150
4
+ web_novel_scraper/file_manager.py,sha256=qAqgqtaRb7QyVtyEOW2cMhPYWdKM6nJ69weUCYKwVtM,11862
5
+ web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
6
+ web_novel_scraper/novel_scraper.py,sha256=hXIIPelRfx-jfD9VSPheg6z04I4JKxQj7wVBPlpP1go,28452
7
+ web_novel_scraper/request_manager.py,sha256=VtGpRi5b_Dp3h8viCdt7yMCb9M21Lk7oLC_Q_0EkXH8,6448
8
+ web_novel_scraper/utils.py,sha256=vq5ROuPv04k3MhbksTe0ni_yP0i_a7T_33mkBB1DUbQ,2076
9
+ web_novel_scraper/version.py,sha256=LGVQyDsWifdACo7qztwb8RWWHds1E7uQ-ZqD8SAjyw4,22
10
+ web_novel_scraper/custom_processor/__init__.py,sha256=iy4tjivMjshSzc52--aa-jK53qu9VwdK-6p4vuQc6oc,103
11
+ web_novel_scraper/custom_processor/custom_processor.py,sha256=h1MPl6JU_C2Mc7SqK70LsNQHpDzSL6QyraMIQ87HcMM,870
12
+ web_novel_scraper/custom_processor/sites/genesis.py,sha256=xV0eybI0ieHR5gn4yWXI74l99Eayhqs16PIYs-BrPjE,1843
13
+ web_novel_scraper/custom_processor/sites/royalroad.py,sha256=_2PsFC_w3RJCUkAPoRn-7R2jlzl3XsG4WYtRaQkp6lg,787
14
+ web_novel_scraper/decode_guide/decode_guide.json,sha256=DbcfnyRNOVXZd6ar1HDCHxkKgnmR3ziJ-B4GOFcDMEs,7584
15
+ web_novel_scraper-1.1.0.dist-info/METADATA,sha256=Llcez3yLJTICPNMAoO1aZShywK2soma1kmjl2OA3tYA,8423
16
+ web_novel_scraper-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
17
+ web_novel_scraper-1.1.0.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
18
+ web_novel_scraper-1.1.0.dist-info/RECORD,,
@@ -1,14 +0,0 @@
1
- web_novel_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- web_novel_scraper/__main__.py,sha256=PBIAG1vshnRdvYwyeD-RxlHS5mNEby-d4puV9kEyfpA,17615
3
- web_novel_scraper/decode.py,sha256=dqGv_8nFSKwO6GBj3jhaO9SQeLHeBjDzoV1C_YcN40k,5085
4
- web_novel_scraper/file_manager.py,sha256=PJu8kKeng49DTNQBbbMekFtIcTZOkeCEjFYqYJarv9M,11363
5
- web_novel_scraper/logger_manager.py,sha256=A-a4bhYI4YCEuSJd9E3WH_kanJ7YCASMwheBzObZK4Q,1972
6
- web_novel_scraper/novel_scraper.py,sha256=eiic2i3AdK9lcFK9aNb4d8ptnKv9ua1B_9kcUY8_liM,28660
7
- web_novel_scraper/request_manager.py,sha256=VtGpRi5b_Dp3h8viCdt7yMCb9M21Lk7oLC_Q_0EkXH8,6448
8
- web_novel_scraper/utils.py,sha256=vq5ROuPv04k3MhbksTe0ni_yP0i_a7T_33mkBB1DUbQ,2076
9
- web_novel_scraper/version.py,sha256=2plzdEEb24FLjE2I2XyBBcJEPYWHccNL4SgtLC_6erg,22
10
- web_novel_scraper/decode_guide/decode_guide.json,sha256=Q4v-OZh_1MwdrFxDDVvj8T3evW3zzbSapRaGwFCdnX8,5425
11
- web_novel_scraper-1.0.3.dist-info/METADATA,sha256=VKG91J-QhL_NBjSuS29Em5_ZcFlw9oKf50-7WcJ97Lw,8423
12
- web_novel_scraper-1.0.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
13
- web_novel_scraper-1.0.3.dist-info/entry_points.txt,sha256=bqRvStfvSprSJc2EJXgKIbggWOXSePHFfVIZWy_plDQ,69
14
- web_novel_scraper-1.0.3.dist-info/RECORD,,