PyPI - ebk - Versions diffs - 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl - Mend

ebk 0.3.1py3-none-any.whl → 0.3.2py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ebk might be problematic. Click here for more details.

Files changed (61) hide show

ebk/ai/__init__.py +23 -0
ebk/ai/knowledge_graph.py +443 -0
ebk/ai/llm_providers/__init__.py +21 -0
ebk/ai/llm_providers/base.py +230 -0
ebk/ai/llm_providers/ollama.py +362 -0
ebk/ai/metadata_enrichment.py +396 -0
ebk/ai/question_generator.py +328 -0
ebk/ai/reading_companion.py +224 -0
ebk/ai/semantic_search.py +434 -0
ebk/ai/text_extractor.py +394 -0
ebk/cli.py +1097 -9
ebk/db/__init__.py +37 -0
ebk/db/migrations.py +180 -0
ebk/db/models.py +526 -0
ebk/db/session.py +144 -0
ebk/exports/__init__.py +0 -0
ebk/exports/base_exporter.py +218 -0
ebk/exports/html_library.py +1390 -0
ebk/exports/html_utils.py +117 -0
ebk/exports/hugo.py +59 -0
ebk/exports/jinja_export.py +287 -0
ebk/exports/multi_facet_export.py +164 -0
ebk/exports/symlink_dag.py +479 -0
ebk/exports/zip.py +25 -0
ebk/library_db.py +155 -0
ebk/repl/__init__.py +9 -0
ebk/repl/find.py +126 -0
ebk/repl/grep.py +174 -0
ebk/repl/shell.py +1677 -0
ebk/repl/text_utils.py +320 -0
ebk/services/__init__.py +11 -0
ebk/services/import_service.py +442 -0
ebk/services/tag_service.py +282 -0
ebk/services/text_extraction.py +317 -0
ebk/similarity/__init__.py +77 -0
ebk/similarity/base.py +154 -0
ebk/similarity/core.py +445 -0
ebk/similarity/extractors.py +168 -0
ebk/similarity/metrics.py +376 -0
ebk/vfs/__init__.py +101 -0
ebk/vfs/base.py +301 -0
ebk/vfs/library_vfs.py +124 -0
ebk/vfs/nodes/__init__.py +54 -0
ebk/vfs/nodes/authors.py +196 -0
ebk/vfs/nodes/books.py +480 -0
ebk/vfs/nodes/files.py +155 -0
ebk/vfs/nodes/metadata.py +385 -0
ebk/vfs/nodes/root.py +100 -0
ebk/vfs/nodes/similar.py +165 -0
ebk/vfs/nodes/subjects.py +184 -0
ebk/vfs/nodes/tags.py +371 -0
ebk/vfs/resolver.py +228 -0
{ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/METADATA +1 -1
ebk-0.3.2.dist-info/RECORD +69 -0
ebk-0.3.2.dist-info/entry_points.txt +2 -0
ebk-0.3.2.dist-info/top_level.txt +1 -0
ebk-0.3.1.dist-info/RECORD +0 -19
ebk-0.3.1.dist-info/entry_points.txt +0 -6
ebk-0.3.1.dist-info/top_level.txt +0 -2
{ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/WHEEL +0 -0
{ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/licenses/LICENSE +0 -0

ebk/exports/html_utils.py ADDED Viewed

@@ -0,0 +1,117 @@
+"""HTML sanitization utilities for secure template rendering."""
+import json
+import html
+from typing import Any, Dict, List
+import re
+def sanitize_for_html(text: str) -> str:
+    """
+    Return ``text`` escaped for safe inclusion in HTML output.
+    Any falsy input (``None``, ``""``, ``0``, ...) yields an empty string.
+    Everything else is converted with ``str()`` and passed through
+    ``html.escape`` so markup characters cannot be interpreted by a browser.
+    """
+    return html.escape(str(text)) if text else ""
+def sanitize_for_javascript(obj: Any) -> str:
+    """
+    Serialize ``obj`` to JSON that is safe to embed inside a ``<script>`` tag.
+    The result is still valid JSON (and a valid JavaScript literal) that
+    decodes back to the original data.
+    Args:
+        obj: Any JSON-serializable value.
+    Returns:
+        The JSON text with ``<``, ``>``, ``&``, U+2028 and U+2029 written as
+        ``\\uXXXX`` escapes.
+    Raises:
+        TypeError: If ``obj`` is not JSON-serializable (from ``json.dumps``).
+    """
+    json_str = json.dumps(obj, ensure_ascii=False)
+    # These characters can only occur inside JSON string literals, so replacing
+    # them with \uXXXX escapes never changes the decoded value. Escaping every
+    # "<" and ">" covers "</script>" in any letter case as well as "<!--" and
+    # "-->", without producing escapes such as "\!" that JSON parsers reject.
+    # U+2028/U+2029 are escaped because older JavaScript engines treat them as
+    # line terminators inside string literals.
+    for char, escaped in (
+        ('<', '\\u003c'),
+        ('>', '\\u003e'),
+        ('&', '\\u0026'),
+        ('\u2028', '\\u2028'),
+        ('\u2029', '\\u2029'),
+    ):
+        json_str = json_str.replace(char, escaped)
+    return json_str
+def sanitize_metadata(entry: Dict) -> Dict:
+    """
+    Sanitize metadata fields that will be displayed in HTML.
+    Returns a new dict with the same structure in which every string value,
+    including strings nested in lists and dicts, is HTML-escaped.
+    Path and ID fields (``file_paths``, ``cover_path``, ``unique_id``,
+    ``_entry_id``) are copied through untouched whatever their type.
+    Args:
+        entry: Metadata mapping for a single entry.
+    Returns:
+        A sanitized copy; ``entry`` itself is not modified.
+    """
+    # Don't sanitize file paths and IDs (they're not displayed as HTML).
+    # The check is made on the key before looking at the value type so that
+    # list-valued fields such as file_paths are exempt too.
+    raw_keys = ('file_paths', 'cover_path', 'unique_id', '_entry_id')
+    def _clean(value: Any) -> Any:
+        """Escape strings, recursing into lists and dicts."""
+        if isinstance(value, str):
+            return sanitize_for_html(value)
+        if isinstance(value, list):
+            return [_clean(item) for item in value]
+        if isinstance(value, dict):
+            return sanitize_metadata(value)
+        return value
+    return {
+        key: value if key in raw_keys else _clean(value)
+        for key, value in entry.items()
+    }
+def sanitize_entries_for_javascript(entries: List[Dict]) -> str:
+    """
+    Prepare entries for safe embedding in JavaScript.
+    Each entry is reduced to a fixed set of fields. Display fields are
+    HTML-escaped; path and ID fields are passed through unchanged. The list
+    is then encoded with ``sanitize_for_javascript``.
+    Fields that are missing or ``None`` become an empty string or empty list
+    rather than raising or being rendered as the text "None".
+    Args:
+        entries: Metadata dicts, one per library entry.
+    Returns:
+        JSON text suitable for embedding in a script block.
+    """
+    def _stringified(value: Any) -> str:
+        """Escape a scalar that may not be a string (e.g. a numeric date)."""
+        return "" if value is None else sanitize_for_html(str(value))
+    def _escaped_list(value: Any) -> List[str]:
+        """Escape every item of a list field, treating None as empty."""
+        return [sanitize_for_html(item) for item in (value or [])]
+    sanitized_entries = []
+    for entry in entries:
+        # Create a minimal, safe version for JavaScript
+        sanitized_entries.append({
+            'unique_id': entry.get('unique_id', ''),
+            'title': sanitize_for_html(entry.get('title', '')),
+            'creators': _escaped_list(entry.get('creators')),
+            'subjects': _escaped_list(entry.get('subjects')),
+            'language': sanitize_for_html(entry.get('language', '')),
+            'date': _stringified(entry.get('date', '')),
+            'publisher': _stringified(entry.get('publisher', '')),
+            'description': sanitize_for_html(entry.get('description', '')),
+            'cover_path': entry.get('cover_path', ''),
+            'file_paths': entry.get('file_paths') or [],
+            '_readable_name': sanitize_for_html(entry.get('_readable_name', '')),
+            '_entry_id': entry.get('_entry_id', '')
+        })
+    return sanitize_for_javascript(sanitized_entries)
+def create_safe_filename(text: str, max_length: int = 255) -> str:
+    """
+    Derive a filesystem-friendly name from arbitrary text.
+    HTML tags are dropped, the characters ``< > : " / \\ | ? *`` become
+    underscores and control characters (code points below 32) are removed.
+    Results longer than ``max_length`` are cut to ``max_length - 3``
+    characters followed by ``...``. Surrounding whitespace is stripped last.
+    """
+    without_tags = re.sub(r'<[^>]+>', '', text)
+    replaced = re.sub(r'[<>:"/\\|?*]', '_', without_tags)
+    printable = ''.join(filter(lambda ch: ord(ch) >= 32, replaced))
+    if len(printable) <= max_length:
+        return printable.strip()
+    # Too long: keep room for the ellipsis marker.
+    return (printable[:max_length-3] + '...').strip()

ebk/exports/hugo.py ADDED Viewed

@@ -0,0 +1,59 @@
+import json
+import shutil
+from pathlib import Path
+from typing import List
+import logging
+logger = logging.getLogger(__name__)
+def export_hugo(lib_dir, hugo_dir):
+    """
+    Export ebk library to Hugo-compatible Markdown files.
+    One Markdown page per book is written to ``<hugo_dir>/content/library``.
+    Ebook and cover files that exist on disk are copied to
+    ``<hugo_dir>/static/ebooks``.
+    Each field is read from the lowercase key (``title``, ``creators``,
+    ``subjects``, ``tags``, ``file_paths``, ``cover_path``) or, failing that,
+    from the capitalised key (``Title``, ``Author``, ``Tags``, ``File Path``,
+    ``Cover Path``). Missing fields are written as empty values instead of
+    raising ``KeyError``.
+    Args:
+        lib_dir (str): Path to the ebk library directory to export (contains `metadata.json` and ebook-related files)
+        hugo_dir (str): Path to the Hugo site directory
+    Raises:
+        FileNotFoundError: If ``metadata.json`` is missing from ``lib_dir``.
+    """
+    lib_dir = Path(lib_dir)
+    with open(lib_dir / "metadata.json", "r", encoding="utf-8") as f:
+        books = json.load(f)
+    hugo_dir = Path(hugo_dir)
+    content_dir = hugo_dir / "content" / "library"
+    static_dir = hugo_dir / "static" / "ebooks"
+    content_dir.mkdir(parents=True, exist_ok=True)
+    static_dir.mkdir(parents=True, exist_ok=True)
+    def _first(book, *keys, default=""):
+        """Return the first non-empty value stored under any of ``keys``."""
+        for key in keys:
+            value = book.get(key)
+            if value:
+                return value
+        return default
+    def _as_list(value, split=False):
+        """Coerce a list, scalar or (optionally comma-separated) string to a list of strings."""
+        if not value:
+            return []
+        if isinstance(value, str):
+            parts = value.split(",") if split else [value]
+            return [part.strip() for part in parts if part.strip()]
+        if isinstance(value, (list, tuple)):
+            return [str(item) for item in value]
+        return [str(value)]
+    def _yaml(value):
+        """Quote a value for front matter; JSON strings and lists are valid YAML flow syntax."""
+        return json.dumps(value, ensure_ascii=False)
+    def _locate(path_str):
+        """Resolve a stored path, trying it as given first and then relative to the library."""
+        path = Path(path_str)
+        if path.is_absolute() or path.exists():
+            return path
+        return lib_dir / path
+    for book in books:
+        title = str(_first(book, "title", "Title", default="Untitled"))
+        creators = _as_list(_first(book, "creators", "Author"))
+        subjects = _as_list(_first(book, "subjects"))
+        tags = _as_list(_first(book, "tags", "Tags"), split=True)
+        file_paths = _as_list(_first(book, "file_paths", "file_path", "File Path"))
+        cover_path = str(_first(book, "cover_path", "Cover Path"))
+        ebook_url = f"/ebooks/{Path(file_paths[0]).name}" if file_paths else ""
+        cover_url = f"/ebooks/{Path(cover_path).name}" if cover_path else ""
+        # Path separators in a title would otherwise point outside content_dir.
+        slug = title.replace(" ", "-").replace("/", "-").replace("\\", "-").lower()
+        md_file = content_dir / f"{slug}.md"
+        with open(md_file, "w", encoding="utf-8") as md:
+            md.write("---\n")
+            md.write(f"title: {_yaml(title)}\n")
+            md.write(f"creators: {_yaml(creators)}\n")
+            md.write(f"subjects: {_yaml(subjects)}\n")
+            md.write(f"description: {_yaml(str(_first(book, 'description')))}\n")
+            # The date is left unquoted, as before, so it stays a YAML date value.
+            md.write(f"date: {_first(book, 'date')}\n")
+            md.write(f"tags: {_yaml(tags)}\n")
+            md.write(f"ebook_file: {_yaml(ebook_url)}\n")
+            md.write(f"cover_image: {_yaml(cover_url)}\n")
+            md.write("---\n\n")
+            md.write(f"# {title}\n\n")
+            md.write(f"Author: {', '.join(creators)}\n\n")
+            if ebook_url:
+                md.write(f"[Download eBook]({ebook_url})\n")
+        # Copy eBook and cover to static directory
+        for path_str in [*file_paths, cover_path]:
+            if not path_str:
+                continue
+            source_file = _locate(path_str)
+            if source_file.exists():
+                shutil.copy2(source_file, static_dir)
+    logger.debug(f"Exported {len(books)} books to Hugo site at '{hugo_dir}'")

ebk/exports/jinja_export.py ADDED Viewed

@@ -0,0 +1,287 @@
+"""
+Flexible Jinja2-based export system for ebk libraries.
+This module provides a template-driven approach to exporting ebook metadata
+in various formats, with Hugo as the primary implementation.
+"""
+import os
+import json
+import shutil
+from pathlib import Path
+from typing import Dict, List, Optional, Any
+from jinja2 import Environment, FileSystemLoader, select_autoescape
+import logging
+from slugify import slugify
+from collections import defaultdict
+logger = logging.getLogger(__name__)
+class JinjaExporter:
+    """
+    Flexible export system using Jinja2 templates.
+    Templates are looked up under ``hugo/`` in the template directory:
+    ``hugo/book.md`` for a book page, ``hugo/index.md`` for a group index and
+    ``hugo/library.md`` for the main library index.
+    """
+    def __init__(self, template_dir: Optional[Path] = None):
+        """
+        Initialize the exporter with a template directory.
+        Args:
+            template_dir: Path to custom templates. If None, uses the
+                ``templates`` directory next to this module.
+        """
+        if template_dir is None:
+            template_dir = Path(__file__).parent / "templates"
+        self.env = Environment(
+            loader=FileSystemLoader(template_dir),
+            autoescape=select_autoescape(['html', 'xml']),
+            trim_blocks=True,
+            lstrip_blocks=True
+        )
+        # Add custom filters
+        self.env.filters['slugify'] = slugify
+        self.env.filters['join_list'] = lambda x: ', '.join(x) if isinstance(x, list) else x
+        self.env.filters['default_if_none'] = lambda x, default='': x if x is not None else default
+    def export_hugo(self, lib_dir: str, hugo_dir: str,
+                    organize_by: str = "flat",
+                    create_indexes: bool = True,
+                    copy_files: bool = True):
+        """
+        Export library to Hugo with flexible organization options.
+        Book pages go to ``<hugo_dir>/content/library`` (in one sub-directory
+        per group unless ``organize_by`` is "flat"). Copied files go to
+        ``<hugo_dir>/static/ebooks``.
+        Args:
+            lib_dir: Path to ebk library (must contain ``metadata.json``)
+            hugo_dir: Path to Hugo site directory
+            organize_by: Organization method - "flat", "year", "language", "subject", "creator"
+            create_indexes: Whether to create index pages for categories
+            copy_files: Whether to copy ebook and cover files
+        """
+        lib_path = Path(lib_dir)
+        hugo_path = Path(hugo_dir)
+        # Load metadata
+        with open(lib_path / "metadata.json", "r", encoding="utf-8") as f:
+            books = json.load(f)
+        # Prepare books with normalized fields
+        books = self._normalize_metadata(books)
+        # Create directory structure
+        content_dir = hugo_path / "content" / "library"
+        static_dir = hugo_path / "static" / "ebooks"
+        content_dir.mkdir(parents=True, exist_ok=True)
+        static_dir.mkdir(parents=True, exist_ok=True)
+        # Group books by organization method
+        grouped_books = self._group_books(books, organize_by)
+        # Export individual book pages
+        for group_key, group_books in grouped_books.items():
+            group_dir = content_dir / group_key if organize_by != "flat" else content_dir
+            group_dir.mkdir(parents=True, exist_ok=True)
+            for book in group_books:
+                self._export_book(book, group_dir, static_dir, lib_path, copy_files)
+        # Create index pages
+        if create_indexes:
+            self._create_indexes(grouped_books, content_dir, organize_by)
+        # Create main library index
+        self._create_main_index(books, content_dir, organize_by)
+        logger.info(f"Exported {len(books)} books to Hugo site at '{hugo_dir}'")
+    def _normalize_metadata(self, books: List[Dict]) -> List[Dict]:
+        """
+        Normalize metadata fields for consistent access.
+        Returns one new dict per book with defaults filled in for missing
+        keys, plus the derived fields ``year`` (first four characters of
+        ``date``, or ``''``) and ``slug``. The source dict is kept under
+        ``_original``.
+        """
+        normalized = []
+        for book in books:
+            # Create a normalized version with consistent field names
+            norm = {
+                'title': book.get('title', 'Unknown Title'),
+                'creators': book.get('creators', []),
+                'subjects': book.get('subjects', []),
+                'description': book.get('description', ''),
+                'language': book.get('language', 'en'),
+                'date': book.get('date', ''),
+                'publisher': book.get('publisher', ''),
+                'identifiers': book.get('identifiers', {}),
+                'file_paths': book.get('file_paths', []),
+                'cover_path': book.get('cover_path', ''),
+                'unique_id': book.get('unique_id', ''),
+                # Keep original data for backward compatibility
+                '_original': book
+            }
+            # Extract year from date if available
+            if norm['date']:
+                try:
+                    norm['year'] = norm['date'][:4]
+                except (IndexError, TypeError, AttributeError):
+                    norm['year'] = ''  # Invalid date format
+            else:
+                norm['year'] = ''
+            # Generate slug. unique_id may be stored as null or as a
+            # non-string, so coerce it before taking the 8-character prefix.
+            id_prefix = str(norm['unique_id'] or '')[:8]
+            norm['slug'] = slugify(f"{norm['title']}-{id_prefix}")
+            normalized.append(norm)
+        return normalized
+    def _group_books(self, books: List[Dict], organize_by: str) -> Dict[str, List[Dict]]:
+        """
+        Group books by specified organization method.
+        Books whose grouping field is empty or missing are placed in a
+        fallback group (``unknown-year``, ``unknown-language``,
+        ``uncategorized`` or ``unknown-creator``), so every book is exported
+        and every group key is a usable directory name. A book with several
+        subjects or creators appears in each matching group.
+        Args:
+            books: Normalized book dicts.
+            organize_by: "flat", "year", "language", "subject" or "creator";
+                any other value behaves like "flat".
+        Returns:
+            Mapping of group key to books; flat mode uses the single key "".
+        """
+        grouped = defaultdict(list)
+        if organize_by == "year":
+            for book in books:
+                grouped[book.get('year') or 'unknown-year'].append(book)
+        elif organize_by == "language":
+            for book in books:
+                grouped[book.get('language') or 'unknown-language'].append(book)
+        elif organize_by == "subject":
+            for book in books:
+                # "or" rather than a .get() default: normalized books always
+                # carry the key, possibly with an empty list.
+                for subject in book.get('subjects') or ['uncategorized']:
+                    grouped[slugify(subject) or 'uncategorized'].append(book)
+        elif organize_by == "creator":
+            for book in books:
+                for creator in book.get('creators') or ['unknown-creator']:
+                    grouped[slugify(creator) or 'unknown-creator'].append(book)
+        else:
+            # "flat" and any unrecognised value: a single unnamed group
+            grouped[""] = books
+        return dict(grouped)
+    def _export_book(self, book: Dict, output_dir: Path, static_dir: Path,
+                     lib_path: Path, copy_files: bool):
+        """
+        Export a single book.
+        Renders ``hugo/book.md`` to ``<output_dir>/<slug>.md``. When
+        ``copy_files`` is true, ebook and cover files found under
+        ``lib_path`` are copied to ``static_dir`` and their ``/ebooks/...``
+        URLs are passed to the template. Otherwise the template receives no
+        file URLs.
+        """
+        # Load book template
+        template = self.env.get_template('hugo/book.md')
+        # Prepare file paths for Hugo
+        ebook_urls = []
+        if book['file_paths']:
+            for file_path in book['file_paths']:
+                if copy_files and file_path:
+                    src = lib_path / file_path
+                    if src.exists():
+                        dst = static_dir / src.name
+                        shutil.copy2(src, dst)
+                        ebook_urls.append(f"/ebooks/{src.name}")
+        cover_url = ""
+        if book['cover_path'] and copy_files:
+            src = lib_path / book['cover_path']
+            if src.exists():
+                dst = static_dir / src.name
+                shutil.copy2(src, dst)
+                cover_url = f"/ebooks/{src.name}"
+        # Render template
+        content = template.render(
+            book=book,
+            ebook_urls=ebook_urls,
+            cover_url=cover_url
+        )
+        # Write file
+        output_file = output_dir / f"{book['slug']}.md"
+        with open(output_file, 'w', encoding='utf-8') as f:
+            f.write(content)
+    def _create_indexes(self, grouped_books: Dict[str, List[Dict]],
+                       content_dir: Path, organize_by: str):
+        """
+        Create index pages for each group.
+        Writes ``_index.md`` (rendered from ``hugo/index.md``) into each
+        group's directory. Does nothing in flat mode.
+        """
+        if organize_by == "flat":
+            return
+        template = self.env.get_template('hugo/index.md')
+        for group_key, books in grouped_books.items():
+            if not group_key:  # Skip empty group
+                continue
+            group_dir = content_dir / group_key
+            index_file = group_dir / "_index.md"
+            # Determine group title
+            if organize_by == "year":
+                group_title = f"Books from {group_key}"
+            elif organize_by == "language":
+                group_title = f"Books in {group_key}"
+            elif organize_by == "subject":
+                group_title = f"Subject: {group_key.replace('-', ' ').title()}"
+            elif organize_by == "creator":
+                group_title = f"Books by {group_key.replace('-', ' ').title()}"
+            else:
+                group_title = group_key.replace('-', ' ').title()
+            content = template.render(
+                title=group_title,
+                organize_by=organize_by,
+                group_key=group_key,
+                books=books,
+                book_count=len(books)
+            )
+            with open(index_file, 'w', encoding='utf-8') as f:
+                f.write(content)
+    def _create_main_index(self, books: List[Dict], content_dir: Path, organize_by: str):
+        """
+        Create main library index page.
+        Computes library statistics (totals, per-language and per-year counts,
+        ten most frequent creators and subjects) and renders
+        ``hugo/library.md`` to ``<content_dir>/_index.md``.
+        """
+        template = self.env.get_template('hugo/library.md')
+        # Calculate statistics
+        stats = {
+            'total_books': len(books),
+            'total_creators': len(set(creator for book in books for creator in book.get('creators', []))),
+            'total_subjects': len(set(subject for book in books for subject in book.get('subjects', []))),
+            'languages': defaultdict(int),
+            'years': defaultdict(int),
+            'top_creators': defaultdict(int),
+            'top_subjects': defaultdict(int)
+        }
+        for book in books:
+            # Language stats
+            lang = book.get('language', 'unknown')
+            stats['languages'][lang] += 1
+            # Year stats
+            year = book.get('year', 'unknown')
+            if year:
+                stats['years'][year] += 1
+            # Creator stats
+            for creator in book.get('creators', []):
+                stats['top_creators'][creator] += 1
+            # Subject stats
+            for subject in book.get('subjects', []):
+                stats['top_subjects'][subject] += 1
+        # Sort and limit top items
+        stats['top_creators'] = sorted(stats['top_creators'].items(),
+                                     key=lambda x: x[1], reverse=True)[:10]
+        stats['top_subjects'] = sorted(stats['top_subjects'].items(),
+                                     key=lambda x: x[1], reverse=True)[:10]
+        content = template.render(
+            title="Library",
+            books=books,
+            stats=stats,
+            organize_by=organize_by
+        )
+        index_file = content_dir / "_index.md"
+        with open(index_file, 'w', encoding='utf-8') as f:
+            f.write(content)

ebk/exports/multi_facet_export.py ADDED Viewed

@@ -0,0 +1,164 @@
+"""Multi-faceted export for ebk libraries with sidebar navigation."""
+from pathlib import Path
+from typing import Dict, List, Set, Optional
+import json
+import shutil
+from collections import defaultdict
+import re
+from datetime import datetime
+from jinja2 import Environment, FileSystemLoader
+from .html_utils import sanitize_entries_for_javascript, sanitize_for_html, create_safe_filename
+from .base_exporter import BaseExporter
+class MultiFacetExporter(BaseExporter):
+    """
+    Export library with multiple faceted navigation (subjects, authors, etc.).
+    The export contains one ``_books/<entry id>/`` directory per entry, an
+    optional ``index.html`` rendered from ``multi_facet_index.html``, and a
+    README. File handling, JSON writing and README creation are done by
+    helpers that are not defined in this class (presumably inherited from
+    ``BaseExporter``; not shown here).
+    """
+    def __init__(self, facets: Optional[Dict[str, str]] = None):
+        """
+        Initialize the multi-facet exporter.
+        Args:
+            facets: Dictionary mapping facet names to metadata fields
+                   e.g., {"Subjects": "subjects", "Authors": "creators", "Years": "date"}.
+                   Defaults to subjects, authors, publishers and languages.
+        """
+        super().__init__()
+        self.facets = facets or {
+            "Subjects": "subjects",
+            "Authors": "creators",
+            "Publishers": "publisher",
+            "Languages": "language"
+        }
+    def export(self, library_path: Path, output_path: Path,
+               include_files: bool = False,
+               create_index: bool = True, **options):
+        """
+        Export the library with multi-faceted navigation.
+        Args:
+            library_path: Library directory to read metadata and files from.
+            output_path: Directory that receives the export.
+            include_files: Copy entry files when true, symlink them otherwise.
+            create_index: Whether to write ``index.html``.
+            **options: Accepted for interface compatibility; not used here.
+        """
+        # Use base class methods
+        entries = self.load_metadata(library_path)
+        self.prepare_output_directory(output_path)
+        # Build facet data
+        facet_data = self._build_facet_data(entries)
+        # Create _books directory structure
+        books_dir = output_path / "_books"
+        books_dir.mkdir()
+        # Process each entry
+        for entry in entries:
+            entry_id = entry.get("unique_id", "")
+            if not entry_id:
+                continue
+            # Create entry directory
+            entry_dir = books_dir / self._sanitize_filename(entry_id)
+            entry_dir.mkdir(exist_ok=True)
+            # Use base class file operations
+            if include_files:
+                self.copy_entry_files(entry, library_path, entry_dir)
+            else:
+                self.symlink_entry_files(entry, library_path, entry_dir)
+            # Write entry metadata using base class method
+            self.write_json(entry, entry_dir / "metadata.json")
+            # Add computed fields for template. This happens after write_json,
+            # so they are not part of the per-entry metadata.json.
+            entry["_entry_id"] = entry_id
+            entry["_readable_name"] = self.get_readable_name(entry)
+        # Create index.html if requested
+        if create_index:
+            self._create_index_file(output_path, entries, facet_data)
+        # Create README using base class method
+        stats = {
+            'total_entries': len(entries),
+            'export_date': datetime.now().isoformat(),
+            'export_type': 'Multi-Faceted Export',
+            'structure_description': f"Organized by {len(self.facets)} facets with {len(entries)} entries"
+        }
+        self.create_readme(output_path, stats)
+    def _build_facet_data(self, entries: List[Dict]) -> Dict[str, Dict]:
+        """
+        Build facet data structure from entries.
+        Returns:
+            Mapping of metadata field name to
+            ``{"display_name": <facet name>, "items": {<value>: <count>}}``.
+            Scalar fields are treated as one-element lists and empty values
+            are skipped. For the ``date`` field, values are counted by their
+            first four characters, and only when those are all digits.
+        """
+        facet_data = {}
+        for facet_name, field_name in self.facets.items():
+            items = defaultdict(int)
+            for entry in entries:
+                values = entry.get(field_name, [])
+                if not isinstance(values, list):
+                    values = [values] if values else []
+                for value in values:
+                    if not value:  # Skip empty values
+                        continue
+                    if field_name == "date":
+                        # Special handling for dates - extract year
+                        year = str(value)[:4]
+                        if year.isdigit():
+                            items[year] += 1
+                    else:
+                        items[str(value)] += 1
+            facet_data[field_name] = {
+                "display_name": facet_name,
+                "items": dict(items)
+            }
+        return facet_data
+    def _create_index_file(self, output_path: Path, entries: List[Dict],
+                          facet_data: Dict[str, Dict]):
+        """
+        Create the multi-faceted index.html file.
+        Entries are flattened to strings and lists of strings, with HTML tags
+        stripped from descriptions and the text cut to 500 characters. They
+        are then encoded for embedding in JavaScript and rendered through the
+        ``multi_facet_index.html`` template. The README is written by
+        ``export``, not here.
+        """
+        # Prepare entries for JSON
+        clean_entries = []
+        for entry in entries:
+            clean_entry = {}
+            for key, value in entry.items():
+                if isinstance(value, str):
+                    if key == "description":
+                        # Strip HTML and limit length
+                        value = re.sub(r'<[^>]+>', '', value)
+                        if len(value) > 500:
+                            value = value[:500] + "..."
+                    clean_entry[key] = value
+                elif isinstance(value, list):
+                    clean_entry[key] = [str(v) for v in value]
+                else:
+                    clean_entry[key] = str(value)
+            clean_entries.append(clean_entry)
+        # Use safe JSON encoding for JavaScript embedding
+        entries_json = sanitize_entries_for_javascript(clean_entries)
+        # Set up Jinja2
+        template_dir = Path(__file__).parent / "templates"
+        env = Environment(loader=FileSystemLoader(str(template_dir)))
+        template = env.get_template("multi_facet_index.html")
+        # Render template with sanitized data
+        html_content = template.render(
+            title=sanitize_for_html("EBK Library"),
+            entries=entries,
+            entries_json=entries_json,  # Already sanitized
+            facets=facet_data,
+            is_subdir=False
+        )
+        # Write the file
+        index_path = output_path / "index.html"
+        with open(index_path, "w", encoding="utf-8") as f:
+            f.write(html_content)

ebk 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

Potentially problematic release.

ebk 0.3.1py3-none-any.whl → 0.3.2py3-none-any.whl