PyPI - pdf-file-renamer - Versions diffs - 0.4.2__py3-none-any.whl - Mend

pdf-file-renamer 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

pdf_file_renamer-0.4.2.dist-info/METADATA +245 -0
pdf_file_renamer-0.4.2.dist-info/RECORD +26 -0
pdf_file_renamer-0.4.2.dist-info/WHEEL +5 -0
pdf_file_renamer-0.4.2.dist-info/entry_points.txt +2 -0
pdf_file_renamer-0.4.2.dist-info/licenses/LICENSE +21 -0
pdf_file_renamer-0.4.2.dist-info/top_level.txt +1 -0
pdf_renamer/__init__.py +3 -0
pdf_renamer/application/__init__.py +7 -0
pdf_renamer/application/filename_service.py +70 -0
pdf_renamer/application/pdf_rename_workflow.py +144 -0
pdf_renamer/application/rename_service.py +79 -0
pdf_renamer/domain/__init__.py +25 -0
pdf_renamer/domain/models.py +80 -0
pdf_renamer/domain/ports.py +106 -0
pdf_renamer/infrastructure/__init__.py +5 -0
pdf_renamer/infrastructure/config.py +94 -0
pdf_renamer/infrastructure/llm/__init__.py +5 -0
pdf_renamer/infrastructure/llm/pydantic_ai_provider.py +234 -0
pdf_renamer/infrastructure/pdf/__init__.py +7 -0
pdf_renamer/infrastructure/pdf/composite.py +57 -0
pdf_renamer/infrastructure/pdf/docling_extractor.py +116 -0
pdf_renamer/infrastructure/pdf/pymupdf_extractor.py +165 -0
pdf_renamer/main.py +6 -0
pdf_renamer/presentation/__init__.py +6 -0
pdf_renamer/presentation/cli.py +233 -0
pdf_renamer/presentation/formatters.py +216 -0

pdf_renamer/infrastructure/pdf/docling_extractor.py ADDED Viewed

@@ -0,0 +1,116 @@
+"""Docling-based PDF extractor for structure-aware text extraction."""
+import re
+from pathlib import Path
+from docling_core.types.doc.page import TextCellUnit
+from docling_parse.pdf_parser import DoclingPdfParser
+from pdf_renamer.domain.models import PDFContent, PDFMetadata
+from pdf_renamer.domain.ports import PDFExtractor
+class DoclingPDFExtractor(PDFExtractor):
+    """PDF extractor using docling-parse for better structure-aware extraction."""
+    def __init__(self, max_pages: int = 5, max_chars: int = 8000) -> None:
+        """
+        Initialize the Docling PDF extractor.
+        Args:
+            max_pages: Maximum pages to extract
+            max_chars: Maximum characters to extract
+        """
+        self.max_pages = max_pages
+        self.max_chars = max_chars
+        self._parser = DoclingPdfParser()
+    async def extract(self, pdf_path: Path) -> PDFContent:
+        """
+        Extract text and metadata from PDF using docling-parse.
+        Args:
+            pdf_path: Path to PDF file
+        Returns:
+            PDFContent with extracted text and metadata
+        Raises:
+            RuntimeError: If extraction fails
+        """
+        try:
+            pdf_doc = self._parser.load(path_or_stream=str(pdf_path))
+            text_parts: list[str] = []
+            total_chars = 0
+            page_count = 0
+            for page_no, pred_page in pdf_doc.iterate_pages():
+                page_count += 1
+                if page_no >= self.max_pages:
+                    break
+                # Extract text at line level for better structure preservation
+                page_lines: list[str] = []
+                for line in pred_page.iterate_cells(unit_type=TextCellUnit.LINE):
+                    page_lines.append(line.text)
+                page_text = "\n".join(page_lines)
+                # Add page text until we hit the character limit
+                remaining_chars = self.max_chars - total_chars
+                if remaining_chars <= 0:
+                    break
+                text_parts.append(page_text[:remaining_chars])
+                total_chars += len(page_text)
+            extracted_text = "\n".join(text_parts).strip()
+            # Extract metadata using separate method
+            metadata = await self._extract_metadata(pdf_path, extracted_text)
+            return PDFContent(text=extracted_text, metadata=metadata, page_count=page_count)
+        except Exception as e:
+            msg = f"Failed to extract text from {pdf_path} using docling-parse: {e}"
+            raise RuntimeError(msg) from e
+    async def _extract_metadata(self, pdf_path: Path, text: str) -> PDFMetadata:
+        """
+        Extract metadata from PDF.
+        Args:
+            pdf_path: Path to PDF file
+            text: Extracted text content
+        Returns:
+            PDFMetadata
+        """
+        # Note: docling-parse doesn't provide document-level metadata
+        # So we extract focused metadata from the text content
+        header_text = text[:500] if text else ""
+        # Extract year hints
+        year_pattern = r"\b(19\d{2}|20\d{2})\b"
+        years = re.findall(year_pattern, header_text)
+        # Extract email hints
+        email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"
+        emails = re.findall(email_pattern, text[:2000])
+        # Look for author indicators
+        author_indicators = ["by ", "author:", "authors:", "written by"]
+        author_hints: list[str] = []
+        text_lower = text[:2000].lower()
+        for indicator in author_indicators:
+            if indicator in text_lower:
+                idx = text_lower.index(indicator)
+                author_hints.append(text[idx : idx + 100])
+        return PDFMetadata(
+            header_text=header_text,
+            year_hints=years[:3] if years else None,
+            email_hints=emails[:3] if emails else None,
+            author_hints=author_hints[:2] if author_hints else None,
+        )

pdf_renamer/infrastructure/pdf/pymupdf_extractor.py ADDED Viewed

@@ -0,0 +1,165 @@
+"""PyMuPDF-based PDF extractor with metadata support and OCR fallback."""
+import re
+from pathlib import Path
+import pymupdf
+from pdf_renamer.domain.models import PDFContent, PDFMetadata
+from pdf_renamer.domain.ports import PDFExtractor
+class PyMuPDFExtractor(PDFExtractor):
+    """PDF extractor using PyMuPDF with metadata and OCR support."""
+    def __init__(self, max_pages: int = 5, max_chars: int = 8000, enable_ocr: bool = True) -> None:
+        """
+        Initialize the PyMuPDF extractor.
+        Args:
+            max_pages: Maximum pages to extract
+            max_chars: Maximum characters to extract
+            enable_ocr: Enable OCR for scanned PDFs
+        """
+        self.max_pages = max_pages
+        self.max_chars = max_chars
+        self.enable_ocr = enable_ocr
+    async def extract(self, pdf_path: Path) -> PDFContent:
+        """
+        Extract text and metadata from PDF using PyMuPDF.
+        Args:
+            pdf_path: Path to PDF file
+        Returns:
+            PDFContent with extracted text and metadata
+        Raises:
+            RuntimeError: If extraction fails
+        """
+        try:
+            doc = pymupdf.open(pdf_path)
+            text_parts: list[str] = []
+            total_chars = 0
+            for page_num in range(min(self.max_pages, len(doc))):
+                page = doc[page_num]
+                page_text = page.get_text()
+                # Add page text until we hit the character limit
+                remaining_chars = self.max_chars - total_chars
+                if remaining_chars <= 0:
+                    break
+                text_parts.append(page_text[:remaining_chars])
+                total_chars += len(page_text)
+            extracted_text = "\n".join(text_parts).strip()
+            # If very little text and OCR enabled, try OCR
+            if len(extracted_text) < 200 and self.enable_ocr:
+                extracted_text = await self._extract_with_ocr(pdf_path, doc)
+            # Extract metadata
+            metadata = await self._extract_metadata(pdf_path, doc, extracted_text)
+            page_count = len(doc)
+            doc.close()
+            return PDFContent(text=extracted_text, metadata=metadata, page_count=page_count)
+        except Exception as e:
+            msg = f"Failed to extract text from {pdf_path} using PyMuPDF: {e}"
+            raise RuntimeError(msg) from e
+    async def _extract_with_ocr(self, pdf_path: Path, doc: pymupdf.Document) -> str:
+        """
+        Extract text using OCR for scanned PDFs.
+        Args:
+            pdf_path: Path to PDF file
+            doc: PyMuPDF document
+        Returns:
+            Extracted text
+        """
+        text_parts: list[str] = []
+        total_chars = 0
+        for page_num in range(min(self.max_pages, len(doc))):
+            page = doc[page_num]
+            try:
+                # Try OCR with Tesseract (if available)
+                tp = page.get_textpage(flags=0)
+                page_text = tp.extractText()
+                # If still no text, try with flags
+                if not page_text or len(page_text.strip()) < 50:
+                    page_text = page.get_text("text", flags=pymupdf.TEXT_PRESERVE_WHITESPACE)
+            except Exception:
+                # If OCR fails, get whatever text is available
+                page_text = page.get_text()
+            # Add page text until we hit the character limit
+            remaining_chars = self.max_chars - total_chars
+            if remaining_chars <= 0:
+                break
+            text_parts.append(page_text[:remaining_chars])
+            total_chars += len(page_text)
+        return "\n".join(text_parts).strip()
+    async def _extract_metadata(
+        self, pdf_path: Path, doc: pymupdf.Document, text: str
+    ) -> PDFMetadata:
+        """
+        Extract metadata from PDF.
+        Args:
+            pdf_path: Path to PDF file
+            doc: PyMuPDF document
+            text: Extracted text content
+        Returns:
+            PDFMetadata
+        """
+        # Get PDF metadata
+        meta = doc.metadata or {}
+        # Extract focused metadata from text
+        header_text = text[:500] if text else ""
+        # Extract year hints
+        year_pattern = r"\b(19\d{2}|20\d{2})\b"
+        years = re.findall(year_pattern, header_text)
+        # Extract email hints
+        email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"
+        emails = re.findall(email_pattern, text[:2000])
+        # Look for author indicators
+        author_indicators = ["by ", "author:", "authors:", "written by"]
+        author_hints: list[str] = []
+        text_lower = text[:2000].lower()
+        for indicator in author_indicators:
+            if indicator in text_lower:
+                idx = text_lower.index(indicator)
+                author_hints.append(text[idx : idx + 100])
+        return PDFMetadata(
+            title=meta.get("title"),
+            author=meta.get("author"),
+            subject=meta.get("subject"),
+            keywords=meta.get("keywords"),
+            creator=meta.get("creator"),
+            producer=meta.get("producer"),
+            creation_date=meta.get("creationDate"),
+            modification_date=meta.get("modDate"),
+            header_text=header_text,
+            year_hints=years[:3] if years else None,
+            email_hints=emails[:3] if emails else None,
+            author_hints=author_hints[:2] if author_hints else None,
+        )

pdf_renamer/main.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Main entry point for the PDF renamer application."""
+from pdf_renamer.presentation.cli import app
+# Run the Typer application only when this module is executed as a script.
+if __name__ == "__main__":
+    app()

pdf_renamer/presentation/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Presentation layer - CLI and user interaction."""
+from pdf_renamer.presentation.cli import app
+from pdf_renamer.presentation.formatters import ProgressDisplay
+# Names re-exported by this package: the progress display and the CLI app imported above.
+__all__ = ["ProgressDisplay", "app"]

pdf_renamer/presentation/cli.py ADDED Viewed

@@ -0,0 +1,233 @@
+"""CLI interface using Typer."""
+import asyncio
+import contextlib
+from pathlib import Path
+from typing import Annotated
+import typer
+from rich.console import Console
+from rich.live import Live
+from pdf_renamer.application import (
+    FilenameService,
+    PDFRenameWorkflow,
+    RenameService,
+)
+from pdf_renamer.infrastructure.config import Settings
+from pdf_renamer.infrastructure.llm import PydanticAIProvider
+from pdf_renamer.infrastructure.pdf import (
+    CompositePDFExtractor,
+    DoclingPDFExtractor,
+    PyMuPDFExtractor,
+)
+from pdf_renamer.presentation.formatters import (
+    InteractivePrompt,
+    ProgressDisplay,
+    ResultsTable,
+)
+# Typer application object; commands in this module register on it via @app.command().
+app = typer.Typer(help="Intelligent PDF renaming using LLMs")
+# Module-level Rich console used for the CLI output printed by this module.
+console = Console()
+def create_workflow(settings: Settings) -> PDFRenameWorkflow:
+    """
+    Create the workflow with all dependencies (Dependency Injection).
+    This is the "Composition Root" where we wire up all dependencies.
+    Args:
+        settings: Application settings
+    Returns:
+        Configured PDFRenameWorkflow
+    """
+    # Create PDF extractor (composite with fallback strategy)
+    extractors = [
+        DoclingPDFExtractor(max_pages=settings.pdf_max_pages, max_chars=settings.pdf_max_chars),
+        PyMuPDFExtractor(
+            max_pages=settings.pdf_max_pages,
+            max_chars=settings.pdf_max_chars,
+            enable_ocr=True,
+        ),
+    ]
+    pdf_extractor = CompositePDFExtractor(extractors)
+    # Create LLM provider
+    llm_provider = PydanticAIProvider(
+        model_name=settings.llm_model,
+        api_key=settings.openai_api_key,
+        base_url=settings.llm_base_url,
+        retry_max_attempts=settings.retry_max_attempts,
+        retry_min_wait=settings.retry_min_wait,
+        retry_max_wait=settings.retry_max_wait,
+    )
+    # Create application services
+    filename_service = FilenameService(llm_provider)
+    file_renamer = RenameService()
+    # Create workflow
+    return PDFRenameWorkflow(
+        pdf_extractor=pdf_extractor,
+        filename_generator=filename_service,
+        file_renamer=file_renamer,
+        max_concurrent_api=settings.max_concurrent_api,
+        max_concurrent_pdf=settings.max_concurrent_pdf,
+    )
+@app.command()
+def main(
+    directory: Annotated[
+        Path, typer.Argument(help="Directory containing PDF files to rename")
+    ] = Path.cwd(),
+    dry_run: Annotated[
+        bool, typer.Option("--dry-run/--no-dry-run", help="Show suggestions without renaming")
+    ] = True,
+    model: Annotated[
+        str | None,
+        typer.Option("--model", help="Model to use (overrides config)"),
+    ] = None,
+    url: Annotated[
+        str | None,
+        typer.Option("--url", help="Custom base URL for OpenAI-compatible APIs"),
+    ] = None,
+    interactive: Annotated[
+        bool, typer.Option("--interactive", "-i", help="Confirm each rename")
+    ] = False,
+    pattern: Annotated[str, typer.Option("--pattern", help="Glob pattern for PDF files")] = "*.pdf",
+    output_dir: Annotated[
+        Path | None,
+        typer.Option("--output-dir", "-o", help="Move renamed files to this directory"),
+    ] = None,
+) -> None:
+    """Rename PDF files in a directory using LLM-generated suggestions."""
+    # Load settings
+    settings = Settings()
+    # Override settings from CLI args
+    if model:
+        settings.llm_model = model
+    if url:
+        settings.llm_base_url = url
+    # Validate output directory
+    if output_dir:
+        output_dir.mkdir(parents=True, exist_ok=True)
+        if not output_dir.is_dir():
+            console.print(f"[red]Error: {output_dir} is not a directory[/red]")
+            raise typer.Exit(1)
+    # Find PDF files
+    pdf_files = sorted(directory.glob(pattern))
+    if not pdf_files:
+        console.print(f"[yellow]No PDF files found matching '{pattern}' in {directory}[/yellow]")
+        raise typer.Exit(0)
+    console.print(f"Found {len(pdf_files)} PDF files to process\n")
+    # Create workflow
+    workflow = create_workflow(settings)
+    # Process files with progress display
+    async def process_all() -> list:
+        progress = ProgressDisplay(console, len(pdf_files))
+        def status_callback(filename: str, status: dict[str, str]) -> None:
+            progress.update_status(filename, status)
+        # Run with live display
+        with Live(progress.create_display(), console=console, refresh_per_second=4) as live:
+            async def update_display() -> None:
+                while True:
+                    live.update(progress.create_display())
+                    await asyncio.sleep(0.25)
+            display_task = asyncio.create_task(update_display())
+            results = await workflow.process_batch(pdf_files, status_callback)
+            display_task.cancel()
+            with contextlib.suppress(asyncio.CancelledError):
+                await display_task
+            live.update(progress.create_display())
+        return results
+    # Run processing
+    console.print(
+        f"[bold]Processing {len(pdf_files)} PDFs with max {settings.max_concurrent_api} "
+        f"concurrent API calls and {settings.max_concurrent_pdf} concurrent extractions[/bold]\n"
+    )
+    results = asyncio.run(process_all())
+    # Filter successful operations
+    operations = [r for r in results if r is not None]
+    if not operations:
+        console.print("[red]No files could be processed successfully[/red]")
+        raise typer.Exit(1)
+    # Display results (if not interactive)
+    if not interactive:
+        ResultsTable.create(operations, console)
+    # Execute renames
+    if not dry_run or interactive:
+        renamed_count = 0
+        skipped_count = 0
+        async def execute_renames() -> None:
+            nonlocal renamed_count, skipped_count
+            prompt = InteractivePrompt(console) if interactive else None
+            for operation in operations:
+                # Interactive mode
+                if interactive and prompt:
+                    final_name, should_rename = await prompt.prompt_for_action(operation)
+                    if not should_rename:
+                        skipped_count += 1
+                        continue
+                    # Update operation with user's choice
+                    operation.suggested_filename = final_name
+                # Skip if no change
+                if not output_dir and operation.original_path.name == operation.new_filename:
+                    skipped_count += 1
+                    continue
+                # Execute rename
+                try:
+                    success = await workflow.execute_rename(operation, output_dir, dry_run)
+                    if success:
+                        if dry_run:
+                            console.print(
+                                f"[dim]Would rename: {operation.original_path.name} → "
+                                f"{operation.new_filename}[/dim]"
+                            )
+                        else:
+                            new_path = operation.create_new_path(output_dir)
+                            console.print(
+                                f"[green]✓[/green] {operation.original_path.name} → {new_path.name}"
+                            )
+                        renamed_count += 1
+                except Exception as e:
+                    console.print(
+                        f"[red]✗[/red] Failed to rename {operation.original_path.name}: {e}"
+                    )
+                    skipped_count += 1
+        asyncio.run(execute_renames())
+        console.print(f"\n[bold]Summary:[/bold] {renamed_count} renamed, {skipped_count} skipped")
+    else:
+        console.print("\n[bold yellow]Dry run mode - no files were renamed[/bold yellow]")
+        console.print("Run without --dry-run to apply changes")
+# Allow running this module directly as a script in addition to the installed entry point.
+if __name__ == "__main__":
+    app()