PyPI - pdf-file-renamer - Versions diffs - 0.6.1__py3-none-any.whl → 0.6.3__py3-none-any.whl - Mend

pdf-file-renamer 0.6.1__py3-none-any.whl → 0.6.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

pdf_file_renamer/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 """PDF Renamer - Intelligent PDF file renaming using LLMs."""
-__version__ = "0.6.1"
+__version__ = "0.6.2"

pdf_file_renamer/infrastructure/doi/pdf2doi_extractor.py CHANGED Viewed

@@ -1,7 +1,9 @@
 """DOI extraction using pdf2doi library."""
 import asyncio
+import contextlib
 import re
+from difflib import SequenceMatcher
 from pathlib import Path
 import pdf2doi
@@ -13,10 +15,18 @@ from pdf_file_renamer.domain.ports import DOIExtractor
 class PDF2DOIExtractor(DOIExtractor):
     """Extract DOI from PDF files using pdf2doi library."""
-    def __init__(self) -> None:
-        """Initialize the PDF2DOI extractor."""
+    def __init__(self, validate_match: bool = True, similarity_threshold: float = 0.3) -> None:
+        """
+        Initialize the PDF2DOI extractor.
+        Args:
+            validate_match: Whether to validate that DOI metadata matches PDF content
+            similarity_threshold: Minimum similarity score (0-1) for title validation
+        """
         # Suppress pdf2doi verbose output
         pdf2doi.config.set("verbose", False)
+        self.validate_match = validate_match
+        self.similarity_threshold = similarity_threshold
     async def extract_doi(self, pdf_path: Path) -> DOIMetadata | None:
         """
@@ -31,9 +41,7 @@ class PDF2DOIExtractor(DOIExtractor):
         try:
             # Run pdf2doi in executor to avoid blocking
             loop = asyncio.get_event_loop()
-            result = await loop.run_in_executor(
-                None, pdf2doi.pdf2doi, str(pdf_path)
-            )
+            result = await loop.run_in_executor(None, pdf2doi.pdf2doi, str(pdf_path))
             # pdf2doi returns a dict (not a list)
             if not result or not isinstance(result, dict):
@@ -56,28 +64,26 @@ class PDF2DOIExtractor(DOIExtractor):
             metadata = {}
             if validation_info:
-                try:
+                with contextlib.suppress(json.JSONDecodeError):
                     metadata = json.loads(validation_info)
-                except json.JSONDecodeError:
-                    pass
             # Extract title
             title = metadata.get("title")
             # Extract authors (list of dicts with 'given' and 'family' fields)
-            authors = None
+            authors: list[str] | None = None
             if "author" in metadata:
                 author_list = metadata["author"]
-                authors = []
+                author_names: list[str] = []
                 for author in author_list:
                     if isinstance(author, dict):
                         family = author.get("family", "")
                         given = author.get("given", "")
                         if family:
                             full_name = f"{given} {family}".strip() if given else family
-                            authors.append(full_name)
-                if not authors:
-                    authors = None
+                            author_names.append(full_name)
+                if author_names:
+                    authors = author_names
             # Extract year from published-online or published
             year = None
@@ -94,7 +100,7 @@ class PDF2DOIExtractor(DOIExtractor):
             # Extract publisher
             publisher = metadata.get("publisher")
-            return DOIMetadata(
+            doi_metadata = DOIMetadata(
                 doi=identifier,
                 title=title,
                 authors=authors,
@@ -104,6 +110,16 @@ class PDF2DOIExtractor(DOIExtractor):
                 raw_bibtex=validation_info if validation_info else None,
             )
+            # Validate that the DOI metadata matches the PDF content
+            if self.validate_match:
+                # Extract first page text from PDF to check for title match
+                pdf_text = await self._extract_pdf_first_page(pdf_path)
+                if not self._validate_doi_matches_pdf(doi_metadata, pdf_text):
+                    # DOI doesn't match - likely a citation DOI, not the paper's DOI
+                    return None
+            return doi_metadata
         except Exception:
             # Silently fail - DOI extraction is opportunistic
             return None
@@ -161,3 +177,120 @@ class PDF2DOIExtractor(DOIExtractor):
         ]
         return authors if authors else None
+    async def _extract_pdf_first_page(self, pdf_path: Path) -> str:
+        """
+        Extract text from the first page of a PDF.
+        Args:
+            pdf_path: Path to PDF file
+        Returns:
+            Text from first page (empty string if extraction fails)
+        """
+        try:
+            import fitz  # PyMuPDF
+            loop = asyncio.get_event_loop()
+            def extract() -> str:
+                with fitz.open(pdf_path) as doc:
+                    if len(doc) > 0:
+                        return doc[0].get_text()
+                return ""
+            return await loop.run_in_executor(None, extract)
+        except Exception:
+            return ""
+    def _validate_doi_matches_pdf(self, doi_metadata: DOIMetadata, pdf_text: str) -> bool:
+        """
+        Validate that DOI metadata matches the PDF content.
+        This checks if the title from the DOI metadata appears in the PDF text
+        (particularly the first page, where the title should be).
+        Args:
+            doi_metadata: DOI metadata to validate
+            pdf_text: Text from PDF first page (not full document!)
+        Returns:
+            True if metadata appears to match PDF, False otherwise
+        """
+        if not doi_metadata.title or not pdf_text:
+            # If we can't validate, assume it's valid (fail open)
+            return True
+        # Normalize text for comparison
+        pdf_text_lower = pdf_text.lower()
+        title_lower = doi_metadata.title.lower()
+        # Check if the full title appears in the PDF text
+        if title_lower in pdf_text_lower:
+            return True
+        # Check similarity using SequenceMatcher on first ~300 chars (title area)
+        # Most paper titles appear in the first few hundred characters
+        title_area = pdf_text_lower[:300]
+        similarity = SequenceMatcher(None, title_lower, title_area).ratio()
+        if similarity >= self.similarity_threshold:
+            return True
+        # Check if significant words from title appear in the title area ONLY
+        # This prevents matching citation DOIs from the references section
+        title_words = self._extract_significant_words(title_lower)
+        if not title_words:
+            return True  # Can't validate, fail open
+        # Require at least 70% of significant words to appear in the title area
+        matches = sum(1 for word in title_words if word in title_area)
+        match_ratio = matches / len(title_words)
+        return match_ratio >= 0.7
+    def _extract_significant_words(self, text: str) -> list[str]:
+        """
+        Extract significant words from text (removing common words).
+        Args:
+            text: Input text
+        Returns:
+            List of significant words
+        """
+        # Common words to skip
+        stop_words = {
+            "a",
+            "an",
+            "the",
+            "and",
+            "or",
+            "but",
+            "in",
+            "on",
+            "at",
+            "to",
+            "for",
+            "of",
+            "with",
+            "by",
+            "from",
+            "as",
+            "is",
+            "was",
+            "are",
+            "were",
+            "been",
+            "be",
+            "this",
+            "that",
+            "these",
+            "those",
+        }
+        # Extract words (alphanumeric only)
+        words = re.findall(r"\b\w+\b", text.lower())
+        # Filter stop words and short words
+        return [w for w in words if w not in stop_words and len(w) > 3]

pdf_file_renamer/presentation/formatters.py CHANGED Viewed

@@ -214,9 +214,7 @@ class ResultsTable:
                 reasoning = reasoning[:100] + "..."
             # Handle both enum and string confidence
             conf_str = (
-                op.confidence.value
-                if isinstance(op.confidence, ConfidenceLevel)
-                else op.confidence
+                op.confidence.value if isinstance(op.confidence, ConfidenceLevel) else op.confidence
             )
             table.add_row(
                 op.original_path.name,

pdf_file_renamer-0.6.3.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,444 @@
+Metadata-Version: 2.4
+Name: pdf-file-renamer
+Version: 0.6.3
+Summary: Intelligent PDF renaming using LLMs with DOI-based naming and interactive workflow
+Project-URL: Homepage, https://github.com/nostoslabs/pdf-renamer
+Project-URL: Repository, https://github.com/nostoslabs/pdf-renamer
+Project-URL: Issues, https://github.com/nostoslabs/pdf-renamer/issues
+Project-URL: Changelog, https://github.com/nostoslabs/pdf-renamer/blob/main/CHANGELOG.md
+Author-email: Nostos Labs <info@nostoslabs.com>
+License: MIT
+License-File: LICENSE
+Keywords: academic-papers,ai,automation,document-management,doi,file-organization,llm,pdf,rename
+Classifier: Development Status :: 4 - Beta
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Education
+Classifier: Intended Audience :: End Users/Desktop
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Office/Business :: Office Suites
+Classifier: Topic :: Scientific/Engineering
+Classifier: Topic :: Text Processing :: General
+Classifier: Topic :: Utilities
+Classifier: Typing :: Typed
+Requires-Python: >=3.11
+Requires-Dist: docling-core>=2.0.0
+Requires-Dist: docling-parse>=2.0.0
+Requires-Dist: pdf2doi>=1.7
+Requires-Dist: pydantic-ai>=1.0.17
+Requires-Dist: pydantic-settings>=2.7.1
+Requires-Dist: pydantic>=2.10.6
+Requires-Dist: pymupdf>=1.26.5
+Requires-Dist: python-dotenv>=1.1.1
+Requires-Dist: rich>=14.2.0
+Requires-Dist: tenacity>=9.0.0
+Requires-Dist: typer>=0.19.2
+Provides-Extra: dev
+Requires-Dist: mypy>=1.14.1; extra == 'dev'
+Requires-Dist: pytest-asyncio>=0.25.2; extra == 'dev'
+Requires-Dist: pytest-cov>=6.0.0; extra == 'dev'
+Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
+Requires-Dist: pytest>=8.3.4; extra == 'dev'
+Requires-Dist: ruff>=0.9.1; extra == 'dev'
+Description-Content-Type: text/markdown
+# PDF Renamer
+[![PyPI version](https://img.shields.io/pypi/v/pdf-file-renamer.svg)](https://pypi.org/project/pdf-file-renamer/)
+[![PyPI downloads](https://img.shields.io/pypi/dm/pdf-file-renamer.svg)](https://pypi.org/project/pdf-file-renamer/)
+[![Python](https://img.shields.io/pypi/pyversions/pdf-file-renamer.svg)](https://pypi.org/project/pdf-file-renamer/)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+[![CI](https://github.com/nostoslabs/pdf-renamer/workflows/CI/badge.svg)](https://github.com/nostoslabs/pdf-renamer/actions)
+[![Code style: ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
+[![Type checked: mypy](https://img.shields.io/badge/type%20checked-mypy-blue.svg)](http://mypy-lang.org/)
+**Intelligent PDF file renaming using LLMs and DOI metadata.** Automatically generate clean, descriptive filenames for your PDF library.
+> 🚀 Works with **OpenAI**, **Ollama**, **LM Studio**, and any OpenAI-compatible API
+> 📚 **DOI-first** approach for academic papers - no API costs!
+> 🎯 **Interactive mode** with retry, edit, and skip options
+## Table of Contents
+- [Quick Example](#quick-example)
+- [Features](#features)
+- [Installation](#installation)
+- [Configuration](#configuration)
+- [Usage](#usage)
+- [Interactive Mode](#interactive-mode)
+- [How It Works](#how-it-works)
+- [Cost Considerations](#cost-considerations)
+- [Architecture](#architecture)
+- [Development](#development)
+- [Contributing](#contributing)
+- [License](#license)
+## Quick Example
+![Demo](demo.gif)
+Transform messy filenames into clean, organized ones:
+```
+Before:                               After:
+📄 paper_final_v3.pdf          →     Leroux-Analog-In-memory-Computing-2025.pdf
+📄 download (2).pdf            →     Ruiz-Why-Don-Trace-Requirements-2023.pdf
+📄 document.pdf                →     Raspail-Camp_of_the_Saints.pdf
+```
+**Live Progress Display:**
+```
+Processing 3 PDFs with max 3 concurrent API calls and 10 concurrent extractions
+╭─────────────────────────── 📊 Progress ───────────────────────────╮
+│ Total: 3 | Pending: 0 | Extracting: 0 | Analyzing: 0 | Complete: 3 │
+╰───────────────────────────────────────────────────────────────────╯
+╭───────────────────────────────────────────────────────────────────╮
+│ [██████████████████████████████████████████████] 100.0%           │
+╰───────────────────────────────────────────────────────────────────╯
+                          Processing Status
+┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
+┃ File               ┃ Stage ┃ Status   ┃ Details             ┃
+┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
+│ paper_final_v3.pdf │   ✓   │ Complete │ very_high           │
+│ download (2).pdf   │   ✓   │ Complete │ very_high (DOI)     │
+│ document.pdf       │   ✓   │ Complete │ high                │
+└────────────────────┴───────┴──────────┴─────────────────────┘
+```
+## Features
+- **🎓 DOI-based naming** - Automatically extracts DOI and fetches authoritative metadata for academic papers
+- **🧠 Advanced PDF parsing** using docling-parse for better structure-aware extraction
+- **👁️ OCR fallback** for scanned PDFs with low text content
+- **🎯 Smart LLM prompting** with multi-pass analysis for improved accuracy
+- **⚡ Hybrid approach** - Uses DOI metadata when available, falls back to LLM analysis for other documents
+- **📝 Standardized format** - Generates filenames like `Author-Topic-Year.pdf`
+- **🔍 Dry-run mode** to preview changes before applying
+- **💬 Enhanced interactive mode** with options to accept, manually edit, retry, or skip each file
+- **📊 Live progress display** with concurrent processing for speed
+- **⚙️ Configurable concurrency** limits for API calls and PDF extraction
+- **📦 Batch processing** of multiple PDFs with optional output directory
+## Installation
+### Quick Start (No Installation Required)
+```bash
+# Run directly with uvx
+uvx pdf-renamer --dry-run /path/to/pdfs
+```
+### Install from PyPI
+```bash
+# Using pip
+pip install pdf-file-renamer
+# Using uv
+uv pip install pdf-file-renamer
+```
+### Install from Source
+```bash
+# Clone and install
+git clone https://github.com/nostoslabs/pdf-renamer.git
+cd pdf-renamer
+uv sync
+```
+## Configuration
+Configure your LLM provider:
+**Option A: OpenAI (Cloud)**
+```bash
+cp .env.example .env
+# Edit .env and add your OPENAI_API_KEY
+```
+**Option B: Ollama or other local models**
+```bash
+# No API key needed for local models
+# Either set LLM_BASE_URL in .env or use --url flag
+echo "LLM_BASE_URL=http://patmos:11434/v1" > .env
+```
+## Usage
+### Quick Start
+```bash
+# Preview renames (dry-run mode)
+pdf-renamer --dry-run /path/to/pdf/directory
+# Actually rename files
+pdf-renamer --no-dry-run /path/to/pdf/directory
+# Interactive mode - review each file
+pdf-renamer --interactive --no-dry-run /path/to/pdf/directory
+```
+### Using uvx (No Installation)
+```bash
+# Run directly without installing
+uvx pdf-renamer --dry-run /path/to/pdfs
+# Run from GitHub
+uvx https://github.com/nostoslabs/pdf-renamer --dry-run /path/to/pdfs
+```
+### Options
+- `--dry-run/--no-dry-run`: Show suggestions without renaming (default: True)
+- `--interactive, -i`: Interactive mode with rich options:
+  - **Accept** - Use the suggested filename
+  - **Edit** - Manually modify the filename
+  - **Retry** - Ask the LLM to generate a new suggestion
+  - **Skip** - Skip this file and move to the next
+- `--model`: Model to use (default: llama3.2, works with any OpenAI-compatible API)
+- `--url`: Custom base URL for OpenAI-compatible APIs (default: http://localhost:11434/v1)
+- `--pattern`: Glob pattern for files (default: *.pdf)
+- `--output-dir, -o`: Move renamed files to a different directory
+- `--max-concurrent-api`: Maximum concurrent API calls (default: 3)
+- `--max-concurrent-pdf`: Maximum concurrent PDF extractions (default: 10)
+### Examples
+**Using OpenAI:**
+```bash
+# Preview all PDFs in current directory
+uvx pdf-renamer --dry-run .
+# Rename PDFs in specific directory
+uvx pdf-renamer --no-dry-run ~/Documents/Papers
+# Use a different OpenAI model
+uvx pdf-renamer --model gpt-4o --dry-run .
+```
+**Using Ollama (or other local models):**
+```bash
+# Using Ollama on patmos server with gemma model
+uvx pdf-renamer --url http://patmos:11434/v1 --model gemma3:latest --dry-run .
+# Using local Ollama with qwen model
+uvx pdf-renamer --url http://localhost:11434/v1 --model qwen2.5 --dry-run .
+# Set URL in environment and just use model flag
+export LLM_BASE_URL=http://patmos:11434/v1
+uvx pdf-renamer --model gemma3:latest --dry-run .
+```
+**Other examples:**
+```bash
+# Process only specific files
+uvx pdf-renamer --pattern "*2020*.pdf" --dry-run .
+# Interactive mode with local model
+uvx pdf-renamer --url http://patmos:11434/v1 --model gemma3:latest --interactive --no-dry-run .
+# Run directly from GitHub
+uvx https://github.com/nostoslabs/pdf-renamer --no-dry-run ~/Documents/Papers
+```
+## Interactive Mode
+When using `--interactive` mode, you'll be presented with each file one at a time with detailed options:
+```
+================================================================================
+Original: 2024-research-paper.pdf
+Suggested: Smith-Machine-Learning-Applications-2024.pdf
+Confidence: high
+Reasoning: Clear author and topic identified from abstract
+================================================================================
+Options:
+  y / yes / Enter - Accept suggested name
+  e / edit - Manually edit the filename
+  r / retry - Ask LLM to generate a new suggestion
+  n / no / skip - Skip this file
+What would you like to do? [y]:
+```
+This mode is perfect for:
+- **Reviewing suggestions** before applying them
+- **Fine-tuning filenames** that are close but not quite right
+- **Retrying** when the LLM suggestion isn't good enough
+- **Building confidence** in the tool before batch processing
+You can use interactive mode with `--dry-run` to preview without actually renaming files, or with `--no-dry-run` to apply changes immediately after confirmation.
+## How It Works
+### Intelligent Hybrid Approach
+The tool uses a multi-strategy approach to generate accurate filenames:
+1. **DOI Detection** (for academic papers)
+   - Searches PDF for DOI identifiers using [pdf2doi](https://github.com/MicheleCotrufo/pdf2doi)
+   - **Validates DOI metadata** against PDF content to prevent citation DOI mismatches
+   - If found and validated, queries authoritative metadata (title, authors, year, journal)
+   - Generates filename with **very high confidence** from validated metadata
+   - **Saves API costs** - no LLM call needed for papers with DOIs
+2. **LLM Analysis** (fallback for non-academic PDFs)
+   - **Extract**: Uses docling-parse to read first 5 pages with structure-aware parsing, falls back to PyMuPDF if needed
+   - **OCR**: Automatically applies OCR for scanned PDFs with minimal text
+   - **Metadata Enhancement**: Extracts focused hints (years, emails, author sections) to supplement unreliable PDF metadata
+   - **Analyze**: Sends full content excerpt to LLM with enhanced metadata and detailed extraction instructions
+   - **Multi-pass Review**: Low-confidence results trigger a second analysis pass with focused prompts
+   - **Suggest**: LLM returns filename in `Author-Topic-Year` format with confidence level and reasoning
+3. **Interactive Review** (optional): User can accept, edit, retry, or skip each suggestion
+4. **Rename**: Applies suggestions (if not in dry-run mode)
+### Benefits of DOI Integration
+- **Accuracy**: DOI metadata is canonical and verified
+- **Speed**: Instant lookup vs. LLM processing time
+- **Cost**: Free DOI lookups save on API costs for academic papers
+- **Reliability**: Works even when PDF text extraction is poor
+## Cost Considerations
+**DOI-based Naming (Academic Papers):**
+- **Completely free** - No API costs
+- **No LLM needed** - Direct metadata lookup
+- Works for most academic papers with embedded DOIs
+**OpenAI (Fallback):**
+- Uses `gpt-4o-mini` by default (very cost-effective)
+- Only called when DOI not found
+- Processes first ~4500 characters per PDF
+- Typical cost: ~$0.001-0.003 per PDF
+**Ollama/Local Models:**
+- Completely free (runs on your hardware)
+- Works with any Ollama model (llama3, qwen2.5, mistral, etc.)
+- Also compatible with LM Studio, vLLM, and other OpenAI-compatible endpoints
+## Filename Format
+The tool generates filenames in this format:
+- `Smith-Kalman-Filtering-Applications-2020.pdf`
+- `Adamy-Electronic-Warfare-Modeling-Techniques.pdf`
+- `Blair-Monopulse-Processing-Unresolved-Targets.pdf`
+Guidelines:
+- First author's last name
+- 3-6 word topic description (prioritizes clarity over brevity)
+- Year (if identifiable)
+- Hyphens between words
+- Target ~80 characters (can be longer if needed for clarity)
+## Architecture
+This project follows **Clean Architecture** principles with clear separation of concerns:
+```
+src/pdf_file_renamer/
+├── domain/          # Core business logic (models, ports)
+├── application/     # Use cases and workflows
+├── infrastructure/  # External integrations (PDF, LLM, DOI)
+└── presentation/    # CLI and UI components
+```
+**Key Design Patterns:**
+- **Ports and Adapters** - Clean interfaces for external dependencies
+- **Dependency Injection** - Flexible component composition
+- **Single Responsibility** - Each module has one clear purpose
+- **Type Safety** - Full mypy strict mode compliance
+See [REFACTORING_SUMMARY.md](REFACTORING_SUMMARY.md) for detailed architecture notes.
+## Development
+### Setup
+```bash
+# Clone repository
+git clone https://github.com/nostoslabs/pdf-renamer.git
+cd pdf-renamer
+# Install dependencies with uv
+uv sync
+# Run tests
+uv run pytest
+# Run linting
+uv run ruff check src/ tests/
+# Run type checking
+uv run mypy src/
+```
+### Code Quality
+- **Tests**: pytest with async support and coverage reporting
+- **Linting**: ruff for fast, comprehensive linting
+- **Formatting**: ruff format for consistent code style
+- **Type Checking**: mypy in strict mode
+- **CI/CD**: GitHub Actions for automated testing and releases
+### Running Locally
+```bash
+# Run with local changes
+uv run pdf-file-renamer --dry-run /path/to/pdfs
+# Run specific module
+uv run python -m pdf_file_renamer.main --help
+```
+## Contributing
+Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
+### Development Workflow
+1. Fork the repository
+2. Create a feature branch (`git checkout -b feature/amazing-feature`)
+3. Make your changes
+4. Run tests and linting (`uv run pytest && uv run ruff check src/`)
+5. Commit your changes (`git commit -m 'Add amazing feature'`)
+6. Push to the branch (`git push origin feature/amazing-feature`)
+7. Open a Pull Request
+### Code Style
+- Follow PEP 8 (enforced by ruff)
+- Use type hints for all functions
+- Write tests for new features
+- Update documentation as needed
+## License
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+## Acknowledgments
+- [pdf2doi](https://github.com/MicheleCotrufo/pdf2doi) for DOI extraction
+- [pydantic-ai](https://ai.pydantic.dev/) for LLM integration
+- [docling-parse](https://github.com/DS4SD/docling-parse) for advanced PDF parsing
+- [PyMuPDF](https://pymupdf.readthedocs.io/) for PDF text extraction
+- [rich](https://rich.readthedocs.io/) for beautiful terminal UI
+## Support
+- **Issues**: [GitHub Issues](https://github.com/nostoslabs/pdf-renamer/issues)
+- **Discussions**: [GitHub Discussions](https://github.com/nostoslabs/pdf-renamer/discussions)
+- **Changelog**: [CHANGELOG.md](CHANGELOG.md)
+---
+**Made with ❤️ by [Nostos Labs](https://github.com/nostoslabs)**

{pdf_file_renamer-0.6.1.dist-info → pdf_file_renamer-0.6.3.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-pdf_file_renamer/__init__.py,sha256=1hyyq0EM6vqGG8Gxxdkg3MuLU_4Mwj3mc812ikutUB8,85
+pdf_file_renamer/__init__.py,sha256=ag2NG1Rry9SOlQHvUnNzrgujU5GkDJZ8Fh7FKCuSRNk,85
 pdf_file_renamer/main.py,sha256=FTEEb-9QmOOsN9SE8L1SZvFVIkVpQDy8xZ5a8t8CWUs,145
 pdf_file_renamer/application/__init__.py,sha256=riSV7UXBenkDst7Nnf11N1_RuRtM7wpKdwugxOhumS4,363
 pdf_file_renamer/application/filename_service.py,sha256=IbeCNBwyhFlCMCZveq16nmQ2qvyTdtgLmr6PDWPckOs,4868
@@ -10,7 +10,7 @@ pdf_file_renamer/domain/ports.py,sha256=ebOcHptiOK119NCmIwM32_fbRK5xkZP9K67vjL-4
 pdf_file_renamer/infrastructure/__init__.py,sha256=C3ZQ7WCPCa6PMfP00lu4wqb0r57GVyDdiD5EL2DhCeY,187
 pdf_file_renamer/infrastructure/config.py,sha256=baNL5_6_NNiS50ZNdql7fDwQbeAwf6f58HGYIWFQxQQ,2464
 pdf_file_renamer/infrastructure/doi/__init__.py,sha256=8N9ZEwfG7q5xomzh187YtP8t4CfEBHM334xNRblPeuI,153
-pdf_file_renamer/infrastructure/doi/pdf2doi_extractor.py,sha256=mK2Z5oOwN-TgiEHLgoLM5yCSe_-G9kWXLr4Sw3nMkEM,5105
+pdf_file_renamer/infrastructure/doi/pdf2doi_extractor.py,sha256=1tQ7fQF3TPxUZ7By9dzKz4LAfE8TPyjlvt8lACqGiLk,9551
 pdf_file_renamer/infrastructure/llm/__init__.py,sha256=ToB8__mHvXwaIukGKPEAQ8SeC4ZLiH4auZI1P1yH5PQ,159
 pdf_file_renamer/infrastructure/llm/pydantic_ai_provider.py,sha256=kVsmj0NIawkj-1WWM0hZXbsNH09GabVZm9HPlYsxGuo,9217
 pdf_file_renamer/infrastructure/pdf/__init__.py,sha256=uMHqxSXNLZH5WH_e1kXrp9m7uTqPkiI2hXjNo6rCRoo,368
@@ -19,9 +19,9 @@ pdf_file_renamer/infrastructure/pdf/docling_extractor.py,sha256=auZrJpK7mMg1mUXK
 pdf_file_renamer/infrastructure/pdf/pymupdf_extractor.py,sha256=C61udZCqGqiVx7T0HWNyjvnhgv5AgMIcCYtrhgHOJwk,5465
 pdf_file_renamer/presentation/__init__.py,sha256=1VR44GoPGTixk3hG5YzhGyQf7a4BTKsJBd2VP3rHcFM,211
 pdf_file_renamer/presentation/cli.py,sha256=0t_59-utRWLNCYjFetU0ZHoF1DPTjdNiWM9Au0jFaOg,8013
-pdf_file_renamer/presentation/formatters.py,sha256=Es7pZoHw5bEPtNfa_s43eHXa_m0yrTmX6S2aU78JUE0,8978
-pdf_file_renamer-0.6.1.dist-info/METADATA,sha256=OyZKW601xnQFXR-SDLakLEnasq5rtfP7YO6IYn6f-z4,9912
-pdf_file_renamer-0.6.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-pdf_file_renamer-0.6.1.dist-info/entry_points.txt,sha256=0fEGYy60chGE9rECWeCVPxjxzz6vMtIAYdFvmH7xzbw,63
-pdf_file_renamer-0.6.1.dist-info/licenses/LICENSE,sha256=_w08V08WgoMpDMlGNlkIatC5QfQ_Ds_rXOBM8pl7ffE,1068
-pdf_file_renamer-0.6.1.dist-info/RECORD,,
+pdf_file_renamer/presentation/formatters.py,sha256=8Vz95QupJKkPgPgRyMVmA_gxRWG5vfxdnSd7Czovlrg,8946
+pdf_file_renamer-0.6.3.dist-info/METADATA,sha256=ywxT5kRE2VGcv1HUuwvqrAeaVw7ksYsn6Y6MTa5hShA,16952
+pdf_file_renamer-0.6.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+pdf_file_renamer-0.6.3.dist-info/entry_points.txt,sha256=0fEGYy60chGE9rECWeCVPxjxzz6vMtIAYdFvmH7xzbw,63
+pdf_file_renamer-0.6.3.dist-info/licenses/LICENSE,sha256=_w08V08WgoMpDMlGNlkIatC5QfQ_Ds_rXOBM8pl7ffE,1068
+pdf_file_renamer-0.6.3.dist-info/RECORD,,

pdf_file_renamer-0.6.1.dist-info/METADATA DELETED Viewed

@@ -1,272 +0,0 @@
-Metadata-Version: 2.4
-Name: pdf-file-renamer
-Version: 0.6.1
-Summary: Intelligent PDF renaming using LLMs
-License-File: LICENSE
-Requires-Python: >=3.11
-Requires-Dist: docling-core>=2.0.0
-Requires-Dist: docling-parse>=2.0.0
-Requires-Dist: pdf2doi>=1.7
-Requires-Dist: pydantic-ai>=1.0.17
-Requires-Dist: pydantic-settings>=2.7.1
-Requires-Dist: pydantic>=2.10.6
-Requires-Dist: pymupdf>=1.26.5
-Requires-Dist: python-dotenv>=1.1.1
-Requires-Dist: rich>=14.2.0
-Requires-Dist: tenacity>=9.0.0
-Requires-Dist: typer>=0.19.2
-Provides-Extra: dev
-Requires-Dist: mypy>=1.14.1; extra == 'dev'
-Requires-Dist: pytest-asyncio>=0.25.2; extra == 'dev'
-Requires-Dist: pytest-cov>=6.0.0; extra == 'dev'
-Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
-Requires-Dist: pytest>=8.3.4; extra == 'dev'
-Requires-Dist: ruff>=0.9.1; extra == 'dev'
-Description-Content-Type: text/markdown
-# PDF Renamer
-[![PyPI version](https://img.shields.io/pypi/v/pdf-file-renamer.svg)](https://pypi.org/project/pdf-file-renamer/)
-[![Python](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
-[![uv](https://img.shields.io/badge/uv-0.5+-orange.svg)](https://docs.astral.sh/uv/)
-[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
-[![pydantic-ai](https://img.shields.io/badge/pydantic--ai-1.0+-green.svg)](https://ai.pydantic.dev/)
-[![GitHub](https://img.shields.io/badge/github-nostoslabs%2Fpdf--renamer-blue?logo=github)](https://github.com/nostoslabs/pdf-renamer)
-[![Tests](https://img.shields.io/badge/tests-passing-brightgreen.svg)](https://github.com/nostoslabs/pdf-renamer)
-[![Code style: ruff](https://img.shields.io/badge/code%20style-ruff-000000.svg)](https://github.com/astral-sh/ruff)
-[![Type checked: mypy](https://img.shields.io/badge/type%20checked-mypy-blue.svg)](http://mypy-lang.org/)
-[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
-Intelligent PDF file renaming using LLMs. This tool analyzes PDF content and metadata to suggest descriptive, standardized filenames.
-> 🚀 Works with **OpenAI**, **Ollama**, **LM Studio**, and any OpenAI-compatible API
-## Features
-- **DOI-based naming** - Automatically extracts DOI and fetches authoritative metadata for academic papers
-- **Advanced PDF parsing** using docling-parse for better structure-aware extraction
-- **OCR fallback** for scanned PDFs with low text content
-- **Smart LLM prompting** with multi-pass analysis for improved accuracy
-- **Hybrid approach** - Uses DOI metadata when available, falls back to LLM analysis for other documents
-- Suggests filenames in the format: `Author-Topic-Year.pdf`
-- Dry-run mode to preview changes before applying
-- **Enhanced interactive mode** with options to accept, manually edit, retry, or skip each file
-- **Live progress display** with concurrent processing for speed
-- **Configurable concurrency** limits for API calls and PDF extraction
-- Batch processing of multiple PDFs with optional output directory
-## Installation
-### Quick Start (No Installation Required)
-```bash
-# Run directly with uvx
-uvx pdf-renamer --dry-run /path/to/pdfs
-```
-### Install from PyPI
-```bash
-# Using pip
-pip install pdf-file-renamer
-# Using uv
-uv pip install pdf-file-renamer
-```
-### Install from Source
-```bash
-# Clone and install
-git clone https://github.com/nostoslabs/pdf-renamer.git
-cd pdf-renamer
-uv sync
-```
-## Configuration
-Configure your LLM provider:
-**Option A: OpenAI (Cloud)**
-```bash
-cp .env.example .env
-# Edit .env and add your OPENAI_API_KEY
-```
-**Option B: Ollama or other local models**
-```bash
-# No API key needed for local models
-# Either set LLM_BASE_URL in .env or use --url flag
-echo "LLM_BASE_URL=http://patmos:11434/v1" > .env
-```
-## Usage
-### Quick Start
-```bash
-# Preview renames (dry-run mode)
-pdf-renamer --dry-run /path/to/pdf/directory
-# Actually rename files
-pdf-renamer --no-dry-run /path/to/pdf/directory
-# Interactive mode - review each file
-pdf-renamer --interactive --no-dry-run /path/to/pdf/directory
-```
-### Using uvx (No Installation)
-```bash
-# Run directly without installing
-uvx pdf-renamer --dry-run /path/to/pdfs
-# Run from GitHub
-uvx https://github.com/nostoslabs/pdf-renamer --dry-run /path/to/pdfs
-```
-### Options
-- `--dry-run/--no-dry-run`: Show suggestions without renaming (default: `--dry-run`, i.e. dry-run is enabled)
-- `--interactive, -i`: Interactive mode with rich options:
-  - **Accept** - Use the suggested filename
-  - **Edit** - Manually modify the filename
-  - **Retry** - Ask the LLM to generate a new suggestion
-  - **Skip** - Skip this file and move to the next
-- `--model`: Model to use (default: llama3.2, works with any OpenAI-compatible API)
-- `--url`: Custom base URL for OpenAI-compatible APIs (default: http://localhost:11434/v1)
-- `--pattern`: Glob pattern for files (default: *.pdf)
-- `--output-dir, -o`: Move renamed files to a different directory
-- `--max-concurrent-api`: Maximum concurrent API calls (default: 3)
-- `--max-concurrent-pdf`: Maximum concurrent PDF extractions (default: 10)
-### Examples
-**Using OpenAI:**
-```bash
-# Preview all PDFs in current directory
-uvx pdf-renamer --dry-run .
-# Rename PDFs in specific directory
-uvx pdf-renamer --no-dry-run ~/Documents/Papers
-# Use a different OpenAI model
-uvx pdf-renamer --model gpt-4o --dry-run .
-```
-**Using Ollama (or other local models):**
-```bash
-# Using Ollama on patmos server with gemma model
-uvx pdf-renamer --url http://patmos:11434/v1 --model gemma3:latest --dry-run .
-# Using local Ollama with qwen model
-uvx pdf-renamer --url http://localhost:11434/v1 --model qwen2.5 --dry-run .
-# Set URL in environment and just use model flag
-export LLM_BASE_URL=http://patmos:11434/v1
-uvx pdf-renamer --model gemma3:latest --dry-run .
-```
-**Other examples:**
-```bash
-# Process only specific files
-uvx pdf-renamer --pattern "*2020*.pdf" --dry-run .
-# Interactive mode with local model
-uvx pdf-renamer --url http://patmos:11434/v1 --model gemma3:latest --interactive --no-dry-run .
-# Run directly from GitHub
-uvx https://github.com/nostoslabs/pdf-renamer --no-dry-run ~/Documents/Papers
-```
-## Interactive Mode
-When using `--interactive` mode, you'll be presented with each file one at a time with detailed options:
-```
-================================================================================
-Original: 2024-research-paper.pdf
-Suggested: Smith-Machine-Learning-Applications-2024.pdf
-Confidence: high
-Reasoning: Clear author and topic identified from abstract
-================================================================================
-Options:
-  y / yes / Enter - Accept suggested name
-  e / edit - Manually edit the filename
-  r / retry - Ask LLM to generate a new suggestion
-  n / no / skip - Skip this file
-What would you like to do? [y]:
-```
-This mode is perfect for:
-- **Reviewing suggestions** before applying them
-- **Fine-tuning filenames** that are close but not quite right
-- **Retrying** when the LLM suggestion isn't good enough
-- **Building confidence** in the tool before batch processing
-You can use interactive mode with `--dry-run` to preview without actually renaming files, or with `--no-dry-run` to apply changes immediately after confirmation.
-## How It Works
-### Intelligent Hybrid Approach
-The tool uses a multi-strategy approach to generate accurate filenames:
-1. **DOI Detection** (for academic papers)
-   - Searches PDF for DOI identifiers using [pdf2doi](https://github.com/MicheleCotrufo/pdf2doi)
-   - If found, queries authoritative metadata (title, authors, year, journal)
-   - Generates filename with **very high confidence** from validated metadata
-   - **Saves API costs** - no LLM call needed for papers with DOIs
-2. **LLM Analysis** (fallback for non-academic PDFs)
-   - **Extract**: Uses docling-parse to read first 5 pages with structure-aware parsing, falls back to PyMuPDF if needed
-   - **OCR**: Automatically applies OCR for scanned PDFs with minimal text
-   - **Metadata Enhancement**: Extracts focused hints (years, emails, author sections) to supplement unreliable PDF metadata
-   - **Analyze**: Sends full content excerpt to LLM with enhanced metadata and detailed extraction instructions
-   - **Multi-pass Review**: Low-confidence results trigger a second analysis pass with focused prompts
-   - **Suggest**: LLM returns filename in `Author-Topic-Year` format with confidence level and reasoning
-3. **Interactive Review** (optional): User can accept, edit, retry, or skip each suggestion
-4. **Rename**: Applies suggestions (if not in dry-run mode)
-### Benefits of DOI Integration
-- **Accuracy**: DOI metadata is canonical and verified
-- **Speed**: Instant lookup vs. LLM processing time
-- **Cost**: Free DOI lookups save on API costs for academic papers
-- **Reliability**: Works even when PDF text extraction is poor
-## Cost Considerations
-**DOI-based Naming (Academic Papers):**
-- **Completely free** - No API costs
-- **No LLM needed** - Direct metadata lookup
-- Works for most academic papers with embedded DOIs
-**OpenAI (Fallback):**
-- Uses `gpt-4o-mini` by default (very cost-effective)
-- Only called when no DOI is found
-- Processes the first ~4500 characters per PDF
-- Typical cost: ~$0.001-0.003 per PDF
-**Ollama/Local Models:**
-- Completely free (runs on your hardware)
-- Works with any Ollama model (llama3, qwen2.5, mistral, etc.)
-- Also compatible with LM Studio, vLLM, and other OpenAI-compatible endpoints
-## Filename Format
-The tool generates filenames in this format:
-- `Smith-Kalman-Filtering-Applications-2020.pdf`
-- `Adamy-Electronic-Warfare-Modeling-Techniques.pdf`
-- `Blair-Monopulse-Processing-Unresolved-Targets.pdf`
-Guidelines:
-- First author's last name
-- 3-6 word topic description (prioritizes clarity over brevity)
-- Year (if identifiable)
-- Hyphens between words
-- Target ~80 characters (can be longer if needed for clarity)

{pdf_file_renamer-0.6.1.dist-info → pdf_file_renamer-0.6.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{pdf_file_renamer-0.6.1.dist-info → pdf_file_renamer-0.6.3.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{pdf_file_renamer-0.6.1.dist-info → pdf_file_renamer-0.6.3.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

pdf-file-renamer 0.6.1__py3-none-any.whl → 0.6.3__py3-none-any.whl

pdf-file-renamer 0.6.1py3-none-any.whl → 0.6.3py3-none-any.whl