pdf-file-renamer 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,116 @@
1
+ """Docling-based PDF extractor for structure-aware text extraction."""
2
+
3
+ import re
4
+ from pathlib import Path
5
+
6
+ from docling_core.types.doc.page import TextCellUnit
7
+ from docling_parse.pdf_parser import DoclingPdfParser
8
+
9
+ from pdf_renamer.domain.models import PDFContent, PDFMetadata
10
+ from pdf_renamer.domain.ports import PDFExtractor
11
+
12
+
13
class DoclingPDFExtractor(PDFExtractor):
    """PDF extractor using docling-parse for better structure-aware extraction.

    Text is read line by line from at most ``max_pages`` pages and capped at
    ``max_chars`` characters. Metadata hints (years, e-mail addresses, author
    phrases) are derived from the extracted text because no document-level
    metadata is read from docling-parse here.
    """

    # How much of the extracted text each heuristic looks at.
    _HEADER_CHARS = 500
    _SCAN_CHARS = 2000
    _AUTHOR_HINT_CHARS = 100

    # Four-digit years between 1900 and 2099.
    _YEAR_PATTERN = re.compile(r"\b(19\d{2}|20\d{2})\b")
    # The TLD class is [A-Za-z]; a "|" inside a character class is a literal pipe.
    _EMAIL_PATTERN = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
    # One pattern per indicator, kept in the original priority order. The leading
    # word boundary stops "by " from matching inside words such as "nearby ".
    _AUTHOR_PATTERNS = tuple(
        re.compile(rf"\b{re.escape(indicator)}", re.IGNORECASE)
        for indicator in ("by ", "author:", "authors:", "written by")
    )

    def __init__(self, max_pages: int = 5, max_chars: int = 8000) -> None:
        """
        Initialize the Docling PDF extractor.

        Args:
            max_pages: Maximum pages to extract
            max_chars: Maximum characters to extract
        """
        self.max_pages = max_pages
        self.max_chars = max_chars
        self._parser = DoclingPdfParser()

    async def extract(self, pdf_path: Path) -> PDFContent:
        """
        Extract text and metadata from PDF using docling-parse.

        Args:
            pdf_path: Path to PDF file

        Returns:
            PDFContent with extracted text and metadata. ``page_count`` is the
            total number of pages reported by the parser, not the number of
            pages that were read.

        Raises:
            RuntimeError: If extraction fails
        """
        try:
            pdf_doc = self._parser.load(path_or_stream=str(pdf_path))

            text_parts: list[str] = []
            total_chars = 0
            pages_read = 0

            if self.max_pages > 0 and self.max_chars > 0:
                # Count pages ourselves rather than comparing against the page
                # number the parser yields: that number appears to be 1-based,
                # so `page_no >= max_pages` stopped one page early.
                for _page_no, pred_page in pdf_doc.iterate_pages():
                    # Extract text at line level for better structure preservation
                    page_text = "\n".join(
                        line.text
                        for line in pred_page.iterate_cells(unit_type=TextCellUnit.LINE)
                    )

                    chunk = page_text[: self.max_chars - total_chars]
                    text_parts.append(chunk)
                    total_chars += len(chunk)
                    pages_read += 1

                    # Stop before the iterator loads a page we would discard.
                    if pages_read >= self.max_pages or total_chars >= self.max_chars:
                        break

            # The "\n" separators are not counted above, so enforce the cap again.
            extracted_text = "\n".join(text_parts)[: self.max_chars].strip()

            # Extract metadata using separate method
            metadata = await self._extract_metadata(pdf_path, extracted_text)

            page_count = pdf_doc.number_of_pages()

            return PDFContent(text=extracted_text, metadata=metadata, page_count=page_count)

        except Exception as e:
            msg = f"Failed to extract text from {pdf_path} using docling-parse: {e}"
            raise RuntimeError(msg) from e

    async def _extract_metadata(self, pdf_path: Path, text: str) -> PDFMetadata:
        """
        Extract metadata from PDF.

        Args:
            pdf_path: Path to PDF file (unused; kept for a uniform signature)
            text: Extracted text content

        Returns:
            PDFMetadata holding the first 500 characters as ``header_text`` plus
            up to three year hints, three e-mail hints and two author hints.
            Hint fields are ``None`` when nothing was found.
        """
        # Note: docling-parse doesn't provide document-level metadata
        # So we extract focused metadata from the text content
        header_text = text[: self._HEADER_CHARS] if text else ""

        years = self._YEAR_PATTERN.findall(header_text)
        emails = self._EMAIL_PATTERN.findall(text[: self._SCAN_CHARS])

        # Search the original text case-insensitively. Indexing the original
        # with an offset found in a lowercased copy is unsafe because lower()
        # can change the length of some Unicode strings.
        author_hints: list[str] = []
        for pattern in self._AUTHOR_PATTERNS:
            match = pattern.search(text, 0, self._SCAN_CHARS)
            if match:
                start = match.start()
                author_hints.append(text[start : start + self._AUTHOR_HINT_CHARS])

        return PDFMetadata(
            header_text=header_text,
            year_hints=years[:3] if years else None,
            email_hints=emails[:3] if emails else None,
            author_hints=author_hints[:2] if author_hints else None,
        )
@@ -0,0 +1,165 @@
1
+ """PyMuPDF-based PDF extractor with metadata support and OCR fallback."""
2
+
3
+ import re
4
+ from pathlib import Path
5
+
6
+ import pymupdf
7
+
8
+ from pdf_renamer.domain.models import PDFContent, PDFMetadata
9
+ from pdf_renamer.domain.ports import PDFExtractor
10
+
11
+
12
class PyMuPDFExtractor(PDFExtractor):
    """PDF extractor using PyMuPDF with metadata and OCR support.

    Plain text is read from at most ``max_pages`` pages and capped at
    ``max_chars`` characters. When that yields fewer than 200 characters and
    OCR is enabled, the pages are run through PyMuPDF's Tesseract integration.
    """

    # Below this many extracted characters the PDF is treated as scanned.
    _OCR_TRIGGER_CHARS = 200
    # Below this many characters an OCR page result is considered empty.
    _OCR_MIN_PAGE_CHARS = 50
    # Rendering resolution handed to Tesseract.
    _OCR_DPI = 300

    # How much of the extracted text each heuristic looks at.
    _HEADER_CHARS = 500
    _SCAN_CHARS = 2000
    _AUTHOR_HINT_CHARS = 100

    # Four-digit years between 1900 and 2099.
    _YEAR_PATTERN = re.compile(r"\b(19\d{2}|20\d{2})\b")
    # The TLD class is [A-Za-z]; a "|" inside a character class is a literal pipe.
    _EMAIL_PATTERN = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
    # One pattern per indicator, kept in the original priority order. The leading
    # word boundary stops "by " from matching inside words such as "nearby ".
    _AUTHOR_PATTERNS = tuple(
        re.compile(rf"\b{re.escape(indicator)}", re.IGNORECASE)
        for indicator in ("by ", "author:", "authors:", "written by")
    )

    def __init__(self, max_pages: int = 5, max_chars: int = 8000, enable_ocr: bool = True) -> None:
        """
        Initialize the PyMuPDF extractor.

        Args:
            max_pages: Maximum pages to extract
            max_chars: Maximum characters to extract
            enable_ocr: Enable OCR for scanned PDFs
        """
        self.max_pages = max_pages
        self.max_chars = max_chars
        self.enable_ocr = enable_ocr

    async def extract(self, pdf_path: Path) -> PDFContent:
        """
        Extract text and metadata from PDF using PyMuPDF.

        Args:
            pdf_path: Path to PDF file

        Returns:
            PDFContent with extracted text and metadata

        Raises:
            RuntimeError: If extraction fails
        """
        try:
            # The context manager closes the document even when extraction fails.
            with pymupdf.open(pdf_path) as doc:
                text_parts: list[str] = []
                total_chars = 0

                for page_num in range(min(self.max_pages, len(doc))):
                    # Add page text until we hit the character limit
                    remaining_chars = self.max_chars - total_chars
                    if remaining_chars <= 0:
                        break

                    chunk = doc[page_num].get_text()[:remaining_chars]
                    text_parts.append(chunk)
                    total_chars += len(chunk)

                extracted_text = "\n".join(text_parts).strip()

                # If very little text and OCR enabled, try OCR. Keep the OCR
                # result only when it actually recovered more text.
                if len(extracted_text) < self._OCR_TRIGGER_CHARS and self.enable_ocr:
                    ocr_text = await self._extract_with_ocr(pdf_path, doc)
                    if len(ocr_text) > len(extracted_text):
                        extracted_text = ocr_text

                metadata = await self._extract_metadata(pdf_path, doc, extracted_text)
                page_count = len(doc)

                return PDFContent(text=extracted_text, metadata=metadata, page_count=page_count)

        except Exception as e:
            msg = f"Failed to extract text from {pdf_path} using PyMuPDF: {e}"
            raise RuntimeError(msg) from e

    async def _extract_with_ocr(self, pdf_path: Path, doc: pymupdf.Document) -> str:
        """
        Extract text using OCR for scanned PDFs.

        Args:
            pdf_path: Path to PDF file (unused; kept for a uniform signature)
            doc: PyMuPDF document

        Returns:
            Extracted text, limited to ``max_pages`` pages and ``max_chars``
            characters. Pages where OCR is unavailable or fails fall back to
            plain text extraction.
        """
        text_parts: list[str] = []
        total_chars = 0

        for page_num in range(min(self.max_pages, len(doc))):
            # Check the budget first so no OCR work is spent on discarded text.
            remaining_chars = self.max_chars - total_chars
            if remaining_chars <= 0:
                break

            page = doc[page_num]

            try:
                # get_textpage_ocr() renders the page and runs Tesseract on it;
                # plain get_textpage() only reads the existing text layer.
                tp = page.get_textpage_ocr(flags=0, dpi=self._OCR_DPI, full=True)
                page_text = page.get_text("text", textpage=tp)

                # If still no text, try with flags
                if not page_text or len(page_text.strip()) < self._OCR_MIN_PAGE_CHARS:
                    page_text = page.get_text("text", flags=pymupdf.TEXT_PRESERVE_WHITESPACE)
            except Exception:
                # Deliberate best effort: OCR is optional (Tesseract may not be
                # installed), so keep whatever text is available.
                page_text = page.get_text()

            chunk = page_text[:remaining_chars]
            text_parts.append(chunk)
            total_chars += len(chunk)

        return "\n".join(text_parts).strip()

    async def _extract_metadata(
        self, pdf_path: Path, doc: pymupdf.Document, text: str
    ) -> PDFMetadata:
        """
        Extract metadata from PDF.

        Args:
            pdf_path: Path to PDF file (unused; kept for a uniform signature)
            doc: PyMuPDF document
            text: Extracted text content

        Returns:
            PDFMetadata combining the document's own metadata dictionary with
            hints derived from the text: the first 500 characters as
            ``header_text`` plus up to three year hints, three e-mail hints and
            two author hints. Hint fields are ``None`` when nothing was found.
        """
        # Get PDF metadata
        meta = doc.metadata or {}

        # Extract focused metadata from text
        header_text = text[: self._HEADER_CHARS] if text else ""

        years = self._YEAR_PATTERN.findall(header_text)
        emails = self._EMAIL_PATTERN.findall(text[: self._SCAN_CHARS])

        # Search the original text case-insensitively. Indexing the original
        # with an offset found in a lowercased copy is unsafe because lower()
        # can change the length of some Unicode strings.
        author_hints: list[str] = []
        for pattern in self._AUTHOR_PATTERNS:
            match = pattern.search(text, 0, self._SCAN_CHARS)
            if match:
                start = match.start()
                author_hints.append(text[start : start + self._AUTHOR_HINT_CHARS])

        return PDFMetadata(
            title=meta.get("title"),
            author=meta.get("author"),
            subject=meta.get("subject"),
            keywords=meta.get("keywords"),
            creator=meta.get("creator"),
            producer=meta.get("producer"),
            creation_date=meta.get("creationDate"),
            modification_date=meta.get("modDate"),
            header_text=header_text,
            year_hints=years[:3] if years else None,
            email_hints=emails[:3] if emails else None,
            author_hints=author_hints[:2] if author_hints else None,
        )
pdf_renamer/main.py ADDED
@@ -0,0 +1,6 @@
1
+ """Main entry point for the PDF renamer application."""
2
+
3
+ from pdf_renamer.presentation.cli import app
4
+
5
# Start the Typer application only when this module is run as a script.
if __name__ == "__main__":
    app()
@@ -0,0 +1,6 @@
1
+ """Presentation layer - CLI and user interaction."""
2
+
3
+ from pdf_renamer.presentation.cli import app
4
+ from pdf_renamer.presentation.formatters import ProgressDisplay
5
+
6
# Names re-exported by the presentation package.
__all__ = [
    "ProgressDisplay",
    "app",
]
@@ -0,0 +1,233 @@
1
+ """CLI interface using Typer."""
2
+
3
+ import asyncio
4
+ import contextlib
5
+ from pathlib import Path
6
+ from typing import Annotated
7
+
8
+ import typer
9
+ from rich.console import Console
10
+ from rich.live import Live
11
+
12
+ from pdf_renamer.application import (
13
+ FilenameService,
14
+ PDFRenameWorkflow,
15
+ RenameService,
16
+ )
17
+ from pdf_renamer.infrastructure.config import Settings
18
+ from pdf_renamer.infrastructure.llm import PydanticAIProvider
19
+ from pdf_renamer.infrastructure.pdf import (
20
+ CompositePDFExtractor,
21
+ DoclingPDFExtractor,
22
+ PyMuPDFExtractor,
23
+ )
24
+ from pdf_renamer.presentation.formatters import (
25
+ InteractivePrompt,
26
+ ProgressDisplay,
27
+ ResultsTable,
28
+ )
29
+
30
# Typer application object; commands are registered on it with @app.command().
app = typer.Typer(help="Intelligent PDF renaming using LLMs")

# Rich console used for all terminal output in this module.
console = Console()
32
+
33
+
34
def create_workflow(settings: Settings) -> PDFRenameWorkflow:
    """
    Build a PDFRenameWorkflow and every collaborator it needs.

    This function is the composition root: all concrete implementations are
    instantiated here and handed to the workflow.

    Args:
        settings: Application settings

    Returns:
        Configured PDFRenameWorkflow
    """
    # Both extractors share the same page and character limits.
    extraction_limits = {
        "max_pages": settings.pdf_max_pages,
        "max_chars": settings.pdf_max_chars,
    }
    docling_extractor = DoclingPDFExtractor(**extraction_limits)
    pymupdf_extractor = PyMuPDFExtractor(**extraction_limits, enable_ocr=True)

    # Docling is listed first, PyMuPDF second.
    composite_extractor = CompositePDFExtractor([docling_extractor, pymupdf_extractor])

    provider = PydanticAIProvider(
        model_name=settings.llm_model,
        api_key=settings.openai_api_key,
        base_url=settings.llm_base_url,
        retry_max_attempts=settings.retry_max_attempts,
        retry_min_wait=settings.retry_min_wait,
        retry_max_wait=settings.retry_max_wait,
    )

    name_generator = FilenameService(provider)
    renamer = RenameService()

    workflow = PDFRenameWorkflow(
        pdf_extractor=composite_extractor,
        filename_generator=name_generator,
        file_renamer=renamer,
        max_concurrent_api=settings.max_concurrent_api,
        max_concurrent_pdf=settings.max_concurrent_pdf,
    )
    return workflow
79
+
80
+
81
@app.command()
def main(
    directory: Annotated[
        Path, typer.Argument(help="Directory containing PDF files to rename")
    ] = Path.cwd(),
    dry_run: Annotated[
        bool, typer.Option("--dry-run/--no-dry-run", help="Show suggestions without renaming")
    ] = True,
    model: Annotated[
        str | None,
        typer.Option("--model", help="Model to use (overrides config)"),
    ] = None,
    url: Annotated[
        str | None,
        typer.Option("--url", help="Custom base URL for OpenAI-compatible APIs"),
    ] = None,
    interactive: Annotated[
        bool, typer.Option("--interactive", "-i", help="Confirm each rename")
    ] = False,
    pattern: Annotated[str, typer.Option("--pattern", help="Glob pattern for PDF files")] = "*.pdf",
    output_dir: Annotated[
        Path | None,
        typer.Option("--output-dir", "-o", help="Move renamed files to this directory"),
    ] = None,
) -> None:
    """Rename PDF files in a directory using LLM-generated suggestions."""
    # Load settings
    settings = Settings()

    # Override settings from CLI args
    if model:
        settings.llm_model = model
    if url:
        settings.llm_base_url = url

    # Validate input directory up front so a typo is reported as an error
    # instead of a misleading "no files found" message.
    if not directory.is_dir():
        console.print(f"[red]Error: {directory} is not a directory[/red]")
        raise typer.Exit(1)

    # Validate output directory. The check must come before mkdir():
    # mkdir(exist_ok=True) raises FileExistsError when the path is a file.
    if output_dir:
        if output_dir.exists() and not output_dir.is_dir():
            console.print(f"[red]Error: {output_dir} is not a directory[/red]")
            raise typer.Exit(1)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Find PDF files; a glob can also match directories, which are skipped.
    pdf_files = sorted(path for path in directory.glob(pattern) if path.is_file())
    if not pdf_files:
        console.print(f"[yellow]No PDF files found matching '{pattern}' in {directory}[/yellow]")
        raise typer.Exit(0)

    console.print(f"Found {len(pdf_files)} PDF files to process\n")

    # Create workflow
    workflow = create_workflow(settings)

    # Process files with progress display
    async def process_all() -> list:
        progress = ProgressDisplay(console, len(pdf_files))

        def status_callback(filename: str, status: dict[str, str]) -> None:
            progress.update_status(filename, status)

        # Run with live display
        with Live(progress.create_display(), console=console, refresh_per_second=4) as live:

            async def update_display() -> None:
                while True:
                    live.update(progress.create_display())
                    await asyncio.sleep(0.25)

            display_task = asyncio.create_task(update_display())

            try:
                results = await workflow.process_batch(pdf_files, status_callback)
            finally:
                # Always stop the refresher, even when the batch fails, so it
                # does not keep updating a Live display that is being closed.
                display_task.cancel()
                with contextlib.suppress(asyncio.CancelledError):
                    await display_task

            # Render the final state once the refresher has stopped.
            live.update(progress.create_display())

        return results

    # Run processing
    console.print(
        f"[bold]Processing {len(pdf_files)} PDFs with max {settings.max_concurrent_api} "
        f"concurrent API calls and {settings.max_concurrent_pdf} concurrent extractions[/bold]\n"
    )
    results = asyncio.run(process_all())

    # Filter successful operations
    operations = [r for r in results if r is not None]

    if not operations:
        console.print("[red]No files could be processed successfully[/red]")
        raise typer.Exit(1)

    # Display results (if not interactive)
    if not interactive:
        ResultsTable.create(operations, console)

    # Execute renames. Interactive mode walks through every operation even
    # during a dry run; dry_run is forwarded to the workflow in that case.
    if not dry_run or interactive:
        renamed_count = 0
        skipped_count = 0

        async def execute_renames() -> None:
            nonlocal renamed_count, skipped_count

            prompt = InteractivePrompt(console) if interactive else None

            for operation in operations:
                # Interactive mode
                if interactive and prompt:
                    final_name, should_rename = await prompt.prompt_for_action(operation)
                    if not should_rename:
                        skipped_count += 1
                        continue
                    # Update operation with user's choice
                    operation.suggested_filename = final_name

                # Skip if no change
                if not output_dir and operation.original_path.name == operation.new_filename:
                    skipped_count += 1
                    continue

                # Execute rename
                try:
                    success = await workflow.execute_rename(operation, output_dir, dry_run)
                    if success:
                        if dry_run:
                            console.print(
                                f"[dim]Would rename: {operation.original_path.name} → "
                                f"{operation.new_filename}[/dim]"
                            )
                        else:
                            new_path = operation.create_new_path(output_dir)
                            console.print(
                                f"[green]✓[/green] {operation.original_path.name} → {new_path.name}"
                            )
                        renamed_count += 1
                except Exception as e:
                    # One failed rename must not abort the rest of the batch.
                    console.print(
                        f"[red]✗[/red] Failed to rename {operation.original_path.name}: {e}"
                    )
                    skipped_count += 1

        asyncio.run(execute_renames())
        console.print(f"\n[bold]Summary:[/bold] {renamed_count} renamed, {skipped_count} skipped")
    else:
        console.print("\n[bold yellow]Dry run mode - no files were renamed[/bold yellow]")
        console.print("Run without --dry-run to apply changes")
230
+
231
+
232
# Allow running the CLI module directly as a script.
if __name__ == "__main__":
    app()