pdf-file-renamer 0.4.2 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- /dev/null
+++ pdf_renamer/domain/models.py
@@ -0,0 +1,80 @@
+ """Domain models - core business entities."""
+
+ from dataclasses import dataclass
+ from enum import Enum
+ from pathlib import Path
+
+ from pydantic import BaseModel, Field
+
+
+ class ConfidenceLevel(str, Enum):
+     """Confidence level for filename suggestions."""
+
+     HIGH = "high"
+     MEDIUM = "medium"
+     LOW = "low"
+     ERROR = "error"
+
+
+ class FilenameResult(BaseModel):
+     """Result of filename generation."""
+
+     model_config = {"use_enum_values": True}
+
+     filename: str = Field(description="Suggested filename without extension")
+     confidence: ConfidenceLevel = Field(description="Confidence level of the suggestion")
+     reasoning: str = Field(description="Explanation of why this filename was chosen")
+
+
+ @dataclass(frozen=True)
+ class PDFMetadata:
+     """Metadata extracted from PDF."""
+
+     title: str | None = None
+     author: str | None = None
+     subject: str | None = None
+     keywords: str | None = None
+     creator: str | None = None
+     producer: str | None = None
+     creation_date: str | None = None
+     modification_date: str | None = None
+     # Focused metadata extracted from document content
+     header_text: str | None = None
+     year_hints: list[str] | None = None
+     email_hints: list[str] | None = None
+     author_hints: list[str] | None = None
+
+     def to_dict(self) -> dict[str, str | list[str] | None]:
+         """Convert to dictionary, excluding None values."""
+         return {k: v for k, v in self.__dict__.items() if v is not None}
+
+
+ @dataclass(frozen=True)
+ class PDFContent:
+     """Extracted content from PDF."""
+
+     text: str
+     metadata: PDFMetadata
+     page_count: int
+
+
+ @dataclass
+ class FileRenameOperation:
+     """Represents a file rename operation."""
+
+     original_path: Path
+     suggested_filename: str
+     confidence: ConfidenceLevel
+     reasoning: str
+     text_excerpt: str
+     metadata: PDFMetadata
+
+     @property
+     def new_filename(self) -> str:
+         """Get the new filename with extension."""
+         return f"{self.suggested_filename}.pdf"
+
+     def create_new_path(self, output_dir: Path | None = None) -> Path:
+         """Create the new path for the renamed file."""
+         target_dir = output_dir if output_dir else self.original_path.parent
+         return target_dir / self.new_filename
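
Taken together, these models describe one pass of the pipeline: the LLM's FilenameResult is folded into a FileRenameOperation, whose create_new_path resolves the target next to the original file or under an explicit output directory. A minimal sketch with purely illustrative values (not part of the package):

from pathlib import Path

from pdf_renamer.domain.models import ConfidenceLevel, FileRenameOperation, PDFMetadata

# Illustrative values only: pair an LLM suggestion with its source file.
operation = FileRenameOperation(
    original_path=Path("papers/scan_0042.pdf"),
    suggested_filename="Smith-Neural-Networks-Deep-Learning-2020",
    confidence=ConfidenceLevel.HIGH,
    reasoning="Title, first author and year were all visible on page 1",
    text_excerpt="Neural Networks for Deep Learning\nJane Smith ...",
    metadata=PDFMetadata(title="Neural Networks for Deep Learning"),
)

print(operation.new_filename)                    # Smith-Neural-Networks-Deep-Learning-2020.pdf
print(operation.create_new_path())               # papers/Smith-Neural-Networks-Deep-Learning-2020.pdf
print(operation.create_new_path(Path("renamed")))  # renamed/Smith-Neural-Networks-Deep-Learning-2020.pdf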
--- /dev/null
+++ pdf_renamer/domain/ports.py
@@ -0,0 +1,106 @@
+ """Domain ports - interfaces for external dependencies (Dependency Inversion Principle)."""
+
+ from abc import ABC, abstractmethod
+ from pathlib import Path
+
+ from pdf_renamer.domain.models import FilenameResult, PDFContent
+
+
+ class PDFExtractor(ABC):
+     """Interface for PDF text extraction."""
+
+     @abstractmethod
+     async def extract(self, pdf_path: Path) -> PDFContent:
+         """
+         Extract text and metadata from a PDF file.
+
+         Args:
+             pdf_path: Path to the PDF file
+
+         Returns:
+             PDFContent with extracted text and metadata
+
+         Raises:
+             RuntimeError: If extraction fails
+         """
+         pass
+
+
+ class LLMProvider(ABC):
+     """Interface for LLM providers."""
+
+     @abstractmethod
+     async def generate_filename(
+         self,
+         original_filename: str,
+         text_excerpt: str,
+         metadata_dict: dict[str, str | list[str] | None],
+     ) -> FilenameResult:
+         """
+         Generate a filename suggestion using an LLM.
+
+         Args:
+             original_filename: Current filename
+             text_excerpt: Extracted text from PDF
+             metadata_dict: PDF metadata dictionary
+
+         Returns:
+             FilenameResult with suggestion and confidence
+
+         Raises:
+             RuntimeError: If generation fails
+         """
+         pass
+
+
+ class FilenameGenerator(ABC):
+     """Interface for filename generation service."""
+
+     @abstractmethod
+     async def generate(self, original_filename: str, content: PDFContent) -> FilenameResult:
+         """
+         Generate a filename suggestion based on PDF content.
+
+         Args:
+             original_filename: Current filename
+             content: Extracted PDF content
+
+         Returns:
+             FilenameResult with suggestion
+         """
+         pass
+
+     @abstractmethod
+     def sanitize(self, filename: str) -> str:
+         """
+         Sanitize a filename to be filesystem-safe.
+
+         Args:
+             filename: Raw filename
+
+         Returns:
+             Sanitized filename
+         """
+         pass
+
+
+ class FileRenamer(ABC):
+     """Interface for file renaming operations."""
+
+     @abstractmethod
+     async def rename(self, original_path: Path, new_path: Path, dry_run: bool = True) -> bool:
+         """
+         Rename a file.
+
+         Args:
+             original_path: Original file path
+             new_path: New file path
+             dry_run: If True, don't actually rename
+
+         Returns:
+             True if successful
+
+         Raises:
+             RuntimeError: If rename fails
+         """
+         pass
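
Because the ports are plain ABCs, any class implementing the abstract methods can be injected. As a hypothetical illustration (not shipped with the package), a test double satisfying LLMProvider without a live model might look like:

from pdf_renamer.domain.models import ConfidenceLevel, FilenameResult
from pdf_renamer.domain.ports import LLMProvider


class StaticLLMProvider(LLMProvider):
    """Returns a canned suggestion; handy for wiring tests without a live LLM."""

    async def generate_filename(
        self,
        original_filename: str,
        text_excerpt: str,
        metadata_dict: dict[str, str | list[str] | None],
    ) -> FilenameResult:
        # Always the same answer, marked LOW so callers exercise their fallback paths.
        return FilenameResult(
            filename="Example-Document-2024",
            confidence=ConfidenceLevel.LOW,
            reasoning="Static placeholder used in tests",
        )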
--- /dev/null
+++ pdf_renamer/infrastructure/__init__.py
@@ -0,0 +1,5 @@
+ """Infrastructure layer - external dependencies and implementations."""
+
+ from pdf_renamer.infrastructure.config import Settings, get_settings
+
+ __all__ = ["Settings", "get_settings"]
--- /dev/null
+++ pdf_renamer/infrastructure/config.py
@@ -0,0 +1,94 @@
+ """Configuration management using Pydantic Settings."""
+
+ from functools import lru_cache
+ from typing import Literal
+
+ from pydantic import Field
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+ class Settings(BaseSettings):
+     """Application settings loaded from environment variables."""
+
+     model_config = SettingsConfigDict(
+         env_file=".env",
+         env_file_encoding="utf-8",
+         case_sensitive=False,
+         extra="ignore",
+     )
+
+     # LLM Configuration
+     llm_provider: Literal["openai"] = Field(
+         default="openai",
+         description="LLM provider to use",
+     )
+     llm_model: str = Field(
+         default="llama3.2",
+         description="Model name to use",
+     )
+     llm_base_url: str = Field(
+         default="http://localhost:11434/v1",
+         description="Base URL for OpenAI-compatible API",
+     )
+     openai_api_key: str | None = Field(
+         default=None,
+         description="OpenAI API key (optional for local models)",
+     )
+
+     # PDF Extraction Configuration
+     pdf_max_pages: int = Field(
+         default=5,
+         ge=1,
+         le=50,
+         description="Maximum pages to extract from PDF",
+     )
+     pdf_max_chars: int = Field(
+         default=8000,
+         ge=1000,
+         le=50000,
+         description="Maximum characters to extract from PDF",
+     )
+     pdf_extractor: Literal["docling", "pymupdf"] = Field(
+         default="docling",
+         description="PDF extractor to use (docling for better structure, pymupdf for speed)",
+     )
+
+     # Processing Configuration
+     max_concurrent_api: int = Field(
+         default=3,
+         ge=1,
+         le=20,
+         description="Maximum concurrent API calls",
+     )
+     max_concurrent_pdf: int = Field(
+         default=10,
+         ge=1,
+         le=50,
+         description="Maximum concurrent PDF extractions",
+     )
+
+     # Retry Configuration
+     retry_max_attempts: int = Field(
+         default=3,
+         ge=1,
+         le=10,
+         description="Maximum retry attempts for API calls",
+     )
+     retry_min_wait: int = Field(
+         default=4,
+         ge=1,
+         le=60,
+         description="Minimum wait time for exponential backoff (seconds)",
+     )
+     retry_max_wait: int = Field(
+         default=30,
+         ge=1,
+         le=300,
+         description="Maximum wait time for exponential backoff (seconds)",
+     )
+
+
+ @lru_cache
+ def get_settings() -> Settings:
+     """Get cached settings instance (singleton pattern)."""
+     return Settings()
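
With BaseSettings, case_sensitive=False, and no env prefix, each field maps to an environment variable of the same name (or an entry in .env). A brief consumption sketch, assuming LLM_MODEL and PDF_EXTRACTOR are exported:

# e.g. LLM_MODEL=qwen2.5 and PDF_EXTRACTOR=pymupdf in the environment or in .env
from pdf_renamer.infrastructure.config import get_settings

settings = get_settings()      # cached by lru_cache, so repeated calls reuse one instance
print(settings.llm_model)      # "qwen2.5" if overridden, otherwise the default "llama3.2"
print(settings.pdf_extractor)  # "pymupdf" if overridden, otherwise "docling"
print(settings.pdf_max_pages)  # 5 unless PDF_MAX_PAGES is set (validated to the 1-50 range)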
--- /dev/null
+++ pdf_renamer/infrastructure/llm/__init__.py
@@ -0,0 +1,5 @@
+ """LLM provider implementations."""
+
+ from pdf_renamer.infrastructure.llm.pydantic_ai_provider import PydanticAIProvider
+
+ __all__ = ["PydanticAIProvider"]
--- /dev/null
+++ pdf_renamer/infrastructure/llm/pydantic_ai_provider.py
@@ -0,0 +1,234 @@
+ """LLM provider using Pydantic AI for structured output generation."""
+
+ from openai import (
+     APIConnectionError,
+     APIError,
+     APITimeoutError,
+     AsyncOpenAI,
+     RateLimitError,
+ )
+ from pydantic_ai import Agent
+ from pydantic_ai.models.openai import OpenAIModel
+ from pydantic_ai.providers.openai import OpenAIProvider
+ from tenacity import (
+     retry,
+     retry_if_exception_type,
+     stop_after_attempt,
+     wait_exponential,
+ )
+
+ from pdf_renamer.domain.models import ConfidenceLevel, FilenameResult
+ from pdf_renamer.domain.ports import LLMProvider
+
+ # System prompt for filename generation
+ FILENAME_GENERATION_PROMPT = """You are an expert at creating concise, descriptive filenames for academic papers and technical documents.
+
+ Your task is to analyze PDF content and suggest a clear, descriptive filename that accurately captures the document's identity.
+
+ CRITICAL: PDF metadata (title, author, subject) is often UNRELIABLE or MISSING. Always prioritize what you find in the actual document text over metadata fields.
+
+ Filename Format: Author-Topic-Year
+ Example: Smith-Neural-Networks-Deep-Learning-2020
+
+ EXTRACTION STRATEGY:
+ 1. AUTHOR: Look for author names in these locations (in order of reliability):
+ - First page header/title area
+ - After the title (often in smaller font or with affiliations)
+ - Paper byline (e.g., "by John Smith" or "Authors: Smith et al.")
+ - Email addresses can help confirm author names
+ - If multiple authors, use ONLY the first author's last name
+ - IGNORE metadata author field if it conflicts with document text
+
+ 2. TOPIC/TITLE: Look for the main title in:
+ - Large text at top of first page (usually biggest font)
+ - Abstract section which often restates the title
+ - Running headers on subsequent pages
+ - Condense long titles to key terms (3-6 words)
+ - Remove generic words like "A Study of", "An Analysis of", "Introduction to"
+ - Keep domain-specific terminology intact
+
+ 3. YEAR: Look for publication year in:
+ - Copyright notice or footer on first page
+ - Date near title or author information
+ - Conference/journal citation info
+ - Page headers/footers
+ - ONLY include year if you find it clearly stated
+ - Do NOT guess or estimate years
+
+ EXAMPLES OF GOOD FILENAMES:
+ - Hinton-Deep-Learning-Review-2015
+ - Vapnik-Support-Vector-Networks-1995
+ - Goodfellow-Generative-Adversarial-Networks-2014
+ - Hochreiter-Long-Short-Term-Memory-1997
+
+ FORMATTING RULES:
+ - Use hyphens between ALL words (no spaces or underscores)
+ - Use title case for all words
+ - Remove special characters: colons, quotes, commas, parentheses
+ - Target 60-100 characters total (can be shorter or slightly longer if needed)
+ - If title is very long, focus on the most distinctive/searchable terms
+
+ CONFIDENCE LEVELS:
+ - HIGH: You found author (first page), clear title, and year in the document text
+ - MEDIUM: You found title and either author OR year, or title is very clear but other elements missing
+ - LOW: Document text is unclear, heavily formatted, or you can only extract partial information
+
+ IMPORTANT: When metadata contradicts document text, TRUST THE DOCUMENT TEXT. Explain your reasoning briefly."""
+
+
+ class PydanticAIProvider(LLMProvider):
+     """LLM provider using Pydantic AI with structured outputs."""
+
+     def __init__(
+         self,
+         model_name: str,
+         api_key: str | None = None,
+         base_url: str | None = None,
+         retry_max_attempts: int = 3,
+         retry_min_wait: int = 4,
+         retry_max_wait: int = 30,
+     ) -> None:
+         """
+         Initialize the Pydantic AI provider.
+
+         Args:
+             model_name: Model name to use
+             api_key: API key (optional for local models)
+             base_url: Base URL for OpenAI-compatible API
+             retry_max_attempts: Maximum retry attempts
+             retry_min_wait: Minimum wait time for retries (seconds)
+             retry_max_wait: Maximum wait time for retries (seconds)
+         """
+         self.model_name = model_name
+         self.retry_max_attempts = retry_max_attempts
+         self.retry_min_wait = retry_min_wait
+         self.retry_max_wait = retry_max_wait
+
+         # Create model with appropriate configuration
+         if base_url:
+             client = AsyncOpenAI(base_url=base_url, api_key=api_key or "dummy-key")
+             provider = OpenAIProvider(openai_client=client)
+             model = OpenAIModel(model_name, provider=provider)
+         else:
+             if api_key:
+                 client = AsyncOpenAI(api_key=api_key)
+                 provider = OpenAIProvider(openai_client=client)
+                 model = OpenAIModel(model_name, provider=provider)
+             else:
+                 model = OpenAIModel(model_name)
+
+         # Create agent with structured output
+         self.agent: Agent[None, FilenameResult] = Agent(
+             model=model,
+             output_type=FilenameResult,
+             system_prompt=FILENAME_GENERATION_PROMPT,
+         )
+
+     @retry(
+         stop=stop_after_attempt(3),
+         wait=wait_exponential(multiplier=2, min=4, max=30),
+         retry=retry_if_exception_type(
+             (APIError, APIConnectionError, RateLimitError, APITimeoutError)
+         ),
+         reraise=True,
+     )
+     async def generate_filename(
+         self,
+         original_filename: str,
+         text_excerpt: str,
+         metadata_dict: dict[str, str | list[str] | None],
+     ) -> FilenameResult:
+         """
+         Generate filename using LLM with retry logic.
+
+         Args:
+             original_filename: Current filename
+             text_excerpt: Extracted text from PDF
+             metadata_dict: PDF metadata dictionary
+
+         Returns:
+             FilenameResult with suggestion
+
+         Raises:
+             RuntimeError: If generation fails after retries
+         """
+         try:
+             # Build context for LLM
+             context_parts = [f"Original filename: {original_filename}"]
+
+             # Add metadata hints if available
+             if title := metadata_dict.get("title"):
+                 context_parts.append(f"PDF Title metadata (may be unreliable): {title}")
+             if author := metadata_dict.get("author"):
+                 context_parts.append(f"PDF Author metadata (may be unreliable): {author}")
+             if subject := metadata_dict.get("subject"):
+                 context_parts.append(f"PDF Subject metadata (may be unreliable): {subject}")
+
+             # Add focused metadata hints
+             year_hints = metadata_dict.get("year_hints")
+             if year_hints and isinstance(year_hints, list):
+                 context_parts.append(f"Years found in document: {', '.join(year_hints)}")
+
+             email_hints = metadata_dict.get("email_hints")
+             if email_hints and isinstance(email_hints, list):
+                 context_parts.append(
+                     f"Email addresses found (often near authors): {', '.join(email_hints[:2])}"
+                 )
+
+             author_hints = metadata_dict.get("author_hints")
+             if author_hints and isinstance(author_hints, list):
+                 context_parts.append("Possible author sections:\n" + "\n".join(author_hints[:2]))
+             if header_text := metadata_dict.get("header_text"):
+                 context_parts.append(f"First 500 chars (likely title/author area):\n{header_text}")
+
+             # Add full text excerpt
+             context_parts.append(f"\nFull content excerpt (first ~5 pages):\n{text_excerpt}")
+
+             context = "\n".join(context_parts)
+
+             # Generate filename
+             result = await self.agent.run(context)
+             suggestion = result.output
+
+             # If confidence is low, try a focused second pass
+             if suggestion.confidence == ConfidenceLevel.LOW:
+                 suggestion = await self._retry_with_focus(original_filename, text_excerpt)
+
+             return suggestion
+
+         except Exception as e:
+             msg = f"Failed to generate filename: {e}"
+             raise RuntimeError(msg) from e
+
+     async def _retry_with_focus(self, original_filename: str, text_excerpt: str) -> FilenameResult:
+         """
+         Retry filename generation with more focused prompting.
+
+         Args:
+             original_filename: Current filename
+             text_excerpt: Extracted text from PDF
+
+         Returns:
+             FilenameResult from second pass
+         """
+         # Focus on first portion of text
+         first_pages = text_excerpt[:4000]
+
+         focused_context = f"""SECOND PASS - The initial analysis had low confidence. Please analyze more carefully.
+
+ Original filename: {original_filename}
+
+ FOCUS ON: The first few pages contain the most important metadata (title, author, year).
+ Look VERY carefully at:
+ 1. The largest text on page 1 (this is usually the title)
+ 2. Text immediately after the title (usually authors and affiliations)
+ 3. Any dates, copyright notices, or publication info on page 1
+ 4. Headers and footers that might contain publication info
+
+ First pages content:
+ {first_pages}
+
+ Please extract whatever information you can find with certainty. If you cannot find author or year, that's OK - just provide the best title you can determine."""
+
+         result = await self.agent.run(focused_context)
+         return result.output
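
A possible wiring of the provider against a local OpenAI-compatible endpoint (the Ollama-style default from the config module); the filename and excerpt below are illustrative only:

import asyncio

from pdf_renamer.infrastructure.llm import PydanticAIProvider


async def main() -> None:
    provider = PydanticAIProvider(
        model_name="llama3.2",
        base_url="http://localhost:11434/v1",  # api_key is optional for local models
    )
    result = await provider.generate_filename(
        original_filename="scan_0042.pdf",
        text_excerpt="Neural Networks for Deep Learning\nJane Smith, Example University ...",
        metadata_dict={"year_hints": ["2020"]},
    )
    print(result.filename, result.confidence, result.reasoning)


asyncio.run(main())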
--- /dev/null
+++ pdf_renamer/infrastructure/pdf/__init__.py
@@ -0,0 +1,7 @@
+ """PDF extraction implementations."""
+
+ from pdf_renamer.infrastructure.pdf.composite import CompositePDFExtractor
+ from pdf_renamer.infrastructure.pdf.docling_extractor import DoclingPDFExtractor
+ from pdf_renamer.infrastructure.pdf.pymupdf_extractor import PyMuPDFExtractor
+
+ __all__ = ["CompositePDFExtractor", "DoclingPDFExtractor", "PyMuPDFExtractor"]
--- /dev/null
+++ pdf_renamer/infrastructure/pdf/composite.py
@@ -0,0 +1,57 @@
+ """Composite PDF extractor that tries multiple strategies."""
+
+ from pathlib import Path
+
+ from pdf_renamer.domain.models import PDFContent
+ from pdf_renamer.domain.ports import PDFExtractor
+
+
+ class CompositePDFExtractor(PDFExtractor):
+     """
+     Composite PDF extractor that tries multiple extractors in sequence.
+
+     This implements the Chain of Responsibility pattern with fallback strategy.
+     """
+
+     def __init__(self, extractors: list[PDFExtractor]) -> None:
+         """
+         Initialize the composite extractor.
+
+         Args:
+             extractors: List of extractors to try in order
+         """
+         if not extractors:
+             msg = "At least one extractor must be provided"
+             raise ValueError(msg)
+         self.extractors = extractors
+
+     async def extract(self, pdf_path: Path) -> PDFContent:
+         """
+         Try extractors in sequence until one succeeds.
+
+         Args:
+             pdf_path: Path to PDF file
+
+         Returns:
+             PDFContent from first successful extractor
+
+         Raises:
+             RuntimeError: If all extractors fail
+         """
+         errors: list[str] = []
+
+         for extractor in self.extractors:
+             try:
+                 content = await extractor.extract(pdf_path)
+                 # Only accept if we got meaningful text
+                 if len(content.text.strip()) > 100:
+                     return content
+                 errors.append(f"{extractor.__class__.__name__}: Insufficient text extracted")
+             except Exception as e:
+                 errors.append(f"{extractor.__class__.__name__}: {e}")
+                 continue
+
+         # All extractors failed
+         error_msg = "; ".join(errors)
+         msg = f"All PDF extractors failed for {pdf_path}: {error_msg}"
+         raise RuntimeError(msg)
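
A sketch of the intended composition, mirroring the pdf_extractor setting: docling first for structure, PyMuPDF as the faster fallback when docling fails or returns too little text. The concrete extractors' constructors are not shown in this diff, so zero-argument construction is an assumption here:

import asyncio
from pathlib import Path

from pdf_renamer.infrastructure.pdf import (
    CompositePDFExtractor,
    DoclingPDFExtractor,
    PyMuPDFExtractor,
)

# Order matters: the first extractor that returns more than 100 characters of text wins.
extractor = CompositePDFExtractor([DoclingPDFExtractor(), PyMuPDFExtractor()])
content = asyncio.run(extractor.extract(Path("papers/scan_0042.pdf")))
print(content.page_count, content.metadata.to_dict())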