pdf-file-renamer 0.4.2__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. {pdf_renamer → pdf_file_renamer}/__init__.py +1 -1
  2. pdf_file_renamer/application/__init__.py +7 -0
  3. pdf_file_renamer/application/filename_service.py +172 -0
  4. {pdf_renamer → pdf_file_renamer}/application/pdf_rename_workflow.py +29 -4
  5. {pdf_renamer → pdf_file_renamer}/application/rename_service.py +1 -1
  6. {pdf_renamer → pdf_file_renamer}/domain/__init__.py +2 -2
  7. {pdf_renamer → pdf_file_renamer}/domain/models.py +29 -0
  8. {pdf_renamer → pdf_file_renamer}/domain/ports.py +18 -1
  9. {pdf_renamer → pdf_file_renamer}/infrastructure/__init__.py +1 -1
  10. pdf_file_renamer/infrastructure/doi/__init__.py +5 -0
  11. pdf_file_renamer/infrastructure/doi/pdf2doi_extractor.py +129 -0
  12. pdf_file_renamer/infrastructure/llm/__init__.py +5 -0
  13. {pdf_renamer → pdf_file_renamer}/infrastructure/llm/pydantic_ai_provider.py +2 -2
  14. pdf_file_renamer/infrastructure/pdf/__init__.py +7 -0
  15. {pdf_renamer → pdf_file_renamer}/infrastructure/pdf/composite.py +2 -2
  16. {pdf_renamer → pdf_file_renamer}/infrastructure/pdf/docling_extractor.py +2 -2
  17. {pdf_renamer → pdf_file_renamer}/infrastructure/pdf/pymupdf_extractor.py +2 -2
  18. {pdf_renamer → pdf_file_renamer}/main.py +1 -1
  19. pdf_file_renamer/presentation/__init__.py +6 -0
  20. {pdf_renamer → pdf_file_renamer}/presentation/cli.py +10 -5
  21. {pdf_renamer → pdf_file_renamer}/presentation/formatters.py +1 -1
  22. {pdf_file_renamer-0.4.2.dist-info → pdf_file_renamer-0.6.0.dist-info}/METADATA +50 -23
  23. pdf_file_renamer-0.6.0.dist-info/RECORD +27 -0
  24. {pdf_file_renamer-0.4.2.dist-info → pdf_file_renamer-0.6.0.dist-info}/WHEEL +1 -2
  25. pdf_file_renamer-0.6.0.dist-info/entry_points.txt +2 -0
  26. pdf_file_renamer-0.4.2.dist-info/RECORD +0 -26
  27. pdf_file_renamer-0.4.2.dist-info/entry_points.txt +0 -2
  28. pdf_file_renamer-0.4.2.dist-info/top_level.txt +0 -1
  29. pdf_renamer/application/__init__.py +0 -7
  30. pdf_renamer/application/filename_service.py +0 -70
  31. pdf_renamer/infrastructure/llm/__init__.py +0 -5
  32. pdf_renamer/infrastructure/pdf/__init__.py +0 -7
  33. pdf_renamer/presentation/__init__.py +0 -6
  34. {pdf_renamer → pdf_file_renamer}/infrastructure/config.py +0 -0
  35. {pdf_file_renamer-0.4.2.dist-info → pdf_file_renamer-0.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,3 +1,3 @@
1
1
  """PDF Renamer - Intelligent PDF file renaming using LLMs."""
2
2
 
3
- __version__ = "0.4.2"
3
+ __version__ = "0.6.0"
@@ -0,0 +1,7 @@
1
+ """Application layer - use cases and business logic orchestration."""
2
+
3
+ from pdf_file_renamer.application.filename_service import FilenameService
4
+ from pdf_file_renamer.application.pdf_rename_workflow import PDFRenameWorkflow
5
+ from pdf_file_renamer.application.rename_service import RenameService
6
+
7
+ __all__ = ["FilenameService", "PDFRenameWorkflow", "RenameService"]
@@ -0,0 +1,172 @@
1
+ """Filename generation service - coordinates PDF extraction and LLM generation."""
2
+
3
+ import re
4
+
5
+ from pdf_file_renamer.domain.models import ConfidenceLevel, FilenameResult, PDFContent
6
+ from pdf_file_renamer.domain.ports import FilenameGenerator, LLMProvider
7
+
8
+
9
class FilenameService(FilenameGenerator):
    """Service for generating filenames from PDF content.

    When the extracted content carries DOI metadata with a title or an
    author, the filename is built directly from that metadata. Otherwise the
    configured LLM provider is asked for a suggestion.
    """

    # Common words dropped from titles when building a filename. Defined once
    # at class level so the set is not rebuilt on every call.
    _STOP_WORDS = frozenset(
        {
            "a",
            "an",
            "the",
            "and",
            "or",
            "but",
            "in",
            "on",
            "at",
            "to",
            "for",
            "of",
            "with",
            "by",
            "from",
            "as",
            "is",
            "was",
            "are",
            "were",
            "been",
            "be",
            "this",
            "that",
            "these",
            "those",
        }
    )

    def __init__(self, llm_provider: LLMProvider) -> None:
        """
        Initialize the filename service.

        Args:
            llm_provider: LLM provider for filename generation
        """
        self.llm_provider = llm_provider

    async def generate(self, original_filename: str, content: PDFContent) -> FilenameResult:
        """
        Generate a filename suggestion based on PDF content.

        Args:
            original_filename: Current filename
            content: Extracted PDF content

        Returns:
            FilenameResult with suggestion
        """
        doi_meta = content.doi_metadata

        # Use DOI metadata directly only when it has something to name the
        # file with. A bare DOI (no title, no author) would otherwise produce
        # "Unknown-Document" labelled as very-high confidence, so in that
        # case fall through to the LLM instead.
        if doi_meta and (doi_meta.title or doi_meta.first_author):
            return self._generate_from_doi(content)

        # Otherwise, fall back to LLM-based generation
        metadata_dict = content.metadata.to_dict()

        result = await self.llm_provider.generate_filename(
            original_filename=original_filename,
            text_excerpt=content.text,
            metadata_dict=metadata_dict,
        )

        # The LLM output is free text; make it filesystem-safe before returning
        result.filename = self.sanitize(result.filename)

        return result

    def _generate_from_doi(self, content: PDFContent) -> FilenameResult:
        """
        Generate filename directly from DOI metadata.

        Args:
            content: PDF content with DOI metadata

        Returns:
            FilenameResult with very high confidence

        Raises:
            ValueError: If ``content`` has no DOI metadata
        """
        doi_meta = content.doi_metadata
        if not doi_meta:
            msg = "DOI metadata not available"
            raise ValueError(msg)

        # Extract components for filename, with placeholders for missing parts
        author = doi_meta.first_author or "Unknown"
        title = doi_meta.title or "Document"
        title_words = self._extract_key_words(title)
        year = doi_meta.year or ""

        # Build filename: Author-KeyWords-Year (empty components are omitted)
        parts = [author]
        if title_words:
            parts.append(title_words)
        if year:
            parts.append(year)

        filename = self.sanitize("-".join(parts))

        return FilenameResult(
            filename=filename,
            confidence=ConfidenceLevel.VERY_HIGH,
            reasoning=f"Filename generated from DOI metadata (DOI: {doi_meta.doi}). "
            f"Author: {author}, Year: {year}",
        )

    def _extract_key_words(self, title: str, max_words: int = 6) -> str:
        """
        Extract key words from title, removing common words.

        Args:
            title: Paper title
            max_words: Maximum number of words to include

        Returns:
            Hyphenated key words
        """
        # Lower-case, replace punctuation (except hyphens) with spaces, split
        words = re.sub(r"[^\w\s-]", " ", title.lower()).split()

        # Drop stop words and anything of two characters or fewer
        key_words = [w for w in words if w not in self._STOP_WORDS and len(w) > 2]

        # Keep the leading max_words and capitalize the first letter of each
        return "-".join(w.capitalize() for w in key_words[:max_words])

    def sanitize(self, filename: str) -> str:
        """
        Sanitize a filename to be filesystem-safe.

        Args:
            filename: Raw filename

        Returns:
            Sanitized filename
        """
        # Remove invalid characters
        filename = re.sub(r'[<>:"/\\|?*]', "", filename)

        # Replace runs of whitespace/hyphens with a single hyphen
        filename = re.sub(r"[\s\-]+", "-", filename)

        # Remove leading/trailing hyphens
        filename = filename.strip("-")

        # Limit length; re-strip so truncation cannot leave a trailing hyphen
        if len(filename) > 100:
            filename = filename[:100].rstrip("-")

        return filename
@@ -1,11 +1,13 @@
1
1
  """PDF rename workflow - orchestrates the complete process."""
2
2
 
3
3
  import asyncio
4
+ import contextlib
4
5
  from collections.abc import Callable
5
6
  from pathlib import Path
6
7
 
7
- from pdf_renamer.domain.models import FileRenameOperation
8
- from pdf_renamer.domain.ports import (
8
+ from pdf_file_renamer.domain.models import FileRenameOperation
9
+ from pdf_file_renamer.domain.ports import (
10
+ DOIExtractor,
9
11
  FilenameGenerator,
10
12
  FileRenamer,
11
13
  PDFExtractor,
@@ -25,6 +27,7 @@ class PDFRenameWorkflow:
25
27
  pdf_extractor: PDFExtractor,
26
28
  filename_generator: FilenameGenerator,
27
29
  file_renamer: FileRenamer,
30
+ doi_extractor: DOIExtractor | None = None,
28
31
  max_concurrent_api: int = 3,
29
32
  max_concurrent_pdf: int = 10,
30
33
  ) -> None:
@@ -35,12 +38,14 @@ class PDFRenameWorkflow:
35
38
  pdf_extractor: PDF extraction service
36
39
  filename_generator: Filename generation service
37
40
  file_renamer: File renaming service
41
+ doi_extractor: Optional DOI extraction service
38
42
  max_concurrent_api: Maximum concurrent API calls
39
43
  max_concurrent_pdf: Maximum concurrent PDF extractions
40
44
  """
41
45
  self.pdf_extractor = pdf_extractor
42
46
  self.filename_generator = filename_generator
43
47
  self.file_renamer = file_renamer
48
+ self.doi_extractor = doi_extractor
44
49
  self.api_semaphore = asyncio.Semaphore(max_concurrent_api)
45
50
  self.pdf_semaphore = asyncio.Semaphore(max_concurrent_pdf)
46
51
 
@@ -62,17 +67,36 @@ class PDFRenameWorkflow:
62
67
  filename = pdf_path.name
63
68
 
64
69
  try:
70
+ # Try DOI extraction first (if extractor available)
71
+ doi_metadata = None
72
+ if self.doi_extractor:
73
+ if status_callback:
74
+ status_callback(filename, {"status": "DOI Lookup", "stage": "🔍"})
75
+
76
+ # DOI extraction is optional, continue if it fails
77
+ with contextlib.suppress(Exception):
78
+ doi_metadata = await self.doi_extractor.extract_doi(pdf_path)
79
+
65
80
  # Update status: extracting
66
81
  if status_callback:
67
- status_callback(filename, {"status": "Extracting", "stage": "📄"})
82
+ status = "Extracting" if not doi_metadata else "Extracting (DOI found)"
83
+ status_callback(filename, {"status": status, "stage": "📄"})
68
84
 
69
85
  # Extract PDF content (with PDF semaphore to limit memory usage)
70
86
  async with self.pdf_semaphore:
71
87
  content = await self.pdf_extractor.extract(pdf_path)
72
88
 
89
+ # Attach DOI metadata to content if found
90
+ if doi_metadata:
91
+ # Create new content with DOI metadata
92
+ from dataclasses import replace
93
+
94
+ content = replace(content, doi_metadata=doi_metadata)
95
+
73
96
  # Generate filename (with API semaphore to limit API load)
74
97
  if status_callback:
75
- status_callback(filename, {"status": "Analyzing", "stage": "🤖"})
98
+ status = "Analyzing" if not doi_metadata else "Formatting (DOI-based)"
99
+ status_callback(filename, {"status": status, "stage": "🤖"})
76
100
 
77
101
  async with self.api_semaphore:
78
102
  result = await self.filename_generator.generate(filename, content)
@@ -95,6 +119,7 @@ class PDFRenameWorkflow:
95
119
  reasoning=result.reasoning,
96
120
  text_excerpt=content.text,
97
121
  metadata=content.metadata,
122
+ doi_metadata=content.doi_metadata,
98
123
  )
99
124
 
100
125
  except Exception as e:
@@ -3,7 +3,7 @@
3
3
  import shutil
4
4
  from pathlib import Path
5
5
 
6
- from pdf_renamer.domain.ports import FileRenamer
6
+ from pdf_file_renamer.domain.ports import FileRenamer
7
7
 
8
8
 
9
9
  class RenameService(FileRenamer):
@@ -1,12 +1,12 @@
1
1
  """Domain layer - pure business logic with no external dependencies."""
2
2
 
3
- from pdf_renamer.domain.models import (
3
+ from pdf_file_renamer.domain.models import (
4
4
  FilenameResult,
5
5
  FileRenameOperation,
6
6
  PDFContent,
7
7
  PDFMetadata,
8
8
  )
9
- from pdf_renamer.domain.ports import (
9
+ from pdf_file_renamer.domain.ports import (
10
10
  FilenameGenerator,
11
11
  FileRenamer,
12
12
  LLMProvider,
@@ -10,12 +10,39 @@ from pydantic import BaseModel, Field
10
10
  class ConfidenceLevel(str, Enum):
11
11
  """Confidence level for filename suggestions."""
12
12
 
13
+ VERY_HIGH = "very_high" # DOI-backed metadata
13
14
  HIGH = "high"
14
15
  MEDIUM = "medium"
15
16
  LOW = "low"
16
17
  ERROR = "error"
17
18
 
18
19
 
20
+ @dataclass(frozen=True)
21
+ class DOIMetadata:
22
+ """Metadata extracted from DOI lookup."""
23
+
24
+ doi: str
25
+ title: str | None = None
26
+ authors: list[str] | None = None
27
+ year: str | None = None
28
+ journal: str | None = None
29
+ publisher: str | None = None
30
+ raw_bibtex: str | None = None
31
+
32
+ @property
33
+ def first_author(self) -> str | None:
34
+ """Get the first author's last name."""
35
+ if not self.authors or len(self.authors) == 0:
36
+ return None
37
+ # Extract last name from first author (handles "Last, First" or "First Last" formats)
38
+ first = self.authors[0]
39
+ if "," in first:
40
+ return first.split(",")[0].strip()
41
+ # Assume last word is last name
42
+ parts = first.strip().split()
43
+ return parts[-1] if parts else None
44
+
45
+
19
46
  class FilenameResult(BaseModel):
20
47
  """Result of filename generation."""
21
48
 
@@ -56,6 +83,7 @@ class PDFContent:
56
83
  text: str
57
84
  metadata: PDFMetadata
58
85
  page_count: int
86
+ doi_metadata: DOIMetadata | None = None
59
87
 
60
88
 
61
89
  @dataclass
@@ -68,6 +96,7 @@ class FileRenameOperation:
68
96
  reasoning: str
69
97
  text_excerpt: str
70
98
  metadata: PDFMetadata
99
+ doi_metadata: DOIMetadata | None = None
71
100
 
72
101
  @property
73
102
  def new_filename(self) -> str:
@@ -3,7 +3,24 @@
3
3
  from abc import ABC, abstractmethod
4
4
  from pathlib import Path
5
5
 
6
- from pdf_renamer.domain.models import FilenameResult, PDFContent
6
+ from pdf_file_renamer.domain.models import DOIMetadata, FilenameResult, PDFContent
7
+
8
+
9
class DOIExtractor(ABC):
    """Port for services that find a DOI in a PDF and look up its metadata."""

    @abstractmethod
    async def extract_doi(self, pdf_path: Path) -> DOIMetadata | None:
        """
        Locate a DOI in the given PDF and retrieve the associated metadata.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            DOIMetadata if DOI found and validated, None otherwise
        """
7
24
 
8
25
 
9
26
  class PDFExtractor(ABC):
@@ -1,5 +1,5 @@
1
1
  """Infrastructure layer - external dependencies and implementations."""
2
2
 
3
- from pdf_renamer.infrastructure.config import Settings, get_settings
3
+ from pdf_file_renamer.infrastructure.config import Settings, get_settings
4
4
 
5
5
  __all__ = ["Settings", "get_settings"]
@@ -0,0 +1,5 @@
1
+ """DOI extraction infrastructure."""
2
+
3
+ from pdf_file_renamer.infrastructure.doi.pdf2doi_extractor import PDF2DOIExtractor
4
+
5
+ __all__ = ["PDF2DOIExtractor"]
@@ -0,0 +1,129 @@
1
+ """DOI extraction using pdf2doi library."""
2
+
3
+ import asyncio
4
+ import re
5
+ from pathlib import Path
6
+
7
+ import pdf2doi
8
+
9
+ from pdf_file_renamer.domain.models import DOIMetadata
10
+ from pdf_file_renamer.domain.ports import DOIExtractor
11
+
12
+
13
class PDF2DOIExtractor(DOIExtractor):
    """Extract DOI from PDF files using pdf2doi library."""

    # Identifier types (compared lower-cased) accepted as a successful lookup.
    _ACCEPTED_IDENTIFIER_TYPES = ("doi", "arxiv")

    def __init__(self) -> None:
        """Initialize the PDF2DOI extractor."""
        # Suppress pdf2doi verbose output
        pdf2doi.config.set("verbose", False)

    async def extract_doi(self, pdf_path: Path) -> DOIMetadata | None:
        """
        Extract DOI from PDF and fetch metadata.

        Any exception raised during the lookup is logged at debug level and
        converted into a ``None`` result.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            DOIMetadata if DOI found and validated, None otherwise
        """
        import logging

        try:
            # pdf2doi is synchronous; run it in a worker thread so the event
            # loop is not blocked while it works.
            results = await asyncio.to_thread(pdf2doi.pdf2doi, str(pdf_path))

            if not results:
                return None

            # NOTE(review): pdf2doi's README describes a single dict for a
            # single-file target and a list of dicts for a directory - confirm
            # against the pinned version. Both shapes are accepted here.
            result = results if isinstance(results, dict) else results[0]

            # Check if an identifier was found
            identifier = result.get("identifier")
            if not identifier:
                return None

            # "or" guards against the key being present with a None value
            identifier_type = result.get("identifier_type") or ""
            if identifier_type.lower() not in self._ACCEPTED_IDENTIFIER_TYPES:
                return None

            # validation_info is parsed below as bibtex-style "field = {value}"
            # text; anything that is not a string is treated as missing.
            validation_info = result.get("validation_info") or ""
            if not isinstance(validation_info, str):
                validation_info = ""

            return DOIMetadata(
                doi=identifier,
                title=self._extract_bibtex_field(validation_info, "title"),
                authors=self._extract_bibtex_authors(validation_info),
                year=self._extract_bibtex_field(validation_info, "year"),
                journal=self._extract_bibtex_field(validation_info, "journal"),
                publisher=self._extract_bibtex_field(validation_info, "publisher"),
                raw_bibtex=validation_info if validation_info else None,
            )

        except Exception:
            # DOI extraction is opportunistic: a failure here must never abort
            # the rename. Record it at debug level so it stays diagnosable.
            logging.getLogger(__name__).debug(
                "DOI extraction failed for %s", pdf_path, exc_info=True
            )
            return None

    def _extract_bibtex_field(self, bibtex: str, field: str) -> str | None:
        """
        Extract a field from bibtex string.

        Args:
            bibtex: Bibtex string
            field: Field name to extract

        Returns:
            Field value or None
        """
        if not bibtex:
            return None

        # Match field = {value} or field = "value". The lookbehind stops a
        # name from matching inside a longer one (e.g. "title" inside
        # "booktitle"), and the name is escaped so it is matched literally.
        pattern = r"(?<!\w)" + re.escape(field) + r"\s*=\s*[{\"](.*?)[\}\"](,|\n|$)"
        match = re.search(pattern, bibtex, re.IGNORECASE)

        if match:
            return match.group(1).strip()

        return None

    def _extract_bibtex_authors(self, bibtex: str) -> list[str] | None:
        """
        Extract authors from bibtex string.

        Args:
            bibtex: Bibtex string

        Returns:
            List of author names or None
        """
        # author = {Name1 and Name2 and Name3}
        authors_str = self._extract_bibtex_field(bibtex, "author")
        if authors_str is None:
            return None

        # Split by "and" and clean up
        authors = [
            author.strip()
            for author in re.split(r"\s+and\s+", authors_str, flags=re.IGNORECASE)
            if author.strip()
        ]

        return authors if authors else None
@@ -0,0 +1,5 @@
1
+ """LLM provider implementations."""
2
+
3
+ from pdf_file_renamer.infrastructure.llm.pydantic_ai_provider import PydanticAIProvider
4
+
5
+ __all__ = ["PydanticAIProvider"]
@@ -17,8 +17,8 @@ from tenacity import (
17
17
  wait_exponential,
18
18
  )
19
19
 
20
- from pdf_renamer.domain.models import ConfidenceLevel, FilenameResult
21
- from pdf_renamer.domain.ports import LLMProvider
20
+ from pdf_file_renamer.domain.models import ConfidenceLevel, FilenameResult
21
+ from pdf_file_renamer.domain.ports import LLMProvider
22
22
 
23
23
  # System prompt for filename generation
24
24
  FILENAME_GENERATION_PROMPT = """You are an expert at creating concise, descriptive filenames for academic papers and technical documents.
@@ -0,0 +1,7 @@
1
+ """PDF extraction implementations."""
2
+
3
+ from pdf_file_renamer.infrastructure.pdf.composite import CompositePDFExtractor
4
+ from pdf_file_renamer.infrastructure.pdf.docling_extractor import DoclingPDFExtractor
5
+ from pdf_file_renamer.infrastructure.pdf.pymupdf_extractor import PyMuPDFExtractor
6
+
7
+ __all__ = ["CompositePDFExtractor", "DoclingPDFExtractor", "PyMuPDFExtractor"]
@@ -2,8 +2,8 @@
2
2
 
3
3
  from pathlib import Path
4
4
 
5
- from pdf_renamer.domain.models import PDFContent
6
- from pdf_renamer.domain.ports import PDFExtractor
5
+ from pdf_file_renamer.domain.models import PDFContent
6
+ from pdf_file_renamer.domain.ports import PDFExtractor
7
7
 
8
8
 
9
9
  class CompositePDFExtractor(PDFExtractor):
@@ -6,8 +6,8 @@ from pathlib import Path
6
6
  from docling_core.types.doc.page import TextCellUnit
7
7
  from docling_parse.pdf_parser import DoclingPdfParser
8
8
 
9
- from pdf_renamer.domain.models import PDFContent, PDFMetadata
10
- from pdf_renamer.domain.ports import PDFExtractor
9
+ from pdf_file_renamer.domain.models import PDFContent, PDFMetadata
10
+ from pdf_file_renamer.domain.ports import PDFExtractor
11
11
 
12
12
 
13
13
  class DoclingPDFExtractor(PDFExtractor):
@@ -5,8 +5,8 @@ from pathlib import Path
5
5
 
6
6
  import pymupdf
7
7
 
8
- from pdf_renamer.domain.models import PDFContent, PDFMetadata
9
- from pdf_renamer.domain.ports import PDFExtractor
8
+ from pdf_file_renamer.domain.models import PDFContent, PDFMetadata
9
+ from pdf_file_renamer.domain.ports import PDFExtractor
10
10
 
11
11
 
12
12
  class PyMuPDFExtractor(PDFExtractor):
@@ -1,6 +1,6 @@
1
1
  """Main entry point for the PDF renamer application."""
2
2
 
3
- from pdf_renamer.presentation.cli import app
3
+ from pdf_file_renamer.presentation.cli import app
4
4
 
5
5
  if __name__ == "__main__":
6
6
  app()
@@ -0,0 +1,6 @@
1
+ """Presentation layer - CLI and user interaction."""
2
+
3
+ from pdf_file_renamer.presentation.cli import app
4
+ from pdf_file_renamer.presentation.formatters import ProgressDisplay
5
+
6
+ __all__ = ["ProgressDisplay", "app"]
@@ -9,19 +9,20 @@ import typer
9
9
  from rich.console import Console
10
10
  from rich.live import Live
11
11
 
12
- from pdf_renamer.application import (
12
+ from pdf_file_renamer.application import (
13
13
  FilenameService,
14
14
  PDFRenameWorkflow,
15
15
  RenameService,
16
16
  )
17
- from pdf_renamer.infrastructure.config import Settings
18
- from pdf_renamer.infrastructure.llm import PydanticAIProvider
19
- from pdf_renamer.infrastructure.pdf import (
17
+ from pdf_file_renamer.infrastructure.config import Settings
18
+ from pdf_file_renamer.infrastructure.doi import PDF2DOIExtractor
19
+ from pdf_file_renamer.infrastructure.llm import PydanticAIProvider
20
+ from pdf_file_renamer.infrastructure.pdf import (
20
21
  CompositePDFExtractor,
21
22
  DoclingPDFExtractor,
22
23
  PyMuPDFExtractor,
23
24
  )
24
- from pdf_renamer.presentation.formatters import (
25
+ from pdf_file_renamer.presentation.formatters import (
25
26
  InteractivePrompt,
26
27
  ProgressDisplay,
27
28
  ResultsTable,
@@ -64,6 +65,9 @@ def create_workflow(settings: Settings) -> PDFRenameWorkflow:
64
65
  retry_max_wait=settings.retry_max_wait,
65
66
  )
66
67
 
68
+ # Create DOI extractor
69
+ doi_extractor = PDF2DOIExtractor()
70
+
67
71
  # Create application services
68
72
  filename_service = FilenameService(llm_provider)
69
73
  file_renamer = RenameService()
@@ -73,6 +77,7 @@ def create_workflow(settings: Settings) -> PDFRenameWorkflow:
73
77
  pdf_extractor=pdf_extractor,
74
78
  filename_generator=filename_service,
75
79
  file_renamer=file_renamer,
80
+ doi_extractor=doi_extractor,
76
81
  max_concurrent_api=settings.max_concurrent_api,
77
82
  max_concurrent_pdf=settings.max_concurrent_pdf,
78
83
  )
@@ -7,7 +7,7 @@ from rich.prompt import Prompt
7
7
  from rich.table import Table
8
8
  from rich.text import Text
9
9
 
10
- from pdf_renamer.domain.models import FileRenameOperation
10
+ from pdf_file_renamer.domain.models import FileRenameOperation
11
11
 
12
12
 
13
13
  class ProgressDisplay:
@@ -1,28 +1,28 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pdf-file-renamer
3
- Version: 0.4.2
3
+ Version: 0.6.0
4
4
  Summary: Intelligent PDF renaming using LLMs
5
- Requires-Python: >=3.11
6
- Description-Content-Type: text/markdown
7
5
  License-File: LICENSE
8
- Requires-Dist: pydantic>=2.10.6
6
+ Requires-Python: >=3.11
7
+ Requires-Dist: docling-core>=2.0.0
8
+ Requires-Dist: docling-parse>=2.0.0
9
+ Requires-Dist: pdf2doi>=1.7
9
10
  Requires-Dist: pydantic-ai>=1.0.17
10
11
  Requires-Dist: pydantic-settings>=2.7.1
12
+ Requires-Dist: pydantic>=2.10.6
11
13
  Requires-Dist: pymupdf>=1.26.5
12
- Requires-Dist: docling-parse>=2.0.0
13
- Requires-Dist: docling-core>=2.0.0
14
14
  Requires-Dist: python-dotenv>=1.1.1
15
15
  Requires-Dist: rich>=14.2.0
16
- Requires-Dist: typer>=0.19.2
17
16
  Requires-Dist: tenacity>=9.0.0
17
+ Requires-Dist: typer>=0.19.2
18
18
  Provides-Extra: dev
19
- Requires-Dist: pytest>=8.3.4; extra == "dev"
20
- Requires-Dist: pytest-cov>=6.0.0; extra == "dev"
21
- Requires-Dist: pytest-asyncio>=0.25.2; extra == "dev"
22
- Requires-Dist: pytest-mock>=3.14.0; extra == "dev"
23
- Requires-Dist: ruff>=0.9.1; extra == "dev"
24
- Requires-Dist: mypy>=1.14.1; extra == "dev"
25
- Dynamic: license-file
19
+ Requires-Dist: mypy>=1.14.1; extra == 'dev'
20
+ Requires-Dist: pytest-asyncio>=0.25.2; extra == 'dev'
21
+ Requires-Dist: pytest-cov>=6.0.0; extra == 'dev'
22
+ Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
23
+ Requires-Dist: pytest>=8.3.4; extra == 'dev'
24
+ Requires-Dist: ruff>=0.9.1; extra == 'dev'
25
+ Description-Content-Type: text/markdown
26
26
 
27
27
  # PDF Renamer
28
28
 
@@ -44,9 +44,11 @@ Intelligent PDF file renaming using LLMs. This tool analyzes PDF content and met
44
44
 
45
45
  ## Features
46
46
 
47
+ - **DOI-based naming** - Automatically extracts DOI and fetches authoritative metadata for academic papers
47
48
  - **Advanced PDF parsing** using docling-parse for better structure-aware extraction
48
49
  - **OCR fallback** for scanned PDFs with low text content
49
50
  - **Smart LLM prompting** with multi-pass analysis for improved accuracy
51
+ - **Hybrid approach** - Uses DOI metadata when available, falls back to LLM analysis for other documents
50
52
  - Suggests filenames in format: `Author-Topic-Year.pdf`
51
53
  - Dry-run mode to preview changes before applying
52
54
  - **Enhanced interactive mode** with options to accept, manually edit, retry, or skip each file
@@ -209,19 +211,44 @@ You can use interactive mode with `--dry-run` to preview without actually renami
209
211
 
210
212
  ## How It Works
211
213
 
212
- 1. **Extract**: Uses docling-parse to read first 5 pages with structure-aware parsing, falls back to PyMuPDF if needed
213
- 2. **OCR**: Automatically applies OCR for scanned PDFs with minimal text
214
- 3. **Metadata Enhancement**: Extracts focused hints (years, emails, author sections) to supplement unreliable PDF metadata
215
- 4. **Analyze**: Sends full content excerpt to LLM with enhanced metadata and detailed extraction instructions
216
- 5. **Multi-pass Review**: Low-confidence results trigger a second analysis pass with focused prompts
217
- 6. **Suggest**: LLM returns filename in `Author-Topic-Year` format with confidence level and reasoning
218
- 7. **Interactive Review** (optional): User can accept, edit, retry, or skip each suggestion
219
- 8. **Rename**: Applies suggestions (if not in dry-run mode)
214
+ ### Intelligent Hybrid Approach
215
+
216
+ The tool uses a multi-strategy approach to generate accurate filenames:
217
+
218
+ 1. **DOI Detection** (for academic papers)
219
+ - Searches PDF for DOI identifiers using [pdf2doi](https://github.com/MicheleCotrufo/pdf2doi)
220
+ - If found, queries authoritative metadata (title, authors, year, journal)
221
+ - Generates filename with **very high confidence** from validated metadata
222
+ - **Saves API costs** - no LLM call needed for papers with DOIs
223
+
224
+ 2. **LLM Analysis** (fallback for non-academic PDFs)
225
+ - **Extract**: Uses docling-parse to read first 5 pages with structure-aware parsing, falls back to PyMuPDF if needed
226
+ - **OCR**: Automatically applies OCR for scanned PDFs with minimal text
227
+ - **Metadata Enhancement**: Extracts focused hints (years, emails, author sections) to supplement unreliable PDF metadata
228
+ - **Analyze**: Sends full content excerpt to LLM with enhanced metadata and detailed extraction instructions
229
+ - **Multi-pass Review**: Low-confidence results trigger a second analysis pass with focused prompts
230
+ - **Suggest**: LLM returns filename in `Author-Topic-Year` format with confidence level and reasoning
231
+
232
+ 3. **Interactive Review** (optional): User can accept, edit, retry, or skip each suggestion
233
+ 4. **Rename**: Applies suggestions (if not in dry-run mode)
234
+
235
+ ### Benefits of DOI Integration
236
+
237
+ - **Accuracy**: DOI metadata is canonical and verified
238
+ - **Speed**: Instant lookup vs. LLM processing time
239
+ - **Cost**: Free DOI lookups save on API costs for academic papers
240
+ - **Reliability**: Works even when PDF text extraction is poor
220
241
 
221
242
  ## Cost Considerations
222
243
 
223
- **OpenAI:**
244
+ **DOI-based Naming (Academic Papers):**
245
+ - **Completely free** - No API costs
246
+ - **No LLM needed** - Direct metadata lookup
247
+ - Works for most academic papers with embedded DOIs
248
+
249
+ **OpenAI (Fallback):**
224
250
  - Uses `gpt-4o-mini` by default (very cost-effective)
251
+ - Only called when DOI not found
225
252
  - Processes first ~4500 characters per PDF
226
253
  - Typical cost: ~$0.001-0.003 per PDF
227
254
 
@@ -0,0 +1,27 @@
1
+ pdf_file_renamer/__init__.py,sha256=yCEfy0jblhbUMNTSjJKPuW4zADXoI6IfICx8XvB4R8Q,85
2
+ pdf_file_renamer/main.py,sha256=FTEEb-9QmOOsN9SE8L1SZvFVIkVpQDy8xZ5a8t8CWUs,145
3
+ pdf_file_renamer/application/__init__.py,sha256=riSV7UXBenkDst7Nnf11N1_RuRtM7wpKdwugxOhumS4,363
4
+ pdf_file_renamer/application/filename_service.py,sha256=IbeCNBwyhFlCMCZveq16nmQ2qvyTdtgLmr6PDWPckOs,4868
5
+ pdf_file_renamer/application/pdf_rename_workflow.py,sha256=gd53KoR1aFrK__6TArm7Rtn1yNxylEI2ikmubDOByF4,5842
6
+ pdf_file_renamer/application/rename_service.py,sha256=vviNQolk_w-qDQvOKTKj8ZhqYyyNWL-VJMfuUnL6WLw,2357
7
+ pdf_file_renamer/domain/__init__.py,sha256=jxbH3h6xaCnSRuBxclFESl6ZE1pua_I1K4CRAaYxu_I,503
8
+ pdf_file_renamer/domain/models.py,sha256=QwN79TzWmqvQvz-m9ymebvAx3pWlVpSWXNdSEAk4qq0,3186
9
+ pdf_file_renamer/domain/ports.py,sha256=ebOcHptiOK119NCmIwM32_fbRK5xkZP9K67vjL-4k0g,2976
10
+ pdf_file_renamer/infrastructure/__init__.py,sha256=C3ZQ7WCPCa6PMfP00lu4wqb0r57GVyDdiD5EL2DhCeY,187
11
+ pdf_file_renamer/infrastructure/config.py,sha256=baNL5_6_NNiS50ZNdql7fDwQbeAwf6f58HGYIWFQxQQ,2464
12
+ pdf_file_renamer/infrastructure/doi/__init__.py,sha256=8N9ZEwfG7q5xomzh187YtP8t4CfEBHM334xNRblPeuI,153
13
+ pdf_file_renamer/infrastructure/doi/pdf2doi_extractor.py,sha256=rAo0q5HTCqVTyyIXzW3D6riGS5Q9xlXhbT2AY0Hb7nk,3820
14
+ pdf_file_renamer/infrastructure/llm/__init__.py,sha256=ToB8__mHvXwaIukGKPEAQ8SeC4ZLiH4auZI1P1yH5PQ,159
15
+ pdf_file_renamer/infrastructure/llm/pydantic_ai_provider.py,sha256=kVsmj0NIawkj-1WWM0hZXbsNH09GabVZm9HPlYsxGuo,9217
16
+ pdf_file_renamer/infrastructure/pdf/__init__.py,sha256=uMHqxSXNLZH5WH_e1kXrp9m7uTqPkiI2hXjNo6rCRoo,368
17
+ pdf_file_renamer/infrastructure/pdf/composite.py,sha256=dNrrcGTsGf1LLF4F0AoF7jRbvLkgRGnIF6XNGlg92n4,1801
18
+ pdf_file_renamer/infrastructure/pdf/docling_extractor.py,sha256=auZrJpK7mMg1mUXK6ptjZC1pnAUQje1h7ZAS7gFUBzo,3974
19
+ pdf_file_renamer/infrastructure/pdf/pymupdf_extractor.py,sha256=C61udZCqGqiVx7T0HWNyjvnhgv5AgMIcCYtrhgHOJwk,5465
20
+ pdf_file_renamer/presentation/__init__.py,sha256=1VR44GoPGTixk3hG5YzhGyQf7a4BTKsJBd2VP3rHcFM,211
21
+ pdf_file_renamer/presentation/cli.py,sha256=0t_59-utRWLNCYjFetU0ZHoF1DPTjdNiWM9Au0jFaOg,8013
22
+ pdf_file_renamer/presentation/formatters.py,sha256=Yl-Et7OKMfthyLqTA5qEtSAqh0PfHKp3lNNBA_dn01c,8519
23
+ pdf_file_renamer-0.6.0.dist-info/METADATA,sha256=6XmqT7jtJuqkWlks3FlPWCpNP_tKOGokhxy2Yju5R7k,9912
24
+ pdf_file_renamer-0.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
25
+ pdf_file_renamer-0.6.0.dist-info/entry_points.txt,sha256=0fEGYy60chGE9rECWeCVPxjxzz6vMtIAYdFvmH7xzbw,63
26
+ pdf_file_renamer-0.6.0.dist-info/licenses/LICENSE,sha256=_w08V08WgoMpDMlGNlkIatC5QfQ_Ds_rXOBM8pl7ffE,1068
27
+ pdf_file_renamer-0.6.0.dist-info/RECORD,,
@@ -1,5 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: hatchling 1.27.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
-
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ pdf-file-renamer = pdf_file_renamer.main:app
@@ -1,26 +0,0 @@
1
- pdf_file_renamer-0.4.2.dist-info/licenses/LICENSE,sha256=_w08V08WgoMpDMlGNlkIatC5QfQ_Ds_rXOBM8pl7ffE,1068
2
- pdf_renamer/__init__.py,sha256=3RvsqaTO80Ud1KZZdLL_Lh-HXxagncoqI4m6u3VL_UE,85
3
- pdf_renamer/main.py,sha256=5eTsrCQaotNwbdwJwandOlzrWODI73-L5mALHUIvqyw,140
4
- pdf_renamer/application/__init__.py,sha256=EebV66jsZjubnh6PSEeNGs0A_JGeYXFghzGLDQ92eco,348
5
- pdf_renamer/application/filename_service.py,sha256=Gk-nPnURsJYLDvoG_NZ4o_yHwAqK6bHU8kqzlev0XXM,2029
6
- pdf_renamer/application/pdf_rename_workflow.py,sha256=MEUmDR6bLRB-ncNgKk3ahIfsIIk3Gsw1048cId6pYv4,4710
7
- pdf_renamer/application/rename_service.py,sha256=rnScP2JwKMrIJcplFvxC0b2MOLzWqxpPKc3uDLHPjRI,2352
8
- pdf_renamer/domain/__init__.py,sha256=UPcXunsI30iFK9dupv2Fc_YDreT1tAqsYaGEAK9sJew,493
9
- pdf_renamer/domain/models.py,sha256=7S2ul3BoWi2aivWtmDa9LRlmeqURrGEV1sfSu8W6x5k,2246
10
- pdf_renamer/domain/ports.py,sha256=ecnpkFYB3259ZjaZaOVo1sjP8nXD3x1NGR6hN5nn3gc,2550
11
- pdf_renamer/infrastructure/__init__.py,sha256=CxBinDAuNm2X57-Y7XdXxVL6uHQXQqWpPrlznzu5_1M,182
12
- pdf_renamer/infrastructure/config.py,sha256=baNL5_6_NNiS50ZNdql7fDwQbeAwf6f58HGYIWFQxQQ,2464
13
- pdf_renamer/infrastructure/llm/__init__.py,sha256=evEhabaBshvekLO9DlAZvp-pQ_u03zYXqXaDfa9QUww,154
14
- pdf_renamer/infrastructure/llm/pydantic_ai_provider.py,sha256=FM2Sd3n3lltJC76afrem5QuuS8qApEma52YD-Y8K89Y,9207
15
- pdf_renamer/infrastructure/pdf/__init__.py,sha256=-WHYNLeBekm7jwIXRj4xpSIXyZz9olDiMIJLUjv2B-U,353
16
- pdf_renamer/infrastructure/pdf/composite.py,sha256=1tlZ_X9_KVY01GTr1Hg3x_Ag7g3g4ik6_8R0jip8Wx0,1791
17
- pdf_renamer/infrastructure/pdf/docling_extractor.py,sha256=7UamnbYFMgtD53oMqu1qKAq3FyQTQlq0Uw0k1sNzPw8,3964
18
- pdf_renamer/infrastructure/pdf/pymupdf_extractor.py,sha256=lwIPr9yhy2hZVnuvoLcZvmjYSzbTra0AyW59UvU7GgU,5455
19
- pdf_renamer/presentation/__init__.py,sha256=mxIxy8POUwewiMsmrOMVA8z9pe57lOghuwHZ5RAbMo4,201
20
- pdf_renamer/presentation/cli.py,sha256=ykZx22quR9ye-ui9bLrRinD7BSChjSbGTRsazCafo5s,7819
21
- pdf_renamer/presentation/formatters.py,sha256=ilUcXZ-7MpBlz7k7cqRAuixfkVT3cuD-pBcy5fsE2Qo,8514
22
- pdf_file_renamer-0.4.2.dist-info/METADATA,sha256=xSIAQrGaKmT2o2vOT5HlX6ILaTmDyYbn6P8YG8JtK8U,8668
23
- pdf_file_renamer-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
24
- pdf_file_renamer-0.4.2.dist-info/entry_points.txt,sha256=IvW2oP2SRPv5qqFwDYBRCE53Q3JAyi_chbCo-0rdKQA,53
25
- pdf_file_renamer-0.4.2.dist-info/top_level.txt,sha256=CFtpWKQjLObHZIssi5I3q7FXfLJZWKpHo7uuAiJ0pVY,12
26
- pdf_file_renamer-0.4.2.dist-info/RECORD,,
@@ -1,2 +0,0 @@
1
- [console_scripts]
2
- pdf-renamer = pdf_renamer.main:app
@@ -1 +0,0 @@
1
- pdf_renamer
@@ -1,7 +0,0 @@
1
- """Application layer - use cases and business logic orchestration."""
2
-
3
- from pdf_renamer.application.filename_service import FilenameService
4
- from pdf_renamer.application.pdf_rename_workflow import PDFRenameWorkflow
5
- from pdf_renamer.application.rename_service import RenameService
6
-
7
- __all__ = ["FilenameService", "PDFRenameWorkflow", "RenameService"]
@@ -1,70 +0,0 @@
1
- """Filename generation service - coordinates PDF extraction and LLM generation."""
2
-
3
- import re
4
-
5
- from pdf_renamer.domain.models import FilenameResult, PDFContent
6
- from pdf_renamer.domain.ports import FilenameGenerator, LLMProvider
7
-
8
-
9
- class FilenameService(FilenameGenerator):
10
- """Service for generating filenames from PDF content."""
11
-
12
- def __init__(self, llm_provider: LLMProvider) -> None:
13
- """
14
- Initialize the filename service.
15
-
16
- Args:
17
- llm_provider: LLM provider for filename generation
18
- """
19
- self.llm_provider = llm_provider
20
-
21
- async def generate(self, original_filename: str, content: PDFContent) -> FilenameResult:
22
- """
23
- Generate a filename suggestion based on PDF content.
24
-
25
- Args:
26
- original_filename: Current filename
27
- content: Extracted PDF content
28
-
29
- Returns:
30
- FilenameResult with suggestion
31
- """
32
- # Convert metadata to dictionary
33
- metadata_dict = content.metadata.to_dict()
34
-
35
- # Generate filename using LLM
36
- result = await self.llm_provider.generate_filename(
37
- original_filename=original_filename,
38
- text_excerpt=content.text,
39
- metadata_dict=metadata_dict,
40
- )
41
-
42
- # Sanitize the generated filename
43
- result.filename = self.sanitize(result.filename)
44
-
45
- return result
46
-
47
- def sanitize(self, filename: str) -> str:
48
- """
49
- Sanitize a filename to be filesystem-safe.
50
-
51
- Args:
52
- filename: Raw filename
53
-
54
- Returns:
55
- Sanitized filename
56
- """
57
- # Remove or replace invalid characters
58
- filename = re.sub(r'[<>:"/\\|?*]', "", filename)
59
-
60
- # Replace multiple spaces/hyphens with single hyphen
61
- filename = re.sub(r"[\s\-]+", "-", filename)
62
-
63
- # Remove leading/trailing hyphens
64
- filename = filename.strip("-")
65
-
66
- # Limit length
67
- if len(filename) > 100:
68
- filename = filename[:100].rstrip("-")
69
-
70
- return filename
@@ -1,5 +0,0 @@
1
- """LLM provider implementations."""
2
-
3
- from pdf_renamer.infrastructure.llm.pydantic_ai_provider import PydanticAIProvider
4
-
5
- __all__ = ["PydanticAIProvider"]
@@ -1,7 +0,0 @@
1
- """PDF extraction implementations."""
2
-
3
- from pdf_renamer.infrastructure.pdf.composite import CompositePDFExtractor
4
- from pdf_renamer.infrastructure.pdf.docling_extractor import DoclingPDFExtractor
5
- from pdf_renamer.infrastructure.pdf.pymupdf_extractor import PyMuPDFExtractor
6
-
7
- __all__ = ["CompositePDFExtractor", "DoclingPDFExtractor", "PyMuPDFExtractor"]
@@ -1,6 +0,0 @@
1
- """Presentation layer - CLI and user interaction."""
2
-
3
- from pdf_renamer.presentation.cli import app
4
- from pdf_renamer.presentation.formatters import ProgressDisplay
5
-
6
- __all__ = ["ProgressDisplay", "app"]