pdf-file-renamer 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf_file_renamer/__init__.py +1 -1
- pdf_file_renamer/application/filename_service.py +103 -1
- pdf_file_renamer/application/pdf_rename_workflow.py +35 -4
- pdf_file_renamer/domain/models.py +29 -0
- pdf_file_renamer/domain/ports.py +18 -1
- pdf_file_renamer/infrastructure/doi/__init__.py +5 -0
- pdf_file_renamer/infrastructure/doi/pdf2doi_extractor.py +163 -0
- pdf_file_renamer/presentation/cli.py +5 -0
- pdf_file_renamer/presentation/formatters.py +15 -3
- {pdf_file_renamer-0.5.0.dist-info → pdf_file_renamer-0.6.1.dist-info}/METADATA +38 -10
- {pdf_file_renamer-0.5.0.dist-info → pdf_file_renamer-0.6.1.dist-info}/RECORD +14 -12
- {pdf_file_renamer-0.5.0.dist-info → pdf_file_renamer-0.6.1.dist-info}/WHEEL +0 -0
- {pdf_file_renamer-0.5.0.dist-info → pdf_file_renamer-0.6.1.dist-info}/entry_points.txt +0 -0
- {pdf_file_renamer-0.5.0.dist-info → pdf_file_renamer-0.6.1.dist-info}/licenses/LICENSE +0 -0
pdf_file_renamer/__init__.py
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
import re
|
4
4
|
|
5
|
-
from pdf_file_renamer.domain.models import FilenameResult, PDFContent
|
5
|
+
from pdf_file_renamer.domain.models import ConfidenceLevel, FilenameResult, PDFContent
|
6
6
|
from pdf_file_renamer.domain.ports import FilenameGenerator, LLMProvider
|
7
7
|
|
8
8
|
|
@@ -29,6 +29,11 @@ class FilenameService(FilenameGenerator):
|
|
29
29
|
Returns:
|
30
30
|
FilenameResult with suggestion
|
31
31
|
"""
|
32
|
+
# If DOI metadata is available, use it directly for high-confidence naming
|
33
|
+
if content.doi_metadata:
|
34
|
+
return self._generate_from_doi(content)
|
35
|
+
|
36
|
+
# Otherwise, fall back to LLM-based generation
|
32
37
|
# Convert metadata to dictionary
|
33
38
|
metadata_dict = content.metadata.to_dict()
|
34
39
|
|
@@ -44,6 +49,103 @@ class FilenameService(FilenameGenerator):
|
|
44
49
|
|
45
50
|
return result
|
46
51
|
|
52
|
+
def _generate_from_doi(self, content: PDFContent) -> FilenameResult:
    """
    Build an ``Author-KeyWords-Year`` filename straight from DOI metadata.

    Args:
        content: PDF content carrying DOI metadata

    Returns:
        FilenameResult with very high confidence

    Raises:
        ValueError: If ``content`` carries no DOI metadata
    """
    meta = content.doi_metadata
    if not meta:
        msg = "DOI metadata not available"
        raise ValueError(msg)

    # Placeholders keep the name well-formed when a field is missing.
    author = meta.first_author or "Unknown"
    keywords = self._extract_key_words(meta.title or "Document")
    year = meta.year or ""

    # The author segment is always present; empty key words / year are omitted.
    segments = [author, *(piece for piece in (keywords, year) if piece)]
    filename = self.sanitize("-".join(segments))

    explanation = (
        f"Filename generated from DOI metadata (DOI: {meta.doi}). "
        f"Author: {author}, Year: {year}"
    )
    return FilenameResult(
        filename=filename,
        confidence=ConfidenceLevel.VERY_HIGH,
        reasoning=explanation,
    )
|
93
|
+
|
94
|
+
def _extract_key_words(self, title: str, max_words: int = 6) -> str:
|
95
|
+
"""
|
96
|
+
Extract key words from title, removing common words.
|
97
|
+
|
98
|
+
Args:
|
99
|
+
title: Paper title
|
100
|
+
max_words: Maximum number of words to include
|
101
|
+
|
102
|
+
Returns:
|
103
|
+
Hyphenated key words
|
104
|
+
"""
|
105
|
+
# Common words to skip
|
106
|
+
stop_words = {
|
107
|
+
"a",
|
108
|
+
"an",
|
109
|
+
"the",
|
110
|
+
"and",
|
111
|
+
"or",
|
112
|
+
"but",
|
113
|
+
"in",
|
114
|
+
"on",
|
115
|
+
"at",
|
116
|
+
"to",
|
117
|
+
"for",
|
118
|
+
"of",
|
119
|
+
"with",
|
120
|
+
"by",
|
121
|
+
"from",
|
122
|
+
"as",
|
123
|
+
"is",
|
124
|
+
"was",
|
125
|
+
"are",
|
126
|
+
"were",
|
127
|
+
"been",
|
128
|
+
"be",
|
129
|
+
"this",
|
130
|
+
"that",
|
131
|
+
"these",
|
132
|
+
"those",
|
133
|
+
}
|
134
|
+
|
135
|
+
# Clean and split title
|
136
|
+
words = re.sub(r"[^\w\s-]", " ", title.lower()).split()
|
137
|
+
|
138
|
+
# Filter stop words and keep significant words
|
139
|
+
key_words = [w for w in words if w not in stop_words and len(w) > 2]
|
140
|
+
|
141
|
+
# Limit to max_words
|
142
|
+
key_words = key_words[:max_words]
|
143
|
+
|
144
|
+
# Capitalize first letter of each word
|
145
|
+
key_words = [w.capitalize() for w in key_words]
|
146
|
+
|
147
|
+
return "-".join(key_words)
|
148
|
+
|
47
149
|
def sanitize(self, filename: str) -> str:
|
48
150
|
"""
|
49
151
|
Sanitize a filename to be filesystem-safe.
|
@@ -1,11 +1,13 @@
|
|
1
1
|
"""PDF rename workflow - orchestrates the complete process."""
|
2
2
|
|
3
3
|
import asyncio
|
4
|
+
import contextlib
|
4
5
|
from collections.abc import Callable
|
5
6
|
from pathlib import Path
|
6
7
|
|
7
|
-
from pdf_file_renamer.domain.models import FileRenameOperation
|
8
|
+
from pdf_file_renamer.domain.models import ConfidenceLevel, FileRenameOperation
|
8
9
|
from pdf_file_renamer.domain.ports import (
|
10
|
+
DOIExtractor,
|
9
11
|
FilenameGenerator,
|
10
12
|
FileRenamer,
|
11
13
|
PDFExtractor,
|
@@ -25,6 +27,7 @@ class PDFRenameWorkflow:
|
|
25
27
|
pdf_extractor: PDFExtractor,
|
26
28
|
filename_generator: FilenameGenerator,
|
27
29
|
file_renamer: FileRenamer,
|
30
|
+
doi_extractor: DOIExtractor | None = None,
|
28
31
|
max_concurrent_api: int = 3,
|
29
32
|
max_concurrent_pdf: int = 10,
|
30
33
|
) -> None:
|
@@ -35,12 +38,14 @@ class PDFRenameWorkflow:
|
|
35
38
|
pdf_extractor: PDF extraction service
|
36
39
|
filename_generator: Filename generation service
|
37
40
|
file_renamer: File renaming service
|
41
|
+
doi_extractor: Optional DOI extraction service
|
38
42
|
max_concurrent_api: Maximum concurrent API calls
|
39
43
|
max_concurrent_pdf: Maximum concurrent PDF extractions
|
40
44
|
"""
|
41
45
|
self.pdf_extractor = pdf_extractor
|
42
46
|
self.filename_generator = filename_generator
|
43
47
|
self.file_renamer = file_renamer
|
48
|
+
self.doi_extractor = doi_extractor
|
44
49
|
self.api_semaphore = asyncio.Semaphore(max_concurrent_api)
|
45
50
|
self.pdf_semaphore = asyncio.Semaphore(max_concurrent_pdf)
|
46
51
|
|
@@ -62,29 +67,54 @@ class PDFRenameWorkflow:
|
|
62
67
|
filename = pdf_path.name
|
63
68
|
|
64
69
|
try:
|
70
|
+
# Try DOI extraction first (if extractor available)
|
71
|
+
doi_metadata = None
|
72
|
+
if self.doi_extractor:
|
73
|
+
if status_callback:
|
74
|
+
status_callback(filename, {"status": "DOI Lookup", "stage": "🔍"})
|
75
|
+
|
76
|
+
# DOI extraction is optional, continue if it fails
|
77
|
+
with contextlib.suppress(Exception):
|
78
|
+
doi_metadata = await self.doi_extractor.extract_doi(pdf_path)
|
79
|
+
|
65
80
|
# Update status: extracting
|
66
81
|
if status_callback:
|
67
|
-
|
82
|
+
status = "Extracting" if not doi_metadata else "Extracting (DOI found)"
|
83
|
+
status_callback(filename, {"status": status, "stage": "📄"})
|
68
84
|
|
69
85
|
# Extract PDF content (with PDF semaphore to limit memory usage)
|
70
86
|
async with self.pdf_semaphore:
|
71
87
|
content = await self.pdf_extractor.extract(pdf_path)
|
72
88
|
|
89
|
+
# Attach DOI metadata to content if found
|
90
|
+
if doi_metadata:
|
91
|
+
# Create new content with DOI metadata
|
92
|
+
from dataclasses import replace
|
93
|
+
|
94
|
+
content = replace(content, doi_metadata=doi_metadata)
|
95
|
+
|
73
96
|
# Generate filename (with API semaphore to limit API load)
|
74
97
|
if status_callback:
|
75
|
-
|
98
|
+
status = "Analyzing" if not doi_metadata else "Formatting (DOI-based)"
|
99
|
+
status_callback(filename, {"status": status, "stage": "🤖"})
|
76
100
|
|
77
101
|
async with self.api_semaphore:
|
78
102
|
result = await self.filename_generator.generate(filename, content)
|
79
103
|
|
80
104
|
# Mark complete
|
81
105
|
if status_callback:
|
106
|
+
# result.confidence may already be a plain string (use_enum_values=True) or still a ConfidenceLevel; normalize to str
|
107
|
+
confidence_str = (
|
108
|
+
result.confidence.value
|
109
|
+
if isinstance(result.confidence, ConfidenceLevel)
|
110
|
+
else result.confidence
|
111
|
+
)
|
82
112
|
status_callback(
|
83
113
|
filename,
|
84
114
|
{
|
85
115
|
"status": "Complete",
|
86
116
|
"stage": "✓",
|
87
|
-
"confidence":
|
117
|
+
"confidence": confidence_str,
|
88
118
|
},
|
89
119
|
)
|
90
120
|
|
@@ -95,6 +125,7 @@ class PDFRenameWorkflow:
|
|
95
125
|
reasoning=result.reasoning,
|
96
126
|
text_excerpt=content.text,
|
97
127
|
metadata=content.metadata,
|
128
|
+
doi_metadata=content.doi_metadata,
|
98
129
|
)
|
99
130
|
|
100
131
|
except Exception as e:
|
@@ -10,12 +10,39 @@ from pydantic import BaseModel, Field
|
|
10
10
|
class ConfidenceLevel(str, Enum):
|
11
11
|
"""Confidence level for filename suggestions."""
|
12
12
|
|
13
|
+
VERY_HIGH = "very_high" # DOI-backed metadata
|
13
14
|
HIGH = "high"
|
14
15
|
MEDIUM = "medium"
|
15
16
|
LOW = "low"
|
16
17
|
ERROR = "error"
|
17
18
|
|
18
19
|
|
20
|
+
@dataclass(frozen=True)
|
21
|
+
class DOIMetadata:
|
22
|
+
"""Metadata extracted from DOI lookup."""
|
23
|
+
|
24
|
+
doi: str
|
25
|
+
title: str | None = None
|
26
|
+
authors: list[str] | None = None
|
27
|
+
year: str | None = None
|
28
|
+
journal: str | None = None
|
29
|
+
publisher: str | None = None
|
30
|
+
raw_bibtex: str | None = None
|
31
|
+
|
32
|
+
@property
|
33
|
+
def first_author(self) -> str | None:
|
34
|
+
"""Get the first author's last name."""
|
35
|
+
if not self.authors or len(self.authors) == 0:
|
36
|
+
return None
|
37
|
+
# Extract last name from first author (handles "Last, First" or "First Last" formats)
|
38
|
+
first = self.authors[0]
|
39
|
+
if "," in first:
|
40
|
+
return first.split(",")[0].strip()
|
41
|
+
# Assume last word is last name
|
42
|
+
parts = first.strip().split()
|
43
|
+
return parts[-1] if parts else None
|
44
|
+
|
45
|
+
|
19
46
|
class FilenameResult(BaseModel):
|
20
47
|
"""Result of filename generation."""
|
21
48
|
|
@@ -56,6 +83,7 @@ class PDFContent:
|
|
56
83
|
text: str
|
57
84
|
metadata: PDFMetadata
|
58
85
|
page_count: int
|
86
|
+
doi_metadata: DOIMetadata | None = None
|
59
87
|
|
60
88
|
|
61
89
|
@dataclass
|
@@ -68,6 +96,7 @@ class FileRenameOperation:
|
|
68
96
|
reasoning: str
|
69
97
|
text_excerpt: str
|
70
98
|
metadata: PDFMetadata
|
99
|
+
doi_metadata: DOIMetadata | None = None
|
71
100
|
|
72
101
|
@property
|
73
102
|
def new_filename(self) -> str:
|
pdf_file_renamer/domain/ports.py
CHANGED
@@ -3,7 +3,24 @@
|
|
3
3
|
from abc import ABC, abstractmethod
|
4
4
|
from pathlib import Path
|
5
5
|
|
6
|
-
from pdf_file_renamer.domain.models import FilenameResult, PDFContent
|
6
|
+
from pdf_file_renamer.domain.models import DOIMetadata, FilenameResult, PDFContent
|
7
|
+
|
8
|
+
|
9
|
+
class DOIExtractor(ABC):
    """Port for services that find a DOI in a PDF and look up its metadata."""

    @abstractmethod
    async def extract_doi(self, pdf_path: Path) -> DOIMetadata | None:
        """
        Locate a DOI in the given PDF and retrieve the metadata for it.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            DOIMetadata if DOI found and validated, None otherwise
        """
        ...
|
7
24
|
|
8
25
|
|
9
26
|
class PDFExtractor(ABC):
|
@@ -0,0 +1,163 @@
|
|
1
|
+
"""DOI extraction using pdf2doi library."""
|
2
|
+
|
3
|
+
import asyncio
|
4
|
+
import re
|
5
|
+
from pathlib import Path
|
6
|
+
|
7
|
+
import pdf2doi
|
8
|
+
|
9
|
+
from pdf_file_renamer.domain.models import DOIMetadata
|
10
|
+
from pdf_file_renamer.domain.ports import DOIExtractor
|
11
|
+
|
12
|
+
|
13
|
+
class PDF2DOIExtractor(DOIExtractor):
    """Extract DOI from PDF files using pdf2doi library."""

    # Metadata date fields consulted, in priority order, for the publication year.
    _YEAR_FIELDS = ("published-online", "published", "created")

    def __init__(self) -> None:
        """Initialize the PDF2DOI extractor."""
        # Suppress pdf2doi verbose output
        pdf2doi.config.set("verbose", False)

    async def extract_doi(self, pdf_path: Path) -> DOIMetadata | None:
        """
        Extract DOI from PDF and fetch metadata.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            DOIMetadata if a DOI/arXiv identifier was found and at least a
            title or an author could be read from its metadata, None otherwise
            (including when pdf2doi itself raises).
        """
        try:
            # pdf2doi is synchronous; run it in the default executor so the
            # event loop is not blocked. We are inside a coroutine, so the
            # running loop is the right one to ask for.
            loop = asyncio.get_running_loop()
            result = await loop.run_in_executor(None, pdf2doi.pdf2doi, str(pdf_path))
            return self._build_metadata(result)
        except Exception:
            # Silently fail - DOI extraction is opportunistic
            return None

    def _build_metadata(self, result: object) -> DOIMetadata | None:
        """Turn a pdf2doi result into DOIMetadata, or None if unusable."""
        # pdf2doi returns a dict (not a list)
        if not result or not isinstance(result, dict):
            return None

        # Check if DOI was found
        identifier = result.get("identifier")
        if not identifier:
            return None

        identifier_type = result.get("identifier_type") or ""
        if str(identifier_type).lower() not in ("doi", "arxiv"):
            return None

        # validation_info is expected to be a JSON string describing the
        # identifier (NOTE(review): exact schema depends on pdf2doi - confirm).
        validation_info = result.get("validation_info", "")
        metadata = self._parse_validation_info(validation_info)

        title = self._first_text(metadata.get("title"))
        authors = self._parse_authors(metadata.get("author"))

        # Without a title or an author the identifier cannot produce a
        # meaningful filename; report "nothing found" so callers fall back to
        # their non-DOI path instead of naming the file from placeholders.
        if title is None and authors is None:
            return None

        return DOIMetadata(
            doi=identifier,
            title=title,
            authors=authors,
            year=self._parse_year(metadata),
            journal=self._first_text(metadata.get("container-title")),
            publisher=self._first_text(metadata.get("publisher")),
            raw_bibtex=validation_info or None,
        )

    @staticmethod
    def _parse_validation_info(validation_info: object) -> dict:
        """Decode validation_info into a dict; return {} if it is not a JSON object."""
        if not validation_info or not isinstance(validation_info, str):
            return {}

        import json

        try:
            parsed = json.loads(validation_info)
        except json.JSONDecodeError:
            return {}
        return parsed if isinstance(parsed, dict) else {}

    @staticmethod
    def _first_text(value: object) -> str | None:
        """Return value as non-empty stripped text, unwrapping a list to its first item."""
        if isinstance(value, (list, tuple)):
            value = value[0] if value else None
        if isinstance(value, str):
            return value.strip() or None
        return None

    @staticmethod
    def _parse_authors(raw_authors: object) -> list[str] | None:
        """Build "Given Family" names from a list of author dicts; None if none found."""
        if not isinstance(raw_authors, list):
            return None

        names = []
        for entry in raw_authors:
            if not isinstance(entry, dict):
                continue
            family = entry.get("family") or ""
            if not family:
                # An author without a family name cannot be used for naming.
                continue
            given = entry.get("given") or ""
            names.append(f"{given} {family}".strip() if given else family)
        return names or None

    @classmethod
    def _parse_year(cls, metadata: dict) -> str | None:
        """Return the year from the first date field with a usable date-parts entry."""
        for date_field in cls._YEAR_FIELDS:
            entry = metadata.get(date_field)
            if not isinstance(entry, dict):
                continue
            try:
                year = entry["date-parts"][0][0]
            except (KeyError, IndexError, TypeError):
                continue
            # A null/empty year must not become the literal string "None".
            if year is None or year == "":
                continue
            return str(year)
        return None

    def _extract_bibtex_field(self, bibtex: str, field: str) -> str | None:
        """
        Extract a field from bibtex string.

        Not called by any other method of this class.

        Args:
            bibtex: Bibtex string
            field: Field name to extract (matched literally, case-insensitively)

        Returns:
            Field value or None
        """
        if not bibtex:
            return None

        # Escape the name so regex metacharacters in it are matched literally.
        escaped_field = re.escape(field)

        # Match field = {value} or field = "value"
        pattern = rf"{escaped_field}\s*=\s*[{{\"](.*?)[\}}\"](,|\n|$)"
        match = re.search(pattern, bibtex, re.IGNORECASE)

        if match:
            return match.group(1).strip()

        return None

    def _extract_bibtex_authors(self, bibtex: str) -> list[str] | None:
        """
        Extract authors from bibtex string.

        Not called by any other method of this class.

        Args:
            bibtex: Bibtex string

        Returns:
            List of author names or None
        """
        if not bibtex:
            return None

        # Match author = {Name1 and Name2 and Name3}
        pattern = r"author\s*=\s*[{\"](.*?)[\}\"](,|\n|$)"
        match = re.search(pattern, bibtex, re.IGNORECASE)

        if not match:
            return None

        authors_str = match.group(1).strip()

        # Split by "and" and clean up
        authors = [
            author.strip()
            for author in re.split(r"\s+and\s+", authors_str, flags=re.IGNORECASE)
            if author.strip()
        ]

        return authors if authors else None
|
@@ -15,6 +15,7 @@ from pdf_file_renamer.application import (
|
|
15
15
|
RenameService,
|
16
16
|
)
|
17
17
|
from pdf_file_renamer.infrastructure.config import Settings
|
18
|
+
from pdf_file_renamer.infrastructure.doi import PDF2DOIExtractor
|
18
19
|
from pdf_file_renamer.infrastructure.llm import PydanticAIProvider
|
19
20
|
from pdf_file_renamer.infrastructure.pdf import (
|
20
21
|
CompositePDFExtractor,
|
@@ -64,6 +65,9 @@ def create_workflow(settings: Settings) -> PDFRenameWorkflow:
|
|
64
65
|
retry_max_wait=settings.retry_max_wait,
|
65
66
|
)
|
66
67
|
|
68
|
+
# Create DOI extractor
|
69
|
+
doi_extractor = PDF2DOIExtractor()
|
70
|
+
|
67
71
|
# Create application services
|
68
72
|
filename_service = FilenameService(llm_provider)
|
69
73
|
file_renamer = RenameService()
|
@@ -73,6 +77,7 @@ def create_workflow(settings: Settings) -> PDFRenameWorkflow:
|
|
73
77
|
pdf_extractor=pdf_extractor,
|
74
78
|
filename_generator=filename_service,
|
75
79
|
file_renamer=file_renamer,
|
80
|
+
doi_extractor=doi_extractor,
|
76
81
|
max_concurrent_api=settings.max_concurrent_api,
|
77
82
|
max_concurrent_pdf=settings.max_concurrent_pdf,
|
78
83
|
)
|
@@ -7,7 +7,7 @@ from rich.prompt import Prompt
|
|
7
7
|
from rich.table import Table
|
8
8
|
from rich.text import Text
|
9
9
|
|
10
|
-
from pdf_file_renamer.domain.models import FileRenameOperation
|
10
|
+
from pdf_file_renamer.domain.models import ConfidenceLevel, FileRenameOperation
|
11
11
|
|
12
12
|
|
13
13
|
class ProgressDisplay:
|
@@ -146,7 +146,13 @@ class InteractivePrompt:
|
|
146
146
|
info_text.append("Suggested: ", style="bold green")
|
147
147
|
info_text.append(f"{operation.new_filename}\n", style="green")
|
148
148
|
info_text.append("Confidence: ", style="bold yellow")
|
149
|
-
|
149
|
+
# Handle both enum and string confidence
|
150
|
+
conf_str = (
|
151
|
+
operation.confidence.value
|
152
|
+
if isinstance(operation.confidence, ConfidenceLevel)
|
153
|
+
else operation.confidence
|
154
|
+
)
|
155
|
+
info_text.append(f"{conf_str}\n", style="yellow")
|
150
156
|
info_text.append("Reasoning: ", style="bold white")
|
151
157
|
info_text.append(operation.reasoning, style="dim white")
|
152
158
|
|
@@ -206,10 +212,16 @@ class ResultsTable:
|
|
206
212
|
reasoning = op.reasoning
|
207
213
|
if len(reasoning) > 100:
|
208
214
|
reasoning = reasoning[:100] + "..."
|
215
|
+
# Handle both enum and string confidence
|
216
|
+
conf_str = (
|
217
|
+
op.confidence.value
|
218
|
+
if isinstance(op.confidence, ConfidenceLevel)
|
219
|
+
else op.confidence
|
220
|
+
)
|
209
221
|
table.add_row(
|
210
222
|
op.original_path.name,
|
211
223
|
op.new_filename,
|
212
|
-
|
224
|
+
conf_str,
|
213
225
|
reasoning,
|
214
226
|
)
|
215
227
|
|
@@ -1,11 +1,12 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: pdf-file-renamer
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.6.1
|
4
4
|
Summary: Intelligent PDF renaming using LLMs
|
5
5
|
License-File: LICENSE
|
6
6
|
Requires-Python: >=3.11
|
7
7
|
Requires-Dist: docling-core>=2.0.0
|
8
8
|
Requires-Dist: docling-parse>=2.0.0
|
9
|
+
Requires-Dist: pdf2doi>=1.7
|
9
10
|
Requires-Dist: pydantic-ai>=1.0.17
|
10
11
|
Requires-Dist: pydantic-settings>=2.7.1
|
11
12
|
Requires-Dist: pydantic>=2.10.6
|
@@ -43,9 +44,11 @@ Intelligent PDF file renaming using LLMs. This tool analyzes PDF content and met
|
|
43
44
|
|
44
45
|
## Features
|
45
46
|
|
47
|
+
- **DOI-based naming** - Automatically extracts DOI and fetches authoritative metadata for academic papers
|
46
48
|
- **Advanced PDF parsing** using docling-parse for better structure-aware extraction
|
47
49
|
- **OCR fallback** for scanned PDFs with low text content
|
48
50
|
- **Smart LLM prompting** with multi-pass analysis for improved accuracy
|
51
|
+
- **Hybrid approach** - Uses DOI metadata when available, falls back to LLM analysis for other documents
|
49
52
|
- Suggests filenames in format: `Author-Topic-Year.pdf`
|
50
53
|
- Dry-run mode to preview changes before applying
|
51
54
|
- **Enhanced interactive mode** with options to accept, manually edit, retry, or skip each file
|
@@ -208,19 +211,44 @@ You can use interactive mode with `--dry-run` to preview without actually renami
|
|
208
211
|
|
209
212
|
## How It Works
|
210
213
|
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
214
|
+
### Intelligent Hybrid Approach
|
215
|
+
|
216
|
+
The tool uses a multi-strategy approach to generate accurate filenames:
|
217
|
+
|
218
|
+
1. **DOI Detection** (for academic papers)
|
219
|
+
- Searches PDF for DOI identifiers using [pdf2doi](https://github.com/MicheleCotrufo/pdf2doi)
|
220
|
+
- If found, queries authoritative metadata (title, authors, year, journal)
|
221
|
+
- Generates filename with **very high confidence** from validated metadata
|
222
|
+
- **Saves API costs** - no LLM call needed for papers with DOIs
|
223
|
+
|
224
|
+
2. **LLM Analysis** (fallback for any PDF where no DOI metadata is found)
|
225
|
+
- **Extract**: Uses docling-parse to read first 5 pages with structure-aware parsing, falls back to PyMuPDF if needed
|
226
|
+
- **OCR**: Automatically applies OCR for scanned PDFs with minimal text
|
227
|
+
- **Metadata Enhancement**: Extracts focused hints (years, emails, author sections) to supplement unreliable PDF metadata
|
228
|
+
- **Analyze**: Sends full content excerpt to LLM with enhanced metadata and detailed extraction instructions
|
229
|
+
- **Multi-pass Review**: Low-confidence results trigger a second analysis pass with focused prompts
|
230
|
+
- **Suggest**: LLM returns filename in `Author-Topic-Year` format with confidence level and reasoning
|
231
|
+
|
232
|
+
3. **Interactive Review** (optional): User can accept, edit, retry, or skip each suggestion
|
233
|
+
4. **Rename**: Applies suggestions (if not in dry-run mode)
|
234
|
+
|
235
|
+
### Benefits of DOI Integration
|
236
|
+
|
237
|
+
- **Accuracy**: DOI metadata is canonical and verified
|
238
|
+
- **Speed**: Instant lookup vs. LLM processing time
|
239
|
+
- **Cost**: Free DOI lookups save on API costs for academic papers
|
240
|
+
- **Reliability**: Works even when PDF text extraction is poor
|
219
241
|
|
220
242
|
## Cost Considerations
|
221
243
|
|
222
|
-
**
|
244
|
+
**DOI-based Naming (Academic Papers):**
|
245
|
+
- **Completely free** - No API costs
|
246
|
+
- **No LLM needed** - Direct metadata lookup
|
247
|
+
- Works for most academic papers with embedded DOIs
|
248
|
+
|
249
|
+
**OpenAI (Fallback):**
|
223
250
|
- Uses `gpt-4o-mini` by default (very cost-effective)
|
251
|
+
- Only called when DOI not found
|
224
252
|
- Processes first ~4500 characters per PDF
|
225
253
|
- Typical cost: ~$0.001-0.003 per PDF
|
226
254
|
|
@@ -1,14 +1,16 @@
|
|
1
|
-
pdf_file_renamer/__init__.py,sha256
|
1
|
+
pdf_file_renamer/__init__.py,sha256=1hyyq0EM6vqGG8Gxxdkg3MuLU_4Mwj3mc812ikutUB8,85
|
2
2
|
pdf_file_renamer/main.py,sha256=FTEEb-9QmOOsN9SE8L1SZvFVIkVpQDy8xZ5a8t8CWUs,145
|
3
3
|
pdf_file_renamer/application/__init__.py,sha256=riSV7UXBenkDst7Nnf11N1_RuRtM7wpKdwugxOhumS4,363
|
4
|
-
pdf_file_renamer/application/filename_service.py,sha256=
|
5
|
-
pdf_file_renamer/application/pdf_rename_workflow.py,sha256=
|
4
|
+
pdf_file_renamer/application/filename_service.py,sha256=IbeCNBwyhFlCMCZveq16nmQ2qvyTdtgLmr6PDWPckOs,4868
|
5
|
+
pdf_file_renamer/application/pdf_rename_workflow.py,sha256=WLcGJ4ufEmAnGSxVQcOFDeGG8gXSccs11DaP521YDzo,6144
|
6
6
|
pdf_file_renamer/application/rename_service.py,sha256=vviNQolk_w-qDQvOKTKj8ZhqYyyNWL-VJMfuUnL6WLw,2357
|
7
7
|
pdf_file_renamer/domain/__init__.py,sha256=jxbH3h6xaCnSRuBxclFESl6ZE1pua_I1K4CRAaYxu_I,503
|
8
|
-
pdf_file_renamer/domain/models.py,sha256=
|
9
|
-
pdf_file_renamer/domain/ports.py,sha256=
|
8
|
+
pdf_file_renamer/domain/models.py,sha256=QwN79TzWmqvQvz-m9ymebvAx3pWlVpSWXNdSEAk4qq0,3186
|
9
|
+
pdf_file_renamer/domain/ports.py,sha256=ebOcHptiOK119NCmIwM32_fbRK5xkZP9K67vjL-4k0g,2976
|
10
10
|
pdf_file_renamer/infrastructure/__init__.py,sha256=C3ZQ7WCPCa6PMfP00lu4wqb0r57GVyDdiD5EL2DhCeY,187
|
11
11
|
pdf_file_renamer/infrastructure/config.py,sha256=baNL5_6_NNiS50ZNdql7fDwQbeAwf6f58HGYIWFQxQQ,2464
|
12
|
+
pdf_file_renamer/infrastructure/doi/__init__.py,sha256=8N9ZEwfG7q5xomzh187YtP8t4CfEBHM334xNRblPeuI,153
|
13
|
+
pdf_file_renamer/infrastructure/doi/pdf2doi_extractor.py,sha256=mK2Z5oOwN-TgiEHLgoLM5yCSe_-G9kWXLr4Sw3nMkEM,5105
|
12
14
|
pdf_file_renamer/infrastructure/llm/__init__.py,sha256=ToB8__mHvXwaIukGKPEAQ8SeC4ZLiH4auZI1P1yH5PQ,159
|
13
15
|
pdf_file_renamer/infrastructure/llm/pydantic_ai_provider.py,sha256=kVsmj0NIawkj-1WWM0hZXbsNH09GabVZm9HPlYsxGuo,9217
|
14
16
|
pdf_file_renamer/infrastructure/pdf/__init__.py,sha256=uMHqxSXNLZH5WH_e1kXrp9m7uTqPkiI2hXjNo6rCRoo,368
|
@@ -16,10 +18,10 @@ pdf_file_renamer/infrastructure/pdf/composite.py,sha256=dNrrcGTsGf1LLF4F0AoF7jRb
|
|
16
18
|
pdf_file_renamer/infrastructure/pdf/docling_extractor.py,sha256=auZrJpK7mMg1mUXK6ptjZC1pnAUQje1h7ZAS7gFUBzo,3974
|
17
19
|
pdf_file_renamer/infrastructure/pdf/pymupdf_extractor.py,sha256=C61udZCqGqiVx7T0HWNyjvnhgv5AgMIcCYtrhgHOJwk,5465
|
18
20
|
pdf_file_renamer/presentation/__init__.py,sha256=1VR44GoPGTixk3hG5YzhGyQf7a4BTKsJBd2VP3rHcFM,211
|
19
|
-
pdf_file_renamer/presentation/cli.py,sha256=
|
20
|
-
pdf_file_renamer/presentation/formatters.py,sha256=
|
21
|
-
pdf_file_renamer-0.
|
22
|
-
pdf_file_renamer-0.
|
23
|
-
pdf_file_renamer-0.
|
24
|
-
pdf_file_renamer-0.
|
25
|
-
pdf_file_renamer-0.
|
21
|
+
pdf_file_renamer/presentation/cli.py,sha256=0t_59-utRWLNCYjFetU0ZHoF1DPTjdNiWM9Au0jFaOg,8013
|
22
|
+
pdf_file_renamer/presentation/formatters.py,sha256=Es7pZoHw5bEPtNfa_s43eHXa_m0yrTmX6S2aU78JUE0,8978
|
23
|
+
pdf_file_renamer-0.6.1.dist-info/METADATA,sha256=OyZKW601xnQFXR-SDLakLEnasq5rtfP7YO6IYn6f-z4,9912
|
24
|
+
pdf_file_renamer-0.6.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
25
|
+
pdf_file_renamer-0.6.1.dist-info/entry_points.txt,sha256=0fEGYy60chGE9rECWeCVPxjxzz6vMtIAYdFvmH7xzbw,63
|
26
|
+
pdf_file_renamer-0.6.1.dist-info/licenses/LICENSE,sha256=_w08V08WgoMpDMlGNlkIatC5QfQ_Ds_rXOBM8pl7ffE,1068
|
27
|
+
pdf_file_renamer-0.6.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|