pdf-file-renamer 0.4.2__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pdf_renamer → pdf_file_renamer}/__init__.py +1 -1
- pdf_file_renamer/application/__init__.py +7 -0
- pdf_file_renamer/application/filename_service.py +172 -0
- {pdf_renamer → pdf_file_renamer}/application/pdf_rename_workflow.py +29 -4
- {pdf_renamer → pdf_file_renamer}/application/rename_service.py +1 -1
- {pdf_renamer → pdf_file_renamer}/domain/__init__.py +2 -2
- {pdf_renamer → pdf_file_renamer}/domain/models.py +29 -0
- {pdf_renamer → pdf_file_renamer}/domain/ports.py +18 -1
- {pdf_renamer → pdf_file_renamer}/infrastructure/__init__.py +1 -1
- pdf_file_renamer/infrastructure/doi/__init__.py +5 -0
- pdf_file_renamer/infrastructure/doi/pdf2doi_extractor.py +129 -0
- pdf_file_renamer/infrastructure/llm/__init__.py +5 -0
- {pdf_renamer → pdf_file_renamer}/infrastructure/llm/pydantic_ai_provider.py +2 -2
- pdf_file_renamer/infrastructure/pdf/__init__.py +7 -0
- {pdf_renamer → pdf_file_renamer}/infrastructure/pdf/composite.py +2 -2
- {pdf_renamer → pdf_file_renamer}/infrastructure/pdf/docling_extractor.py +2 -2
- {pdf_renamer → pdf_file_renamer}/infrastructure/pdf/pymupdf_extractor.py +2 -2
- {pdf_renamer → pdf_file_renamer}/main.py +1 -1
- pdf_file_renamer/presentation/__init__.py +6 -0
- {pdf_renamer → pdf_file_renamer}/presentation/cli.py +10 -5
- {pdf_renamer → pdf_file_renamer}/presentation/formatters.py +1 -1
- {pdf_file_renamer-0.4.2.dist-info → pdf_file_renamer-0.6.0.dist-info}/METADATA +50 -23
- pdf_file_renamer-0.6.0.dist-info/RECORD +27 -0
- {pdf_file_renamer-0.4.2.dist-info → pdf_file_renamer-0.6.0.dist-info}/WHEEL +1 -2
- pdf_file_renamer-0.6.0.dist-info/entry_points.txt +2 -0
- pdf_file_renamer-0.4.2.dist-info/RECORD +0 -26
- pdf_file_renamer-0.4.2.dist-info/entry_points.txt +0 -2
- pdf_file_renamer-0.4.2.dist-info/top_level.txt +0 -1
- pdf_renamer/application/__init__.py +0 -7
- pdf_renamer/application/filename_service.py +0 -70
- pdf_renamer/infrastructure/llm/__init__.py +0 -5
- pdf_renamer/infrastructure/pdf/__init__.py +0 -7
- pdf_renamer/presentation/__init__.py +0 -6
- {pdf_renamer → pdf_file_renamer}/infrastructure/config.py +0 -0
- {pdf_file_renamer-0.4.2.dist-info → pdf_file_renamer-0.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,7 @@
|
|
1
|
+
"""Application layer - use cases and business logic orchestration."""
|
2
|
+
|
3
|
+
from pdf_file_renamer.application.filename_service import FilenameService
|
4
|
+
from pdf_file_renamer.application.pdf_rename_workflow import PDFRenameWorkflow
|
5
|
+
from pdf_file_renamer.application.rename_service import RenameService
|
6
|
+
|
7
|
+
__all__ = ["FilenameService", "PDFRenameWorkflow", "RenameService"]
|
@@ -0,0 +1,172 @@
|
|
1
|
+
"""Filename generation service - coordinates PDF extraction and LLM generation."""
|
2
|
+
|
3
|
+
import re
|
4
|
+
|
5
|
+
from pdf_file_renamer.domain.models import ConfidenceLevel, FilenameResult, PDFContent
|
6
|
+
from pdf_file_renamer.domain.ports import FilenameGenerator, LLMProvider
|
7
|
+
|
8
|
+
|
9
|
+
class FilenameService(FilenameGenerator):
    """
    Service for generating filenames from PDF content.

    Filenames are built directly from DOI metadata when that metadata
    carries a title or an author; otherwise the injected LLM provider is
    asked for a suggestion. Every returned filename passes through
    ``sanitize``.
    """

    # Words skipped when picking key words out of a title. Kept at class
    # level so the set is built once instead of on every call.
    _STOP_WORDS = frozenset(
        {
            "a",
            "an",
            "the",
            "and",
            "or",
            "but",
            "in",
            "on",
            "at",
            "to",
            "for",
            "of",
            "with",
            "by",
            "from",
            "as",
            "is",
            "was",
            "are",
            "were",
            "been",
            "be",
            "this",
            "that",
            "these",
            "those",
        }
    )

    # Maximum number of characters kept by ``sanitize``.
    _MAX_FILENAME_LENGTH = 100

    def __init__(self, llm_provider: LLMProvider) -> None:
        """
        Initialize the filename service.

        Args:
            llm_provider: LLM provider for filename generation
        """
        self.llm_provider = llm_provider

    async def generate(self, original_filename: str, content: PDFContent) -> FilenameResult:
        """
        Generate a filename suggestion based on PDF content.

        Args:
            original_filename: Current filename
            content: Extracted PDF content

        Returns:
            FilenameResult with suggestion
        """
        doi_meta = content.doi_metadata

        # Only trust the DOI path when it can contribute something real.
        # A record with neither title nor author would otherwise yield the
        # placeholder name "Unknown-Document" labelled as very high
        # confidence, so such records fall through to the LLM instead.
        if doi_meta and (doi_meta.title or doi_meta.first_author):
            return self._generate_from_doi(content)

        # Fall back to LLM-based generation.
        result = await self.llm_provider.generate_filename(
            original_filename=original_filename,
            text_excerpt=content.text,
            metadata_dict=content.metadata.to_dict(),
        )

        # The LLM output is free text; make it filesystem-safe.
        result.filename = self.sanitize(result.filename)
        return result

    def _generate_from_doi(self, content: PDFContent) -> FilenameResult:
        """
        Generate filename directly from DOI metadata.

        Args:
            content: PDF content with DOI metadata

        Returns:
            FilenameResult with very high confidence

        Raises:
            ValueError: If ``content`` carries no DOI metadata
        """
        doi_meta = content.doi_metadata
        if not doi_meta:
            msg = "DOI metadata not available"
            raise ValueError(msg)

        # Missing pieces fall back to placeholders so a name is always built.
        author = doi_meta.first_author or "Unknown"
        title_words = self._extract_key_words(doi_meta.title or "Document")
        year = doi_meta.year or ""

        # Build filename: Author-KeyWords-Year (empty parts are left out).
        parts = [author]
        if title_words:
            parts.append(title_words)
        if year:
            parts.append(year)

        filename = self.sanitize("-".join(parts))

        return FilenameResult(
            filename=filename,
            confidence=ConfidenceLevel.VERY_HIGH,
            reasoning=f"Filename generated from DOI metadata (DOI: {doi_meta.doi}). "
            f"Author: {author}, Year: {year}",
        )

    def _extract_key_words(self, title: str, max_words: int = 6) -> str:
        """
        Extract key words from title, removing common words.

        Args:
            title: Paper title
            max_words: Maximum number of words to include

        Returns:
            Hyphenated key words, each with its first letter capitalized
        """
        # Lower-case the title and blank out everything except word
        # characters, whitespace and hyphens (so "state-of-the-art" stays
        # one token).
        tokens = re.sub(r"[^\w\s-]", " ", title.lower()).split()

        # Dashes used as punctuation ("Deep Learning --- A Survey") survive
        # the substitution above. Strip them from token edges so a bare
        # "---" neither counts as a word nor consumes a max_words slot.
        trimmed = (token.strip("-") for token in tokens)
        key_words = [
            word for word in trimmed if len(word) > 2 and word not in self._STOP_WORDS
        ]

        return "-".join(word.capitalize() for word in key_words[:max_words])

    def sanitize(self, filename: str) -> str:
        """
        Sanitize a filename to be filesystem-safe.

        Args:
            filename: Raw filename

        Returns:
            Sanitized filename (may be empty if nothing usable remains)
        """
        # Drop characters that are invalid in filenames.
        filename = re.sub(r'[<>:"/\\|?*]', "", filename)

        # Collapse every run of whitespace and/or hyphens into one hyphen.
        filename = re.sub(r"[\s\-]+", "-", filename)

        # Remove leading/trailing hyphens.
        filename = filename.strip("-")

        # Limit length; re-strip so truncation cannot leave a trailing hyphen.
        if len(filename) > self._MAX_FILENAME_LENGTH:
            filename = filename[: self._MAX_FILENAME_LENGTH].rstrip("-")

        return filename
|
@@ -1,11 +1,13 @@
|
|
1
1
|
"""PDF rename workflow - orchestrates the complete process."""
|
2
2
|
|
3
3
|
import asyncio
|
4
|
+
import contextlib
|
4
5
|
from collections.abc import Callable
|
5
6
|
from pathlib import Path
|
6
7
|
|
7
|
-
from
|
8
|
-
from
|
8
|
+
from pdf_file_renamer.domain.models import FileRenameOperation
|
9
|
+
from pdf_file_renamer.domain.ports import (
|
10
|
+
DOIExtractor,
|
9
11
|
FilenameGenerator,
|
10
12
|
FileRenamer,
|
11
13
|
PDFExtractor,
|
@@ -25,6 +27,7 @@ class PDFRenameWorkflow:
|
|
25
27
|
pdf_extractor: PDFExtractor,
|
26
28
|
filename_generator: FilenameGenerator,
|
27
29
|
file_renamer: FileRenamer,
|
30
|
+
doi_extractor: DOIExtractor | None = None,
|
28
31
|
max_concurrent_api: int = 3,
|
29
32
|
max_concurrent_pdf: int = 10,
|
30
33
|
) -> None:
|
@@ -35,12 +38,14 @@ class PDFRenameWorkflow:
|
|
35
38
|
pdf_extractor: PDF extraction service
|
36
39
|
filename_generator: Filename generation service
|
37
40
|
file_renamer: File renaming service
|
41
|
+
doi_extractor: Optional DOI extraction service
|
38
42
|
max_concurrent_api: Maximum concurrent API calls
|
39
43
|
max_concurrent_pdf: Maximum concurrent PDF extractions
|
40
44
|
"""
|
41
45
|
self.pdf_extractor = pdf_extractor
|
42
46
|
self.filename_generator = filename_generator
|
43
47
|
self.file_renamer = file_renamer
|
48
|
+
self.doi_extractor = doi_extractor
|
44
49
|
self.api_semaphore = asyncio.Semaphore(max_concurrent_api)
|
45
50
|
self.pdf_semaphore = asyncio.Semaphore(max_concurrent_pdf)
|
46
51
|
|
@@ -62,17 +67,36 @@ class PDFRenameWorkflow:
|
|
62
67
|
filename = pdf_path.name
|
63
68
|
|
64
69
|
try:
|
70
|
+
# Try DOI extraction first (if extractor available)
|
71
|
+
doi_metadata = None
|
72
|
+
if self.doi_extractor:
|
73
|
+
if status_callback:
|
74
|
+
status_callback(filename, {"status": "DOI Lookup", "stage": "🔍"})
|
75
|
+
|
76
|
+
# DOI extraction is optional, continue if it fails
|
77
|
+
with contextlib.suppress(Exception):
|
78
|
+
doi_metadata = await self.doi_extractor.extract_doi(pdf_path)
|
79
|
+
|
65
80
|
# Update status: extracting
|
66
81
|
if status_callback:
|
67
|
-
|
82
|
+
status = "Extracting" if not doi_metadata else "Extracting (DOI found)"
|
83
|
+
status_callback(filename, {"status": status, "stage": "📄"})
|
68
84
|
|
69
85
|
# Extract PDF content (with PDF semaphore to limit memory usage)
|
70
86
|
async with self.pdf_semaphore:
|
71
87
|
content = await self.pdf_extractor.extract(pdf_path)
|
72
88
|
|
89
|
+
# Attach DOI metadata to content if found
|
90
|
+
if doi_metadata:
|
91
|
+
# Create new content with DOI metadata
|
92
|
+
from dataclasses import replace
|
93
|
+
|
94
|
+
content = replace(content, doi_metadata=doi_metadata)
|
95
|
+
|
73
96
|
# Generate filename (with API semaphore to limit API load)
|
74
97
|
if status_callback:
|
75
|
-
|
98
|
+
status = "Analyzing" if not doi_metadata else "Formatting (DOI-based)"
|
99
|
+
status_callback(filename, {"status": status, "stage": "🤖"})
|
76
100
|
|
77
101
|
async with self.api_semaphore:
|
78
102
|
result = await self.filename_generator.generate(filename, content)
|
@@ -95,6 +119,7 @@ class PDFRenameWorkflow:
|
|
95
119
|
reasoning=result.reasoning,
|
96
120
|
text_excerpt=content.text,
|
97
121
|
metadata=content.metadata,
|
122
|
+
doi_metadata=content.doi_metadata,
|
98
123
|
)
|
99
124
|
|
100
125
|
except Exception as e:
|
@@ -1,12 +1,12 @@
|
|
1
1
|
"""Domain layer - pure business logic with no external dependencies."""
|
2
2
|
|
3
|
-
from
|
3
|
+
from pdf_file_renamer.domain.models import (
|
4
4
|
FilenameResult,
|
5
5
|
FileRenameOperation,
|
6
6
|
PDFContent,
|
7
7
|
PDFMetadata,
|
8
8
|
)
|
9
|
-
from
|
9
|
+
from pdf_file_renamer.domain.ports import (
|
10
10
|
FilenameGenerator,
|
11
11
|
FileRenamer,
|
12
12
|
LLMProvider,
|
@@ -10,12 +10,39 @@ from pydantic import BaseModel, Field
|
|
10
10
|
class ConfidenceLevel(str, Enum):
    """
    Confidence level for filename suggestions.

    The class mixes in ``str``, so every member is also a plain string
    equal to its lowercase value.
    """

    VERY_HIGH = "very_high"  # DOI-backed metadata
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
    ERROR = "error"
|
17
18
|
|
18
19
|
|
20
|
+
@dataclass(frozen=True)
|
21
|
+
class DOIMetadata:
|
22
|
+
"""Metadata extracted from DOI lookup."""
|
23
|
+
|
24
|
+
doi: str
|
25
|
+
title: str | None = None
|
26
|
+
authors: list[str] | None = None
|
27
|
+
year: str | None = None
|
28
|
+
journal: str | None = None
|
29
|
+
publisher: str | None = None
|
30
|
+
raw_bibtex: str | None = None
|
31
|
+
|
32
|
+
@property
|
33
|
+
def first_author(self) -> str | None:
|
34
|
+
"""Get the first author's last name."""
|
35
|
+
if not self.authors or len(self.authors) == 0:
|
36
|
+
return None
|
37
|
+
# Extract last name from first author (handles "Last, First" or "First Last" formats)
|
38
|
+
first = self.authors[0]
|
39
|
+
if "," in first:
|
40
|
+
return first.split(",")[0].strip()
|
41
|
+
# Assume last word is last name
|
42
|
+
parts = first.strip().split()
|
43
|
+
return parts[-1] if parts else None
|
44
|
+
|
45
|
+
|
19
46
|
class FilenameResult(BaseModel):
|
20
47
|
"""Result of filename generation."""
|
21
48
|
|
@@ -56,6 +83,7 @@ class PDFContent:
|
|
56
83
|
text: str
|
57
84
|
metadata: PDFMetadata
|
58
85
|
page_count: int
|
86
|
+
doi_metadata: DOIMetadata | None = None
|
59
87
|
|
60
88
|
|
61
89
|
@dataclass
|
@@ -68,6 +96,7 @@ class FileRenameOperation:
|
|
68
96
|
reasoning: str
|
69
97
|
text_excerpt: str
|
70
98
|
metadata: PDFMetadata
|
99
|
+
doi_metadata: DOIMetadata | None = None
|
71
100
|
|
72
101
|
@property
|
73
102
|
def new_filename(self) -> str:
|
@@ -3,7 +3,24 @@
|
|
3
3
|
from abc import ABC, abstractmethod
|
4
4
|
from pathlib import Path
|
5
5
|
|
6
|
-
from
|
6
|
+
from pdf_file_renamer.domain.models import DOIMetadata, FilenameResult, PDFContent
|
7
|
+
|
8
|
+
|
9
|
+
class DOIExtractor(ABC):
    """Abstract port for DOI extraction and metadata lookup."""

    @abstractmethod
    async def extract_doi(self, pdf_path: Path) -> DOIMetadata | None:
        """
        Find the DOI of a PDF and retrieve the metadata behind it.

        Args:
            pdf_path: Location of the PDF file to inspect

        Returns:
            A DOIMetadata record when a DOI was found and validated,
            otherwise None
        """
|
7
24
|
|
8
25
|
|
9
26
|
class PDFExtractor(ABC):
|
@@ -0,0 +1,129 @@
|
|
1
|
+
"""DOI extraction using pdf2doi library."""
|
2
|
+
|
3
|
+
import asyncio
|
4
|
+
import re
|
5
|
+
from pathlib import Path
|
6
|
+
|
7
|
+
import pdf2doi
|
8
|
+
|
9
|
+
from pdf_file_renamer.domain.models import DOIMetadata
|
10
|
+
from pdf_file_renamer.domain.ports import DOIExtractor
|
11
|
+
|
12
|
+
|
13
|
+
class PDF2DOIExtractor(DOIExtractor):
    """Extract DOI from PDF files using pdf2doi library."""

    def __init__(self) -> None:
        """Initialize the PDF2DOI extractor."""
        # Suppress pdf2doi verbose output
        pdf2doi.config.set("verbose", False)

    async def extract_doi(self, pdf_path: Path) -> DOIMetadata | None:
        """
        Extract DOI from PDF and fetch metadata.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            DOIMetadata if DOI found and validated, None otherwise
        """
        try:
            # pdf2doi.pdf2doi is a synchronous call; run it in the default
            # executor so the event loop is not blocked.
            loop = asyncio.get_running_loop()
            results = await loop.run_in_executor(None, pdf2doi.pdf2doi, str(pdf_path))

            # pdf2doi returns one dict for a file target and a list of dicts
            # for a directory target. Accept both shapes; indexing a dict
            # with [0] would raise and silently disable DOI extraction.
            if isinstance(results, dict):
                result = results
            elif results:
                result = results[0]
            else:
                return None

            # Check if an identifier was found
            identifier = result.get("identifier")
            if not identifier:
                return None

            identifier_type = result.get("identifier_type") or ""
            if identifier_type.lower() not in ("doi", "arxiv"):
                return None

            # NOTE(review): the helpers below treat validation_info as BibTeX
            # text - confirm this matches the output format of the pdf2doi
            # version in use.
            validation_info = result.get("validation_info", "")
            if validation_info and not isinstance(validation_info, str):
                # A truthy non-string value cannot be parsed; report no
                # metadata (same outcome as before, now without relying on a
                # swallowed TypeError).
                return None

            return DOIMetadata(
                doi=identifier,
                title=self._extract_bibtex_field(validation_info, "title"),
                authors=self._extract_bibtex_authors(validation_info),
                year=self._extract_bibtex_field(validation_info, "year"),
                journal=self._extract_bibtex_field(validation_info, "journal"),
                publisher=self._extract_bibtex_field(validation_info, "publisher"),
                raw_bibtex=validation_info if validation_info else None,
            )

        except Exception:
            # Silently fail - DOI extraction is opportunistic
            return None

    def _extract_bibtex_field(self, bibtex: str, field: str) -> str | None:
        """
        Extract a field from bibtex string.

        Args:
            bibtex: Bibtex string
            field: Field name to extract

        Returns:
            Field value (whitespace-stripped) or None
        """
        if not bibtex:
            return None

        # Match field = {value} or field = "value". The leading \b stops a
        # short name from matching the tail of a longer one ("title" inside
        # "booktitle"); re.escape keeps the field name literal.
        pattern = r"\b" + re.escape(field) + r"\s*=\s*[{\"](.*?)[}\"](,|\n|$)"
        match = re.search(pattern, bibtex, re.IGNORECASE)

        return match.group(1).strip() if match else None

    def _extract_bibtex_authors(self, bibtex: str) -> list[str] | None:
        """
        Extract authors from bibtex string.

        Args:
            bibtex: Bibtex string

        Returns:
            List of author names or None
        """
        # author = {Name1 and Name2 and Name3}; reuse the generic field
        # matcher instead of keeping a second copy of its regex.
        authors_str = self._extract_bibtex_field(bibtex, "author")
        if not authors_str:
            return None

        # Split by "and" and drop blank entries
        authors = [
            name.strip()
            for name in re.split(r"\s+and\s+", authors_str, flags=re.IGNORECASE)
            if name.strip()
        ]

        return authors or None
|
@@ -17,8 +17,8 @@ from tenacity import (
|
|
17
17
|
wait_exponential,
|
18
18
|
)
|
19
19
|
|
20
|
-
from
|
21
|
-
from
|
20
|
+
from pdf_file_renamer.domain.models import ConfidenceLevel, FilenameResult
|
21
|
+
from pdf_file_renamer.domain.ports import LLMProvider
|
22
22
|
|
23
23
|
# System prompt for filename generation
|
24
24
|
FILENAME_GENERATION_PROMPT = """You are an expert at creating concise, descriptive filenames for academic papers and technical documents.
|
@@ -0,0 +1,7 @@
|
|
1
|
+
"""PDF extraction implementations."""
|
2
|
+
|
3
|
+
from pdf_file_renamer.infrastructure.pdf.composite import CompositePDFExtractor
|
4
|
+
from pdf_file_renamer.infrastructure.pdf.docling_extractor import DoclingPDFExtractor
|
5
|
+
from pdf_file_renamer.infrastructure.pdf.pymupdf_extractor import PyMuPDFExtractor
|
6
|
+
|
7
|
+
__all__ = ["CompositePDFExtractor", "DoclingPDFExtractor", "PyMuPDFExtractor"]
|
@@ -2,8 +2,8 @@
|
|
2
2
|
|
3
3
|
from pathlib import Path
|
4
4
|
|
5
|
-
from
|
6
|
-
from
|
5
|
+
from pdf_file_renamer.domain.models import PDFContent
|
6
|
+
from pdf_file_renamer.domain.ports import PDFExtractor
|
7
7
|
|
8
8
|
|
9
9
|
class CompositePDFExtractor(PDFExtractor):
|
@@ -6,8 +6,8 @@ from pathlib import Path
|
|
6
6
|
from docling_core.types.doc.page import TextCellUnit
|
7
7
|
from docling_parse.pdf_parser import DoclingPdfParser
|
8
8
|
|
9
|
-
from
|
10
|
-
from
|
9
|
+
from pdf_file_renamer.domain.models import PDFContent, PDFMetadata
|
10
|
+
from pdf_file_renamer.domain.ports import PDFExtractor
|
11
11
|
|
12
12
|
|
13
13
|
class DoclingPDFExtractor(PDFExtractor):
|
@@ -5,8 +5,8 @@ from pathlib import Path
|
|
5
5
|
|
6
6
|
import pymupdf
|
7
7
|
|
8
|
-
from
|
9
|
-
from
|
8
|
+
from pdf_file_renamer.domain.models import PDFContent, PDFMetadata
|
9
|
+
from pdf_file_renamer.domain.ports import PDFExtractor
|
10
10
|
|
11
11
|
|
12
12
|
class PyMuPDFExtractor(PDFExtractor):
|
@@ -9,19 +9,20 @@ import typer
|
|
9
9
|
from rich.console import Console
|
10
10
|
from rich.live import Live
|
11
11
|
|
12
|
-
from
|
12
|
+
from pdf_file_renamer.application import (
|
13
13
|
FilenameService,
|
14
14
|
PDFRenameWorkflow,
|
15
15
|
RenameService,
|
16
16
|
)
|
17
|
-
from
|
18
|
-
from
|
19
|
-
from
|
17
|
+
from pdf_file_renamer.infrastructure.config import Settings
|
18
|
+
from pdf_file_renamer.infrastructure.doi import PDF2DOIExtractor
|
19
|
+
from pdf_file_renamer.infrastructure.llm import PydanticAIProvider
|
20
|
+
from pdf_file_renamer.infrastructure.pdf import (
|
20
21
|
CompositePDFExtractor,
|
21
22
|
DoclingPDFExtractor,
|
22
23
|
PyMuPDFExtractor,
|
23
24
|
)
|
24
|
-
from
|
25
|
+
from pdf_file_renamer.presentation.formatters import (
|
25
26
|
InteractivePrompt,
|
26
27
|
ProgressDisplay,
|
27
28
|
ResultsTable,
|
@@ -64,6 +65,9 @@ def create_workflow(settings: Settings) -> PDFRenameWorkflow:
|
|
64
65
|
retry_max_wait=settings.retry_max_wait,
|
65
66
|
)
|
66
67
|
|
68
|
+
# Create DOI extractor
|
69
|
+
doi_extractor = PDF2DOIExtractor()
|
70
|
+
|
67
71
|
# Create application services
|
68
72
|
filename_service = FilenameService(llm_provider)
|
69
73
|
file_renamer = RenameService()
|
@@ -73,6 +77,7 @@ def create_workflow(settings: Settings) -> PDFRenameWorkflow:
|
|
73
77
|
pdf_extractor=pdf_extractor,
|
74
78
|
filename_generator=filename_service,
|
75
79
|
file_renamer=file_renamer,
|
80
|
+
doi_extractor=doi_extractor,
|
76
81
|
max_concurrent_api=settings.max_concurrent_api,
|
77
82
|
max_concurrent_pdf=settings.max_concurrent_pdf,
|
78
83
|
)
|
@@ -1,28 +1,28 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: pdf-file-renamer
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.6.0
|
4
4
|
Summary: Intelligent PDF renaming using LLMs
|
5
|
-
Requires-Python: >=3.11
|
6
|
-
Description-Content-Type: text/markdown
|
7
5
|
License-File: LICENSE
|
8
|
-
Requires-
|
6
|
+
Requires-Python: >=3.11
|
7
|
+
Requires-Dist: docling-core>=2.0.0
|
8
|
+
Requires-Dist: docling-parse>=2.0.0
|
9
|
+
Requires-Dist: pdf2doi>=1.7
|
9
10
|
Requires-Dist: pydantic-ai>=1.0.17
|
10
11
|
Requires-Dist: pydantic-settings>=2.7.1
|
12
|
+
Requires-Dist: pydantic>=2.10.6
|
11
13
|
Requires-Dist: pymupdf>=1.26.5
|
12
|
-
Requires-Dist: docling-parse>=2.0.0
|
13
|
-
Requires-Dist: docling-core>=2.0.0
|
14
14
|
Requires-Dist: python-dotenv>=1.1.1
|
15
15
|
Requires-Dist: rich>=14.2.0
|
16
|
-
Requires-Dist: typer>=0.19.2
|
17
16
|
Requires-Dist: tenacity>=9.0.0
|
17
|
+
Requires-Dist: typer>=0.19.2
|
18
18
|
Provides-Extra: dev
|
19
|
-
Requires-Dist:
|
20
|
-
Requires-Dist: pytest-
|
21
|
-
Requires-Dist: pytest-
|
22
|
-
Requires-Dist: pytest-mock>=3.14.0; extra ==
|
23
|
-
Requires-Dist:
|
24
|
-
Requires-Dist:
|
25
|
-
|
19
|
+
Requires-Dist: mypy>=1.14.1; extra == 'dev'
|
20
|
+
Requires-Dist: pytest-asyncio>=0.25.2; extra == 'dev'
|
21
|
+
Requires-Dist: pytest-cov>=6.0.0; extra == 'dev'
|
22
|
+
Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
|
23
|
+
Requires-Dist: pytest>=8.3.4; extra == 'dev'
|
24
|
+
Requires-Dist: ruff>=0.9.1; extra == 'dev'
|
25
|
+
Description-Content-Type: text/markdown
|
26
26
|
|
27
27
|
# PDF Renamer
|
28
28
|
|
@@ -44,9 +44,11 @@ Intelligent PDF file renaming using LLMs. This tool analyzes PDF content and met
|
|
44
44
|
|
45
45
|
## Features
|
46
46
|
|
47
|
+
- **DOI-based naming** - Automatically extracts DOI and fetches authoritative metadata for academic papers
|
47
48
|
- **Advanced PDF parsing** using docling-parse for better structure-aware extraction
|
48
49
|
- **OCR fallback** for scanned PDFs with low text content
|
49
50
|
- **Smart LLM prompting** with multi-pass analysis for improved accuracy
|
51
|
+
- **Hybrid approach** - Uses DOI metadata when available, falls back to LLM analysis for other documents
|
50
52
|
- Suggests filenames in format: `Author-Topic-Year.pdf`
|
51
53
|
- Dry-run mode to preview changes before applying
|
52
54
|
- **Enhanced interactive mode** with options to accept, manually edit, retry, or skip each file
|
@@ -209,19 +211,44 @@ You can use interactive mode with `--dry-run` to preview without actually renami
|
|
209
211
|
|
210
212
|
## How It Works
|
211
213
|
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
214
|
+
### Intelligent Hybrid Approach
|
215
|
+
|
216
|
+
The tool uses a multi-strategy approach to generate accurate filenames:
|
217
|
+
|
218
|
+
1. **DOI Detection** (for academic papers)
|
219
|
+
- Searches PDF for DOI identifiers using [pdf2doi](https://github.com/MicheleCotrufo/pdf2doi)
|
220
|
+
- If found, queries authoritative metadata (title, authors, year, journal)
|
221
|
+
- Generates filename with **very high confidence** from validated metadata
|
222
|
+
- **Saves API costs** - no LLM call needed for papers with DOIs
|
223
|
+
|
224
|
+
2. **LLM Analysis** (fallback for non-academic PDFs)
|
225
|
+
- **Extract**: Uses docling-parse to read first 5 pages with structure-aware parsing, falls back to PyMuPDF if needed
|
226
|
+
- **OCR**: Automatically applies OCR for scanned PDFs with minimal text
|
227
|
+
- **Metadata Enhancement**: Extracts focused hints (years, emails, author sections) to supplement unreliable PDF metadata
|
228
|
+
- **Analyze**: Sends full content excerpt to LLM with enhanced metadata and detailed extraction instructions
|
229
|
+
- **Multi-pass Review**: Low-confidence results trigger a second analysis pass with focused prompts
|
230
|
+
- **Suggest**: LLM returns filename in `Author-Topic-Year` format with confidence level and reasoning
|
231
|
+
|
232
|
+
3. **Interactive Review** (optional): User can accept, edit, retry, or skip each suggestion
|
233
|
+
4. **Rename**: Applies suggestions (if not in dry-run mode)
|
234
|
+
|
235
|
+
### Benefits of DOI Integration
|
236
|
+
|
237
|
+
- **Accuracy**: DOI metadata is canonical and verified
|
238
|
+
- **Speed**: Instant lookup vs. LLM processing time
|
239
|
+
- **Cost**: Free DOI lookups save on API costs for academic papers
|
240
|
+
- **Reliability**: Works even when PDF text extraction is poor
|
220
241
|
|
221
242
|
## Cost Considerations
|
222
243
|
|
223
|
-
**
|
244
|
+
**DOI-based Naming (Academic Papers):**
|
245
|
+
- **Completely free** - No API costs
|
246
|
+
- **No LLM needed** - Direct metadata lookup
|
247
|
+
- Works for most academic papers with embedded DOIs
|
248
|
+
|
249
|
+
**OpenAI (Fallback):**
|
224
250
|
- Uses `gpt-4o-mini` by default (very cost-effective)
|
251
|
+
- Only called when DOI not found
|
225
252
|
- Processes first ~4500 characters per PDF
|
226
253
|
- Typical cost: ~$0.001-0.003 per PDF
|
227
254
|
|
@@ -0,0 +1,27 @@
|
|
1
|
+
pdf_file_renamer/__init__.py,sha256=yCEfy0jblhbUMNTSjJKPuW4zADXoI6IfICx8XvB4R8Q,85
|
2
|
+
pdf_file_renamer/main.py,sha256=FTEEb-9QmOOsN9SE8L1SZvFVIkVpQDy8xZ5a8t8CWUs,145
|
3
|
+
pdf_file_renamer/application/__init__.py,sha256=riSV7UXBenkDst7Nnf11N1_RuRtM7wpKdwugxOhumS4,363
|
4
|
+
pdf_file_renamer/application/filename_service.py,sha256=IbeCNBwyhFlCMCZveq16nmQ2qvyTdtgLmr6PDWPckOs,4868
|
5
|
+
pdf_file_renamer/application/pdf_rename_workflow.py,sha256=gd53KoR1aFrK__6TArm7Rtn1yNxylEI2ikmubDOByF4,5842
|
6
|
+
pdf_file_renamer/application/rename_service.py,sha256=vviNQolk_w-qDQvOKTKj8ZhqYyyNWL-VJMfuUnL6WLw,2357
|
7
|
+
pdf_file_renamer/domain/__init__.py,sha256=jxbH3h6xaCnSRuBxclFESl6ZE1pua_I1K4CRAaYxu_I,503
|
8
|
+
pdf_file_renamer/domain/models.py,sha256=QwN79TzWmqvQvz-m9ymebvAx3pWlVpSWXNdSEAk4qq0,3186
|
9
|
+
pdf_file_renamer/domain/ports.py,sha256=ebOcHptiOK119NCmIwM32_fbRK5xkZP9K67vjL-4k0g,2976
|
10
|
+
pdf_file_renamer/infrastructure/__init__.py,sha256=C3ZQ7WCPCa6PMfP00lu4wqb0r57GVyDdiD5EL2DhCeY,187
|
11
|
+
pdf_file_renamer/infrastructure/config.py,sha256=baNL5_6_NNiS50ZNdql7fDwQbeAwf6f58HGYIWFQxQQ,2464
|
12
|
+
pdf_file_renamer/infrastructure/doi/__init__.py,sha256=8N9ZEwfG7q5xomzh187YtP8t4CfEBHM334xNRblPeuI,153
|
13
|
+
pdf_file_renamer/infrastructure/doi/pdf2doi_extractor.py,sha256=rAo0q5HTCqVTyyIXzW3D6riGS5Q9xlXhbT2AY0Hb7nk,3820
|
14
|
+
pdf_file_renamer/infrastructure/llm/__init__.py,sha256=ToB8__mHvXwaIukGKPEAQ8SeC4ZLiH4auZI1P1yH5PQ,159
|
15
|
+
pdf_file_renamer/infrastructure/llm/pydantic_ai_provider.py,sha256=kVsmj0NIawkj-1WWM0hZXbsNH09GabVZm9HPlYsxGuo,9217
|
16
|
+
pdf_file_renamer/infrastructure/pdf/__init__.py,sha256=uMHqxSXNLZH5WH_e1kXrp9m7uTqPkiI2hXjNo6rCRoo,368
|
17
|
+
pdf_file_renamer/infrastructure/pdf/composite.py,sha256=dNrrcGTsGf1LLF4F0AoF7jRbvLkgRGnIF6XNGlg92n4,1801
|
18
|
+
pdf_file_renamer/infrastructure/pdf/docling_extractor.py,sha256=auZrJpK7mMg1mUXK6ptjZC1pnAUQje1h7ZAS7gFUBzo,3974
|
19
|
+
pdf_file_renamer/infrastructure/pdf/pymupdf_extractor.py,sha256=C61udZCqGqiVx7T0HWNyjvnhgv5AgMIcCYtrhgHOJwk,5465
|
20
|
+
pdf_file_renamer/presentation/__init__.py,sha256=1VR44GoPGTixk3hG5YzhGyQf7a4BTKsJBd2VP3rHcFM,211
|
21
|
+
pdf_file_renamer/presentation/cli.py,sha256=0t_59-utRWLNCYjFetU0ZHoF1DPTjdNiWM9Au0jFaOg,8013
|
22
|
+
pdf_file_renamer/presentation/formatters.py,sha256=Yl-Et7OKMfthyLqTA5qEtSAqh0PfHKp3lNNBA_dn01c,8519
|
23
|
+
pdf_file_renamer-0.6.0.dist-info/METADATA,sha256=6XmqT7jtJuqkWlks3FlPWCpNP_tKOGokhxy2Yju5R7k,9912
|
24
|
+
pdf_file_renamer-0.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
25
|
+
pdf_file_renamer-0.6.0.dist-info/entry_points.txt,sha256=0fEGYy60chGE9rECWeCVPxjxzz6vMtIAYdFvmH7xzbw,63
|
26
|
+
pdf_file_renamer-0.6.0.dist-info/licenses/LICENSE,sha256=_w08V08WgoMpDMlGNlkIatC5QfQ_Ds_rXOBM8pl7ffE,1068
|
27
|
+
pdf_file_renamer-0.6.0.dist-info/RECORD,,
|
@@ -1,26 +0,0 @@
|
|
1
|
-
pdf_file_renamer-0.4.2.dist-info/licenses/LICENSE,sha256=_w08V08WgoMpDMlGNlkIatC5QfQ_Ds_rXOBM8pl7ffE,1068
|
2
|
-
pdf_renamer/__init__.py,sha256=3RvsqaTO80Ud1KZZdLL_Lh-HXxagncoqI4m6u3VL_UE,85
|
3
|
-
pdf_renamer/main.py,sha256=5eTsrCQaotNwbdwJwandOlzrWODI73-L5mALHUIvqyw,140
|
4
|
-
pdf_renamer/application/__init__.py,sha256=EebV66jsZjubnh6PSEeNGs0A_JGeYXFghzGLDQ92eco,348
|
5
|
-
pdf_renamer/application/filename_service.py,sha256=Gk-nPnURsJYLDvoG_NZ4o_yHwAqK6bHU8kqzlev0XXM,2029
|
6
|
-
pdf_renamer/application/pdf_rename_workflow.py,sha256=MEUmDR6bLRB-ncNgKk3ahIfsIIk3Gsw1048cId6pYv4,4710
|
7
|
-
pdf_renamer/application/rename_service.py,sha256=rnScP2JwKMrIJcplFvxC0b2MOLzWqxpPKc3uDLHPjRI,2352
|
8
|
-
pdf_renamer/domain/__init__.py,sha256=UPcXunsI30iFK9dupv2Fc_YDreT1tAqsYaGEAK9sJew,493
|
9
|
-
pdf_renamer/domain/models.py,sha256=7S2ul3BoWi2aivWtmDa9LRlmeqURrGEV1sfSu8W6x5k,2246
|
10
|
-
pdf_renamer/domain/ports.py,sha256=ecnpkFYB3259ZjaZaOVo1sjP8nXD3x1NGR6hN5nn3gc,2550
|
11
|
-
pdf_renamer/infrastructure/__init__.py,sha256=CxBinDAuNm2X57-Y7XdXxVL6uHQXQqWpPrlznzu5_1M,182
|
12
|
-
pdf_renamer/infrastructure/config.py,sha256=baNL5_6_NNiS50ZNdql7fDwQbeAwf6f58HGYIWFQxQQ,2464
|
13
|
-
pdf_renamer/infrastructure/llm/__init__.py,sha256=evEhabaBshvekLO9DlAZvp-pQ_u03zYXqXaDfa9QUww,154
|
14
|
-
pdf_renamer/infrastructure/llm/pydantic_ai_provider.py,sha256=FM2Sd3n3lltJC76afrem5QuuS8qApEma52YD-Y8K89Y,9207
|
15
|
-
pdf_renamer/infrastructure/pdf/__init__.py,sha256=-WHYNLeBekm7jwIXRj4xpSIXyZz9olDiMIJLUjv2B-U,353
|
16
|
-
pdf_renamer/infrastructure/pdf/composite.py,sha256=1tlZ_X9_KVY01GTr1Hg3x_Ag7g3g4ik6_8R0jip8Wx0,1791
|
17
|
-
pdf_renamer/infrastructure/pdf/docling_extractor.py,sha256=7UamnbYFMgtD53oMqu1qKAq3FyQTQlq0Uw0k1sNzPw8,3964
|
18
|
-
pdf_renamer/infrastructure/pdf/pymupdf_extractor.py,sha256=lwIPr9yhy2hZVnuvoLcZvmjYSzbTra0AyW59UvU7GgU,5455
|
19
|
-
pdf_renamer/presentation/__init__.py,sha256=mxIxy8POUwewiMsmrOMVA8z9pe57lOghuwHZ5RAbMo4,201
|
20
|
-
pdf_renamer/presentation/cli.py,sha256=ykZx22quR9ye-ui9bLrRinD7BSChjSbGTRsazCafo5s,7819
|
21
|
-
pdf_renamer/presentation/formatters.py,sha256=ilUcXZ-7MpBlz7k7cqRAuixfkVT3cuD-pBcy5fsE2Qo,8514
|
22
|
-
pdf_file_renamer-0.4.2.dist-info/METADATA,sha256=xSIAQrGaKmT2o2vOT5HlX6ILaTmDyYbn6P8YG8JtK8U,8668
|
23
|
-
pdf_file_renamer-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
24
|
-
pdf_file_renamer-0.4.2.dist-info/entry_points.txt,sha256=IvW2oP2SRPv5qqFwDYBRCE53Q3JAyi_chbCo-0rdKQA,53
|
25
|
-
pdf_file_renamer-0.4.2.dist-info/top_level.txt,sha256=CFtpWKQjLObHZIssi5I3q7FXfLJZWKpHo7uuAiJ0pVY,12
|
26
|
-
pdf_file_renamer-0.4.2.dist-info/RECORD,,
|
@@ -1 +0,0 @@
|
|
1
|
-
pdf_renamer
|
@@ -1,7 +0,0 @@
|
|
1
|
-
"""Application layer - use cases and business logic orchestration."""
|
2
|
-
|
3
|
-
from pdf_renamer.application.filename_service import FilenameService
|
4
|
-
from pdf_renamer.application.pdf_rename_workflow import PDFRenameWorkflow
|
5
|
-
from pdf_renamer.application.rename_service import RenameService
|
6
|
-
|
7
|
-
__all__ = ["FilenameService", "PDFRenameWorkflow", "RenameService"]
|
@@ -1,70 +0,0 @@
|
|
1
|
-
"""Filename generation service - coordinates PDF extraction and LLM generation."""
|
2
|
-
|
3
|
-
import re
|
4
|
-
|
5
|
-
from pdf_renamer.domain.models import FilenameResult, PDFContent
|
6
|
-
from pdf_renamer.domain.ports import FilenameGenerator, LLMProvider
|
7
|
-
|
8
|
-
|
9
|
-
class FilenameService(FilenameGenerator):
|
10
|
-
"""Service for generating filenames from PDF content."""
|
11
|
-
|
12
|
-
def __init__(self, llm_provider: LLMProvider) -> None:
|
13
|
-
"""
|
14
|
-
Initialize the filename service.
|
15
|
-
|
16
|
-
Args:
|
17
|
-
llm_provider: LLM provider for filename generation
|
18
|
-
"""
|
19
|
-
self.llm_provider = llm_provider
|
20
|
-
|
21
|
-
async def generate(self, original_filename: str, content: PDFContent) -> FilenameResult:
|
22
|
-
"""
|
23
|
-
Generate a filename suggestion based on PDF content.
|
24
|
-
|
25
|
-
Args:
|
26
|
-
original_filename: Current filename
|
27
|
-
content: Extracted PDF content
|
28
|
-
|
29
|
-
Returns:
|
30
|
-
FilenameResult with suggestion
|
31
|
-
"""
|
32
|
-
# Convert metadata to dictionary
|
33
|
-
metadata_dict = content.metadata.to_dict()
|
34
|
-
|
35
|
-
# Generate filename using LLM
|
36
|
-
result = await self.llm_provider.generate_filename(
|
37
|
-
original_filename=original_filename,
|
38
|
-
text_excerpt=content.text,
|
39
|
-
metadata_dict=metadata_dict,
|
40
|
-
)
|
41
|
-
|
42
|
-
# Sanitize the generated filename
|
43
|
-
result.filename = self.sanitize(result.filename)
|
44
|
-
|
45
|
-
return result
|
46
|
-
|
47
|
-
def sanitize(self, filename: str) -> str:
|
48
|
-
"""
|
49
|
-
Sanitize a filename to be filesystem-safe.
|
50
|
-
|
51
|
-
Args:
|
52
|
-
filename: Raw filename
|
53
|
-
|
54
|
-
Returns:
|
55
|
-
Sanitized filename
|
56
|
-
"""
|
57
|
-
# Remove or replace invalid characters
|
58
|
-
filename = re.sub(r'[<>:"/\\|?*]', "", filename)
|
59
|
-
|
60
|
-
# Replace multiple spaces/hyphens with single hyphen
|
61
|
-
filename = re.sub(r"[\s\-]+", "-", filename)
|
62
|
-
|
63
|
-
# Remove leading/trailing hyphens
|
64
|
-
filename = filename.strip("-")
|
65
|
-
|
66
|
-
# Limit length
|
67
|
-
if len(filename) > 100:
|
68
|
-
filename = filename[:100].rstrip("-")
|
69
|
-
|
70
|
-
return filename
|
@@ -1,7 +0,0 @@
|
|
1
|
-
"""PDF extraction implementations."""
|
2
|
-
|
3
|
-
from pdf_renamer.infrastructure.pdf.composite import CompositePDFExtractor
|
4
|
-
from pdf_renamer.infrastructure.pdf.docling_extractor import DoclingPDFExtractor
|
5
|
-
from pdf_renamer.infrastructure.pdf.pymupdf_extractor import PyMuPDFExtractor
|
6
|
-
|
7
|
-
__all__ = ["CompositePDFExtractor", "DoclingPDFExtractor", "PyMuPDFExtractor"]
|
File without changes
|
File without changes
|