pdf-file-renamer 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf_file_renamer-0.4.2.dist-info/METADATA +245 -0
- pdf_file_renamer-0.4.2.dist-info/RECORD +26 -0
- pdf_file_renamer-0.4.2.dist-info/WHEEL +5 -0
- pdf_file_renamer-0.4.2.dist-info/entry_points.txt +2 -0
- pdf_file_renamer-0.4.2.dist-info/licenses/LICENSE +21 -0
- pdf_file_renamer-0.4.2.dist-info/top_level.txt +1 -0
- pdf_renamer/__init__.py +3 -0
- pdf_renamer/application/__init__.py +7 -0
- pdf_renamer/application/filename_service.py +70 -0
- pdf_renamer/application/pdf_rename_workflow.py +144 -0
- pdf_renamer/application/rename_service.py +79 -0
- pdf_renamer/domain/__init__.py +25 -0
- pdf_renamer/domain/models.py +80 -0
- pdf_renamer/domain/ports.py +106 -0
- pdf_renamer/infrastructure/__init__.py +5 -0
- pdf_renamer/infrastructure/config.py +94 -0
- pdf_renamer/infrastructure/llm/__init__.py +5 -0
- pdf_renamer/infrastructure/llm/pydantic_ai_provider.py +234 -0
- pdf_renamer/infrastructure/pdf/__init__.py +7 -0
- pdf_renamer/infrastructure/pdf/composite.py +57 -0
- pdf_renamer/infrastructure/pdf/docling_extractor.py +116 -0
- pdf_renamer/infrastructure/pdf/pymupdf_extractor.py +165 -0
- pdf_renamer/main.py +6 -0
- pdf_renamer/presentation/__init__.py +6 -0
- pdf_renamer/presentation/cli.py +233 -0
- pdf_renamer/presentation/formatters.py +216 -0
@@ -0,0 +1,245 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: pdf-file-renamer
|
3
|
+
Version: 0.4.2
|
4
|
+
Summary: Intelligent PDF renaming using LLMs
|
5
|
+
Requires-Python: >=3.11
|
6
|
+
Description-Content-Type: text/markdown
|
7
|
+
License-File: LICENSE
|
8
|
+
Requires-Dist: pydantic>=2.10.6
|
9
|
+
Requires-Dist: pydantic-ai>=1.0.17
|
10
|
+
Requires-Dist: pydantic-settings>=2.7.1
|
11
|
+
Requires-Dist: pymupdf>=1.26.5
|
12
|
+
Requires-Dist: docling-parse>=2.0.0
|
13
|
+
Requires-Dist: docling-core>=2.0.0
|
14
|
+
Requires-Dist: python-dotenv>=1.1.1
|
15
|
+
Requires-Dist: rich>=14.2.0
|
16
|
+
Requires-Dist: typer>=0.19.2
|
17
|
+
Requires-Dist: tenacity>=9.0.0
|
18
|
+
Provides-Extra: dev
|
19
|
+
Requires-Dist: pytest>=8.3.4; extra == "dev"
|
20
|
+
Requires-Dist: pytest-cov>=6.0.0; extra == "dev"
|
21
|
+
Requires-Dist: pytest-asyncio>=0.25.2; extra == "dev"
|
22
|
+
Requires-Dist: pytest-mock>=3.14.0; extra == "dev"
|
23
|
+
Requires-Dist: ruff>=0.9.1; extra == "dev"
|
24
|
+
Requires-Dist: mypy>=1.14.1; extra == "dev"
|
25
|
+
Dynamic: license-file
|
26
|
+
|
27
|
+
# PDF Renamer
|
28
|
+
|
29
|
+
[PyPI package](https://pypi.org/project/pdf-file-renamer/)
|
30
|
+
[Python 3.11+](https://www.python.org/downloads/)
|
31
|
+
[uv](https://docs.astral.sh/uv/)
|
32
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
33
|
+
[Pydantic AI](https://ai.pydantic.dev/)
|
34
|
+
[](https://github.com/nostoslabs/pdf-renamer)
|
35
|
+
|
36
|
+
[](https://github.com/nostoslabs/pdf-renamer)
|
37
|
+
[](https://github.com/astral-sh/ruff)
|
38
|
+
[mypy](http://mypy-lang.org/)
|
39
|
+
[](https://github.com/astral-sh/ruff)
|
40
|
+
|
41
|
+
Intelligent PDF file renaming using LLMs. This tool analyzes PDF content and metadata to suggest descriptive, standardized filenames.
|
42
|
+
|
43
|
+
> 🚀 Works with **OpenAI**, **Ollama**, **LM Studio**, and any OpenAI-compatible API
|
44
|
+
|
45
|
+
## Features
|
46
|
+
|
47
|
+
- **Advanced PDF parsing** using docling-parse for better structure-aware extraction
|
48
|
+
- **OCR fallback** for scanned PDFs with low text content
|
49
|
+
- **Smart LLM prompting** with multi-pass analysis for improved accuracy
|
50
|
+
- Suggests filenames in format: `Author-Topic-Year.pdf`
|
51
|
+
- Dry-run mode to preview changes before applying
|
52
|
+
- **Enhanced interactive mode** with options to accept, manually edit, retry, or skip each file
|
53
|
+
- **Live progress display** with concurrent processing for speed
|
54
|
+
- **Configurable concurrency** limits for API calls and PDF extraction
|
55
|
+
- Batch processing of multiple PDFs with optional output directory
|
56
|
+
|
57
|
+
## Installation
|
58
|
+
|
59
|
+
### Quick Start (No Installation Required)
|
60
|
+
|
61
|
+
```bash
|
62
|
+
# Run directly with uvx
|
63
|
+
uvx --from pdf-file-renamer pdf-renamer --dry-run /path/to/pdfs
|
64
|
+
```
|
65
|
+
|
66
|
+
### Install from PyPI
|
67
|
+
|
68
|
+
```bash
|
69
|
+
# Using pip
|
70
|
+
pip install pdf-file-renamer
|
71
|
+
|
72
|
+
# Using uv
|
73
|
+
uv pip install pdf-file-renamer
|
74
|
+
```
|
75
|
+
|
76
|
+
### Install from Source
|
77
|
+
|
78
|
+
```bash
|
79
|
+
# Clone and install
|
80
|
+
git clone https://github.com/nostoslabs/pdf-renamer.git
|
81
|
+
cd pdf-renamer
|
82
|
+
uv sync
|
83
|
+
```
|
84
|
+
|
85
|
+
## Configuration
|
86
|
+
|
87
|
+
Configure your LLM provider:
|
88
|
+
|
89
|
+
**Option A: OpenAI (Cloud)**
|
90
|
+
```bash
|
91
|
+
cp .env.example .env
|
92
|
+
# Edit .env and add your OPENAI_API_KEY
|
93
|
+
```
|
94
|
+
|
95
|
+
**Option B: Ollama or other local models**
|
96
|
+
```bash
|
97
|
+
# No API key needed for local models
|
98
|
+
# Either set LLM_BASE_URL in .env or use --url flag
|
99
|
+
echo "LLM_BASE_URL=http://patmos:11434/v1" > .env
|
100
|
+
```
|
101
|
+
|
102
|
+
## Usage
|
103
|
+
|
104
|
+
### Quick Start
|
105
|
+
|
106
|
+
```bash
|
107
|
+
# Preview renames (dry-run mode)
|
108
|
+
pdf-renamer --dry-run /path/to/pdf/directory
|
109
|
+
|
110
|
+
# Actually rename files
|
111
|
+
pdf-renamer --no-dry-run /path/to/pdf/directory
|
112
|
+
|
113
|
+
# Interactive mode - review each file
|
114
|
+
pdf-renamer --interactive --no-dry-run /path/to/pdf/directory
|
115
|
+
```
|
116
|
+
|
117
|
+
### Using uvx (No Installation)
|
118
|
+
|
119
|
+
```bash
|
120
|
+
# Run directly without installing
|
121
|
+
uvx --from pdf-file-renamer pdf-renamer --dry-run /path/to/pdfs
|
122
|
+
|
123
|
+
# Run from GitHub
|
124
|
+
uvx --from git+https://github.com/nostoslabs/pdf-renamer pdf-renamer --dry-run /path/to/pdfs
|
125
|
+
```
|
126
|
+
|
127
|
+
### Options
|
128
|
+
|
129
|
+
- `--dry-run/--no-dry-run`: Show suggestions without renaming (default: `--dry-run`; pass `--no-dry-run` to actually rename files)
|
130
|
+
- `--interactive, -i`: Interactive mode with rich options:
|
131
|
+
- **Accept** - Use the suggested filename
|
132
|
+
- **Edit** - Manually modify the filename
|
133
|
+
- **Retry** - Ask the LLM to generate a new suggestion
|
134
|
+
- **Skip** - Skip this file and move to the next
|
135
|
+
- `--model`: Model to use (default: llama3.2, works with any OpenAI-compatible API)
|
136
|
+
- `--url`: Custom base URL for OpenAI-compatible APIs (default: http://localhost:11434/v1)
|
137
|
+
- `--pattern`: Glob pattern for files (default: *.pdf)
|
138
|
+
- `--output-dir, -o`: Move renamed files to a different directory
|
139
|
+
- `--max-concurrent-api`: Maximum concurrent API calls (default: 3)
|
140
|
+
- `--max-concurrent-pdf`: Maximum concurrent PDF extractions (default: 10)
|
141
|
+
|
142
|
+
### Examples
|
143
|
+
|
144
|
+
**Using OpenAI:**
|
145
|
+
```bash
|
146
|
+
# Preview all PDFs in current directory
|
147
|
+
uvx --from pdf-file-renamer pdf-renamer --dry-run .
|
148
|
+
|
149
|
+
# Rename PDFs in specific directory
|
150
|
+
uvx --from pdf-file-renamer pdf-renamer --no-dry-run ~/Documents/Papers
|
151
|
+
|
152
|
+
# Use a different OpenAI model
|
153
|
+
uvx --from pdf-file-renamer pdf-renamer --model gpt-4o --dry-run .
|
154
|
+
```
|
155
|
+
|
156
|
+
**Using Ollama (or other local models):**
|
157
|
+
```bash
|
158
|
+
# Using Ollama on patmos server with gemma model
|
159
|
+
uvx --from pdf-file-renamer pdf-renamer --url http://patmos:11434/v1 --model gemma3:latest --dry-run .
|
160
|
+
|
161
|
+
# Using local Ollama with qwen model
|
162
|
+
uvx --from pdf-file-renamer pdf-renamer --url http://localhost:11434/v1 --model qwen2.5 --dry-run .
|
163
|
+
|
164
|
+
# Set URL in environment and just use model flag
|
165
|
+
export LLM_BASE_URL=http://patmos:11434/v1
|
166
|
+
uvx --from pdf-file-renamer pdf-renamer --model gemma3:latest --dry-run .
|
167
|
+
```
|
168
|
+
|
169
|
+
**Other examples:**
|
170
|
+
```bash
|
171
|
+
# Process only specific files
|
172
|
+
uvx --from pdf-file-renamer pdf-renamer --pattern "*2020*.pdf" --dry-run .
|
173
|
+
|
174
|
+
# Interactive mode with local model
|
175
|
+
uvx --from pdf-file-renamer pdf-renamer --url http://patmos:11434/v1 --model gemma3:latest --interactive --no-dry-run .
|
176
|
+
|
177
|
+
# Run directly from GitHub
|
178
|
+
uvx --from git+https://github.com/nostoslabs/pdf-renamer pdf-renamer --no-dry-run ~/Documents/Papers
|
179
|
+
```
|
180
|
+
|
181
|
+
## Interactive Mode
|
182
|
+
|
183
|
+
When using `--interactive` mode, you'll be presented with each file one at a time with detailed options:
|
184
|
+
|
185
|
+
```
|
186
|
+
================================================================================
|
187
|
+
Original: 2024-research-paper.pdf
|
188
|
+
Suggested: Smith-Machine-Learning-Applications-2024.pdf
|
189
|
+
Confidence: high
|
190
|
+
Reasoning: Clear author and topic identified from abstract
|
191
|
+
================================================================================
|
192
|
+
|
193
|
+
Options:
|
194
|
+
y / yes / Enter - Accept suggested name
|
195
|
+
e / edit - Manually edit the filename
|
196
|
+
r / retry - Ask LLM to generate a new suggestion
|
197
|
+
n / no / skip - Skip this file
|
198
|
+
|
199
|
+
What would you like to do? [y]:
|
200
|
+
```
|
201
|
+
|
202
|
+
This mode is perfect for:
|
203
|
+
- **Reviewing suggestions** before applying them
|
204
|
+
- **Fine-tuning filenames** that are close but not quite right
|
205
|
+
- **Retrying** when the LLM suggestion isn't good enough
|
206
|
+
- **Building confidence** in the tool before batch processing
|
207
|
+
|
208
|
+
You can use interactive mode with `--dry-run` to preview without actually renaming files, or with `--no-dry-run` to apply changes immediately after confirmation.
|
209
|
+
|
210
|
+
## How It Works
|
211
|
+
|
212
|
+
1. **Extract**: Uses docling-parse to read first 5 pages with structure-aware parsing, falls back to PyMuPDF if needed
|
213
|
+
2. **OCR**: Automatically applies OCR for scanned PDFs with minimal text
|
214
|
+
3. **Metadata Enhancement**: Extracts focused hints (years, emails, author sections) to supplement unreliable PDF metadata
|
215
|
+
4. **Analyze**: Sends full content excerpt to LLM with enhanced metadata and detailed extraction instructions
|
216
|
+
5. **Multi-pass Review**: Low-confidence results trigger a second analysis pass with focused prompts
|
217
|
+
6. **Suggest**: LLM returns filename in `Author-Topic-Year` format with confidence level and reasoning
|
218
|
+
7. **Interactive Review** (optional): User can accept, edit, retry, or skip each suggestion
|
219
|
+
8. **Rename**: Applies suggestions (if not in dry-run mode)
|
220
|
+
|
221
|
+
## Cost Considerations
|
222
|
+
|
223
|
+
**OpenAI:**
|
224
|
+
- `gpt-4o-mini` is a very cost-effective choice (select it with `--model gpt-4o-mini`; the `--model` default documented under Options is `llama3.2`)
|
225
|
+
- Processes first ~4500 characters per PDF
|
226
|
+
- Typical cost: ~$0.001-0.003 per PDF
|
227
|
+
|
228
|
+
**Ollama/Local Models:**
|
229
|
+
- Completely free (runs on your hardware)
|
230
|
+
- Works with any Ollama model (llama3, qwen2.5, mistral, etc.)
|
231
|
+
- Also compatible with LM Studio, vLLM, and other OpenAI-compatible endpoints
|
232
|
+
|
233
|
+
## Filename Format
|
234
|
+
|
235
|
+
The tool generates filenames in this format:
|
236
|
+
- `Smith-Kalman-Filtering-Applications-2020.pdf`
|
237
|
+
- `Adamy-Electronic-Warfare-Modeling-Techniques.pdf`
|
238
|
+
- `Blair-Monopulse-Processing-Unresolved-Targets.pdf`
|
239
|
+
|
240
|
+
Guidelines:
|
241
|
+
- First author's last name
|
242
|
+
- 3-6 word topic description (prioritizes clarity over brevity)
|
243
|
+
- Year (if identifiable)
|
244
|
+
- Hyphens between words
|
245
|
+
- Target ~80 characters (can be longer if needed for clarity)
|
@@ -0,0 +1,26 @@
|
|
1
|
+
pdf_file_renamer-0.4.2.dist-info/licenses/LICENSE,sha256=_w08V08WgoMpDMlGNlkIatC5QfQ_Ds_rXOBM8pl7ffE,1068
|
2
|
+
pdf_renamer/__init__.py,sha256=3RvsqaTO80Ud1KZZdLL_Lh-HXxagncoqI4m6u3VL_UE,85
|
3
|
+
pdf_renamer/main.py,sha256=5eTsrCQaotNwbdwJwandOlzrWODI73-L5mALHUIvqyw,140
|
4
|
+
pdf_renamer/application/__init__.py,sha256=EebV66jsZjubnh6PSEeNGs0A_JGeYXFghzGLDQ92eco,348
|
5
|
+
pdf_renamer/application/filename_service.py,sha256=Gk-nPnURsJYLDvoG_NZ4o_yHwAqK6bHU8kqzlev0XXM,2029
|
6
|
+
pdf_renamer/application/pdf_rename_workflow.py,sha256=MEUmDR6bLRB-ncNgKk3ahIfsIIk3Gsw1048cId6pYv4,4710
|
7
|
+
pdf_renamer/application/rename_service.py,sha256=rnScP2JwKMrIJcplFvxC0b2MOLzWqxpPKc3uDLHPjRI,2352
|
8
|
+
pdf_renamer/domain/__init__.py,sha256=UPcXunsI30iFK9dupv2Fc_YDreT1tAqsYaGEAK9sJew,493
|
9
|
+
pdf_renamer/domain/models.py,sha256=7S2ul3BoWi2aivWtmDa9LRlmeqURrGEV1sfSu8W6x5k,2246
|
10
|
+
pdf_renamer/domain/ports.py,sha256=ecnpkFYB3259ZjaZaOVo1sjP8nXD3x1NGR6hN5nn3gc,2550
|
11
|
+
pdf_renamer/infrastructure/__init__.py,sha256=CxBinDAuNm2X57-Y7XdXxVL6uHQXQqWpPrlznzu5_1M,182
|
12
|
+
pdf_renamer/infrastructure/config.py,sha256=baNL5_6_NNiS50ZNdql7fDwQbeAwf6f58HGYIWFQxQQ,2464
|
13
|
+
pdf_renamer/infrastructure/llm/__init__.py,sha256=evEhabaBshvekLO9DlAZvp-pQ_u03zYXqXaDfa9QUww,154
|
14
|
+
pdf_renamer/infrastructure/llm/pydantic_ai_provider.py,sha256=FM2Sd3n3lltJC76afrem5QuuS8qApEma52YD-Y8K89Y,9207
|
15
|
+
pdf_renamer/infrastructure/pdf/__init__.py,sha256=-WHYNLeBekm7jwIXRj4xpSIXyZz9olDiMIJLUjv2B-U,353
|
16
|
+
pdf_renamer/infrastructure/pdf/composite.py,sha256=1tlZ_X9_KVY01GTr1Hg3x_Ag7g3g4ik6_8R0jip8Wx0,1791
|
17
|
+
pdf_renamer/infrastructure/pdf/docling_extractor.py,sha256=7UamnbYFMgtD53oMqu1qKAq3FyQTQlq0Uw0k1sNzPw8,3964
|
18
|
+
pdf_renamer/infrastructure/pdf/pymupdf_extractor.py,sha256=lwIPr9yhy2hZVnuvoLcZvmjYSzbTra0AyW59UvU7GgU,5455
|
19
|
+
pdf_renamer/presentation/__init__.py,sha256=mxIxy8POUwewiMsmrOMVA8z9pe57lOghuwHZ5RAbMo4,201
|
20
|
+
pdf_renamer/presentation/cli.py,sha256=ykZx22quR9ye-ui9bLrRinD7BSChjSbGTRsazCafo5s,7819
|
21
|
+
pdf_renamer/presentation/formatters.py,sha256=ilUcXZ-7MpBlz7k7cqRAuixfkVT3cuD-pBcy5fsE2Qo,8514
|
22
|
+
pdf_file_renamer-0.4.2.dist-info/METADATA,sha256=xSIAQrGaKmT2o2vOT5HlX6ILaTmDyYbn6P8YG8JtK8U,8668
|
23
|
+
pdf_file_renamer-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
24
|
+
pdf_file_renamer-0.4.2.dist-info/entry_points.txt,sha256=IvW2oP2SRPv5qqFwDYBRCE53Q3JAyi_chbCo-0rdKQA,53
|
25
|
+
pdf_file_renamer-0.4.2.dist-info/top_level.txt,sha256=CFtpWKQjLObHZIssi5I3q7FXfLJZWKpHo7uuAiJ0pVY,12
|
26
|
+
pdf_file_renamer-0.4.2.dist-info/RECORD,,
|
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2025 Nostos Labs
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
@@ -0,0 +1 @@
|
|
1
|
+
pdf_renamer
|
pdf_renamer/__init__.py
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
"""Application layer - use cases and business logic orchestration."""
|
2
|
+
|
3
|
+
from pdf_renamer.application.filename_service import FilenameService
|
4
|
+
from pdf_renamer.application.pdf_rename_workflow import PDFRenameWorkflow
|
5
|
+
from pdf_renamer.application.rename_service import RenameService
|
6
|
+
|
7
|
+
__all__ = ["FilenameService", "PDFRenameWorkflow", "RenameService"]
|
@@ -0,0 +1,70 @@
|
|
1
|
+
"""Filename generation service - coordinates PDF extraction and LLM generation."""
|
2
|
+
|
3
|
+
import re
|
4
|
+
|
5
|
+
from pdf_renamer.domain.models import FilenameResult, PDFContent
|
6
|
+
from pdf_renamer.domain.ports import FilenameGenerator, LLMProvider
|
7
|
+
|
8
|
+
|
9
|
+
class FilenameService(FilenameGenerator):
|
10
|
+
"""Service for generating filenames from PDF content."""
|
11
|
+
|
12
|
+
def __init__(self, llm_provider: LLMProvider) -> None:
    """
    Create a filename service backed by the given LLM provider.

    Args:
        llm_provider: Provider whose ``generate_filename`` coroutine is
            awaited by ``generate`` to obtain suggestions
    """
    # Kept as a public attribute; ``generate`` reads it on every call.
    self.llm_provider = llm_provider
|
20
|
+
|
21
|
+
async def generate(self, original_filename: str, content: PDFContent) -> FilenameResult:
    """
    Ask the LLM provider for a filename suggestion and sanitize it.

    Args:
        original_filename: Name the PDF currently has
        content: Extracted PDF content; its ``text`` and ``metadata`` are
            forwarded to the provider

    Returns:
        The provider's result object, with ``filename`` replaced by the
        output of ``sanitize``
    """
    # The provider receives metadata as a plain dictionary.
    metadata_as_dict = content.metadata.to_dict()

    suggestion = await self.llm_provider.generate_filename(
        original_filename=original_filename,
        text_excerpt=content.text,
        metadata_dict=metadata_as_dict,
    )

    # The provider's result is updated in place rather than copied.
    suggestion.filename = self.sanitize(suggestion.filename)
    return suggestion
|
46
|
+
|
47
|
+
def sanitize(self, filename: str) -> str:
    """
    Sanitize a filename to be filesystem-safe.

    The input comes from an LLM, so it is treated as untrusted text:
    reserved characters and non-whitespace control characters are removed,
    runs of whitespace/hyphens collapse to a single hyphen, leading and
    trailing hyphens are stripped, and the result is capped at 100
    characters.

    Args:
        filename: Raw filename

    Returns:
        Sanitized filename (may be empty if nothing usable remains)
    """
    # Remove characters reserved on common filesystems, plus control
    # characters that are not whitespace. An embedded NUL in particular
    # makes Python's path-based OS calls raise ValueError. Whitespace
    # control characters (tab, newline, ...) are deliberately left for the
    # next step so they still become hyphens.
    filename = re.sub(r'[<>:"/\\|?*\x00-\x08\x0e-\x1b\x7f]', "", filename)

    # Replace runs of whitespace/hyphens with a single hyphen
    filename = re.sub(r"[\s\-]+", "-", filename)

    # Remove leading/trailing hyphens
    filename = filename.strip("-")

    # Limit length; truncation may expose a hyphen at the cut, so strip again
    if len(filename) > 100:
        filename = filename[:100].rstrip("-")

    return filename
|
@@ -0,0 +1,144 @@
|
|
1
|
+
"""PDF rename workflow - orchestrates the complete process."""
|
2
|
+
|
3
|
+
import asyncio
|
4
|
+
from collections.abc import Callable
|
5
|
+
from pathlib import Path
|
6
|
+
|
7
|
+
from pdf_renamer.domain.models import FileRenameOperation
|
8
|
+
from pdf_renamer.domain.ports import (
|
9
|
+
FilenameGenerator,
|
10
|
+
FileRenamer,
|
11
|
+
PDFExtractor,
|
12
|
+
)
|
13
|
+
|
14
|
+
|
15
|
+
class PDFRenameWorkflow:
|
16
|
+
"""
|
17
|
+
Orchestrates the PDF rename workflow.
|
18
|
+
|
19
|
+
This class follows the Single Responsibility Principle - it only coordinates
|
20
|
+
the workflow, delegating actual work to specialized services.
|
21
|
+
"""
|
22
|
+
|
23
|
+
def __init__(
    self,
    pdf_extractor: PDFExtractor,
    filename_generator: FilenameGenerator,
    file_renamer: FileRenamer,
    max_concurrent_api: int = 3,
    max_concurrent_pdf: int = 10,
) -> None:
    """
    Initialize the workflow.

    Args:
        pdf_extractor: PDF extraction service
        filename_generator: Filename generation service
        file_renamer: File renaming service
        max_concurrent_api: Maximum concurrent API calls (must be >= 1)
        max_concurrent_pdf: Maximum concurrent PDF extractions (must be >= 1)

    Raises:
        ValueError: If either concurrency limit is below 1
    """
    # A semaphore created with value 0 can never be acquired, so every
    # task would wait forever; reject that up front instead of hanging.
    limits = {
        "max_concurrent_api": max_concurrent_api,
        "max_concurrent_pdf": max_concurrent_pdf,
    }
    for option, limit in limits.items():
        if limit < 1:
            msg = f"{option} must be at least 1, got {limit}"
            raise ValueError(msg)

    self.pdf_extractor = pdf_extractor
    self.filename_generator = filename_generator
    self.file_renamer = file_renamer
    # Separate limits: one throttles LLM calls, the other PDF extraction.
    self.api_semaphore = asyncio.Semaphore(max_concurrent_api)
    self.pdf_semaphore = asyncio.Semaphore(max_concurrent_pdf)
|
46
|
+
|
47
|
+
async def process_pdf(
    self,
    pdf_path: Path,
    status_callback: Callable[[str, dict[str, str]], None] | None = None,
) -> FileRenameOperation | None:
    """
    Extract one PDF, ask for a filename suggestion, and package the result.

    Args:
        pdf_path: Path to PDF file
        status_callback: Optional callable invoked as
            ``status_callback(filename, status_dict)`` at each stage
            (Extracting, Analyzing, Complete, or Error)

    Returns:
        FileRenameOperation on success; None if any step raised an
        Exception (the error text is passed to the callback, if given)
    """
    source_name = pdf_path.name

    try:
        if status_callback:
            status_callback(source_name, {"status": "Extracting", "stage": "📄"})

        # Extraction is throttled by its own semaphore.
        async with self.pdf_semaphore:
            extracted = await self.pdf_extractor.extract(pdf_path)

        if status_callback:
            status_callback(source_name, {"status": "Analyzing", "stage": "🤖"})

        # Filename generation is throttled separately from extraction.
        async with self.api_semaphore:
            suggestion = await self.filename_generator.generate(source_name, extracted)

        if status_callback:
            completed = {
                "status": "Complete",
                "stage": "✓",
                "confidence": suggestion.confidence.value,
            }
            status_callback(source_name, completed)

        operation = FileRenameOperation(
            original_path=pdf_path,
            suggested_filename=suggestion.filename,
            confidence=suggestion.confidence,
            reasoning=suggestion.reasoning,
            text_excerpt=extracted.text,
            metadata=extracted.metadata,
        )

    except Exception as exc:
        # Failures are reported through the callback and signalled by None.
        if status_callback:
            status_callback(source_name, {"status": "Error", "stage": "✗", "error": str(exc)})
        return None

    return operation
|
104
|
+
|
105
|
+
async def process_batch(
    self,
    pdf_paths: list[Path],
    status_callback: Callable[[str, dict[str, str]], None] | None = None,
) -> list[FileRenameOperation | None]:
    """
    Run ``process_pdf`` for every path concurrently.

    Args:
        pdf_paths: List of PDF paths to process
        status_callback: Optional callback forwarded to each ``process_pdf`` call

    Returns:
        One entry per input path, in input order: a FileRenameOperation,
        or None where processing failed
    """
    # One coroutine per file; gather returns results in argument order.
    pending = (self.process_pdf(path, status_callback) for path in pdf_paths)
    return await asyncio.gather(*pending, return_exceptions=False)
|
122
|
+
|
123
|
+
async def execute_rename(
    self,
    operation: FileRenameOperation,
    output_dir: Path | None = None,
    dry_run: bool = True,
) -> bool:
    """
    Carry out a rename operation through the configured file renamer.

    Args:
        operation: The rename operation to execute
        output_dir: Optional output directory, passed to
            ``operation.create_new_path`` to build the destination
        dry_run: If True, don't actually rename

    Returns:
        Whatever the file renamer's ``rename`` coroutine returns
        (True on success)

    Raises:
        RuntimeError: If rename fails
    """
    destination = operation.create_new_path(output_dir)
    outcome = await self.file_renamer.rename(operation.original_path, destination, dry_run)
    return outcome
|
@@ -0,0 +1,79 @@
|
|
1
|
+
"""File rename service - handles the actual file operations."""
|
2
|
+
|
3
|
+
import shutil
|
4
|
+
from pathlib import Path
|
5
|
+
|
6
|
+
from pdf_renamer.domain.ports import FileRenamer
|
7
|
+
|
8
|
+
|
9
|
+
class RenameService(FileRenamer):
|
10
|
+
"""Service for renaming files with duplicate handling."""
|
11
|
+
|
12
|
+
async def rename(self, original_path: Path, new_path: Path, dry_run: bool = True) -> bool:
    """
    Rename a file with duplicate handling.

    If the destination already exists, ``_handle_duplicate`` picks a free
    name with a numeric suffix. If the destination is the source file
    itself, nothing is done and the call succeeds.

    Args:
        original_path: Original file path
        new_path: New file path
        dry_run: If True, don't actually rename

    Returns:
        True if successful (or would be successful in dry-run)

    Raises:
        RuntimeError: If the source is missing or the rename fails
    """
    try:
        # Check if source exists
        if not original_path.exists():
            msg = f"Source file does not exist: {original_path}"
            raise RuntimeError(msg)

        # The suggested name can equal the current one. Without this guard
        # the duplicate handler would see the source file itself as a
        # conflict and the file would be renamed to "<stem>-1<suffix>".
        if original_path.resolve() == new_path.resolve():
            return True

        # Handle duplicates
        final_path = self._handle_duplicate(new_path)

        if dry_run:
            # In dry-run, just verify we could do the operation
            return True

        # Perform the rename
        if original_path.parent != final_path.parent:
            # Moving to different directory: create it first
            final_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(original_path), str(final_path))
        else:
            # Renaming in same directory
            original_path.rename(final_path)

        return True

    except Exception as e:
        # Every failure, including the missing-source error above, is
        # surfaced to callers as a RuntimeError naming both paths.
        msg = f"Failed to rename {original_path} to {new_path}: {e}"
        raise RuntimeError(msg) from e
|
54
|
+
|
55
|
+
def _handle_duplicate(self, path: Path) -> Path:
|
56
|
+
"""
|
57
|
+
Handle duplicate filenames by adding a counter suffix.
|
58
|
+
|
59
|
+
Args:
|
60
|
+
path: Desired path
|
61
|
+
|
62
|
+
Returns:
|
63
|
+
Path that doesn't conflict with existing files
|
64
|
+
"""
|
65
|
+
if not path.exists():
|
66
|
+
return path
|
67
|
+
|
68
|
+
# Extract stem and suffix
|
69
|
+
stem = path.stem
|
70
|
+
suffix = path.suffix
|
71
|
+
parent = path.parent
|
72
|
+
|
73
|
+
# Try incrementing counter
|
74
|
+
counter = 1
|
75
|
+
while True:
|
76
|
+
new_path = parent / f"{stem}-{counter}{suffix}"
|
77
|
+
if not new_path.exists():
|
78
|
+
return new_path
|
79
|
+
counter += 1
|
@@ -0,0 +1,25 @@
|
|
1
|
+
"""Domain layer - pure business logic with no external dependencies."""
|
2
|
+
|
3
|
+
from pdf_renamer.domain.models import (
|
4
|
+
FilenameResult,
|
5
|
+
FileRenameOperation,
|
6
|
+
PDFContent,
|
7
|
+
PDFMetadata,
|
8
|
+
)
|
9
|
+
from pdf_renamer.domain.ports import (
|
10
|
+
FilenameGenerator,
|
11
|
+
FileRenamer,
|
12
|
+
LLMProvider,
|
13
|
+
PDFExtractor,
|
14
|
+
)
|
15
|
+
|
16
|
+
__all__ = [
|
17
|
+
"FileRenameOperation",
|
18
|
+
"FileRenamer",
|
19
|
+
"FilenameGenerator",
|
20
|
+
"FilenameResult",
|
21
|
+
"LLMProvider",
|
22
|
+
"PDFContent",
|
23
|
+
"PDFExtractor",
|
24
|
+
"PDFMetadata",
|
25
|
+
]
|