pdf-file-renamer 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf_file_renamer/__init__.py +1 -1
- pdf_file_renamer/application/pdf_rename_workflow.py +8 -2
- pdf_file_renamer/infrastructure/doi/pdf2doi_extractor.py +45 -14
- pdf_file_renamer/presentation/formatters.py +13 -3
- pdf_file_renamer-0.6.2.dist-info/METADATA +443 -0
- {pdf_file_renamer-0.6.0.dist-info → pdf_file_renamer-0.6.2.dist-info}/RECORD +9 -9
- pdf_file_renamer-0.6.0.dist-info/METADATA +0 -272
- {pdf_file_renamer-0.6.0.dist-info → pdf_file_renamer-0.6.2.dist-info}/WHEEL +0 -0
- {pdf_file_renamer-0.6.0.dist-info → pdf_file_renamer-0.6.2.dist-info}/entry_points.txt +0 -0
- {pdf_file_renamer-0.6.0.dist-info → pdf_file_renamer-0.6.2.dist-info}/licenses/LICENSE +0 -0
pdf_file_renamer/__init__.py
CHANGED
@@ -5,7 +5,7 @@ import contextlib
|
|
5
5
|
from collections.abc import Callable
|
6
6
|
from pathlib import Path
|
7
7
|
|
8
|
-
from pdf_file_renamer.domain.models import FileRenameOperation
|
8
|
+
from pdf_file_renamer.domain.models import ConfidenceLevel, FileRenameOperation
|
9
9
|
from pdf_file_renamer.domain.ports import (
|
10
10
|
DOIExtractor,
|
11
11
|
FilenameGenerator,
|
@@ -103,12 +103,18 @@ class PDFRenameWorkflow:
|
|
103
103
|
|
104
104
|
# Mark complete
|
105
105
|
if status_callback:
|
106
|
+
# result.confidence may already be a plain string (use_enum_values=True); handle both enum and str
|
107
|
+
confidence_str = (
|
108
|
+
result.confidence.value
|
109
|
+
if isinstance(result.confidence, ConfidenceLevel)
|
110
|
+
else result.confidence
|
111
|
+
)
|
106
112
|
status_callback(
|
107
113
|
filename,
|
108
114
|
{
|
109
115
|
"status": "Complete",
|
110
116
|
"stage": "✓",
|
111
|
-
"confidence":
|
117
|
+
"confidence": confidence_str,
|
112
118
|
},
|
113
119
|
)
|
114
120
|
|
@@ -1,6 +1,7 @@
|
|
1
1
|
"""DOI extraction using pdf2doi library."""
|
2
2
|
|
3
3
|
import asyncio
|
4
|
+
import contextlib
|
4
5
|
import re
|
5
6
|
from pathlib import Path
|
6
7
|
|
@@ -31,16 +32,12 @@ class PDF2DOIExtractor(DOIExtractor):
|
|
31
32
|
try:
|
32
33
|
# Run pdf2doi in executor to avoid blocking
|
33
34
|
loop = asyncio.get_event_loop()
|
34
|
-
|
35
|
-
None, pdf2doi.pdf2doi, str(pdf_path)
|
36
|
-
)
|
35
|
+
result = await loop.run_in_executor(None, pdf2doi.pdf2doi, str(pdf_path))
|
37
36
|
|
38
|
-
|
37
|
+
# pdf2doi returns a single dict when given one file (a list only for directory targets)
|
38
|
+
if not result or not isinstance(result, dict):
|
39
39
|
return None
|
40
40
|
|
41
|
-
# Get the first result
|
42
|
-
result = results[0]
|
43
|
-
|
44
41
|
# Check if DOI was found
|
45
42
|
identifier = result.get("identifier")
|
46
43
|
if not identifier:
|
@@ -50,15 +47,49 @@ class PDF2DOIExtractor(DOIExtractor):
|
|
50
47
|
if identifier_type.lower() not in ("doi", "arxiv"):
|
51
48
|
return None
|
52
49
|
|
53
|
-
# Extract metadata from validation_info (
|
50
|
+
# Extract metadata from validation_info (JSON string from CrossRef API)
|
54
51
|
validation_info = result.get("validation_info", "")
|
55
52
|
|
56
|
-
# Parse
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
53
|
+
# Parse JSON metadata
|
54
|
+
import json
|
55
|
+
|
56
|
+
metadata = {}
|
57
|
+
if validation_info:
|
58
|
+
with contextlib.suppress(json.JSONDecodeError):
|
59
|
+
metadata = json.loads(validation_info)
|
60
|
+
|
61
|
+
# Extract title
|
62
|
+
title = metadata.get("title")
|
63
|
+
|
64
|
+
# Extract authors (list of dicts with 'given' and 'family' fields)
|
65
|
+
authors: list[str] | None = None
|
66
|
+
if "author" in metadata:
|
67
|
+
author_list = metadata["author"]
|
68
|
+
author_names: list[str] = []
|
69
|
+
for author in author_list:
|
70
|
+
if isinstance(author, dict):
|
71
|
+
family = author.get("family", "")
|
72
|
+
given = author.get("given", "")
|
73
|
+
if family:
|
74
|
+
full_name = f"{given} {family}".strip() if given else family
|
75
|
+
author_names.append(full_name)
|
76
|
+
if author_names:
|
77
|
+
authors = author_names
|
78
|
+
|
79
|
+
# Extract year from the first available of published-online, published, or created
|
80
|
+
year = None
|
81
|
+
for date_field in ["published-online", "published", "created"]:
|
82
|
+
if date_field in metadata and "date-parts" in metadata[date_field]:
|
83
|
+
date_parts = metadata[date_field]["date-parts"]
|
84
|
+
if date_parts and len(date_parts) > 0 and len(date_parts[0]) > 0:
|
85
|
+
year = str(date_parts[0][0])
|
86
|
+
break
|
87
|
+
|
88
|
+
# Extract journal (container-title)
|
89
|
+
journal = metadata.get("container-title")
|
90
|
+
|
91
|
+
# Extract publisher
|
92
|
+
publisher = metadata.get("publisher")
|
62
93
|
|
63
94
|
return DOIMetadata(
|
64
95
|
doi=identifier,
|
@@ -7,7 +7,7 @@ from rich.prompt import Prompt
|
|
7
7
|
from rich.table import Table
|
8
8
|
from rich.text import Text
|
9
9
|
|
10
|
-
from pdf_file_renamer.domain.models import FileRenameOperation
|
10
|
+
from pdf_file_renamer.domain.models import ConfidenceLevel, FileRenameOperation
|
11
11
|
|
12
12
|
|
13
13
|
class ProgressDisplay:
|
@@ -146,7 +146,13 @@ class InteractivePrompt:
|
|
146
146
|
info_text.append("Suggested: ", style="bold green")
|
147
147
|
info_text.append(f"{operation.new_filename}\n", style="green")
|
148
148
|
info_text.append("Confidence: ", style="bold yellow")
|
149
|
-
|
149
|
+
# Handle both enum and string confidence
|
150
|
+
conf_str = (
|
151
|
+
operation.confidence.value
|
152
|
+
if isinstance(operation.confidence, ConfidenceLevel)
|
153
|
+
else operation.confidence
|
154
|
+
)
|
155
|
+
info_text.append(f"{conf_str}\n", style="yellow")
|
150
156
|
info_text.append("Reasoning: ", style="bold white")
|
151
157
|
info_text.append(operation.reasoning, style="dim white")
|
152
158
|
|
@@ -206,10 +212,14 @@ class ResultsTable:
|
|
206
212
|
reasoning = op.reasoning
|
207
213
|
if len(reasoning) > 100:
|
208
214
|
reasoning = reasoning[:100] + "..."
|
215
|
+
# Handle both enum and string confidence
|
216
|
+
conf_str = (
|
217
|
+
op.confidence.value if isinstance(op.confidence, ConfidenceLevel) else op.confidence
|
218
|
+
)
|
209
219
|
table.add_row(
|
210
220
|
op.original_path.name,
|
211
221
|
op.new_filename,
|
212
|
-
|
222
|
+
conf_str,
|
213
223
|
reasoning,
|
214
224
|
)
|
215
225
|
|
@@ -0,0 +1,443 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: pdf-file-renamer
|
3
|
+
Version: 0.6.2
|
4
|
+
Summary: Intelligent PDF renaming using LLMs with DOI-based naming and interactive workflow
|
5
|
+
Project-URL: Homepage, https://github.com/nostoslabs/pdf-renamer
|
6
|
+
Project-URL: Repository, https://github.com/nostoslabs/pdf-renamer
|
7
|
+
Project-URL: Issues, https://github.com/nostoslabs/pdf-renamer/issues
|
8
|
+
Project-URL: Changelog, https://github.com/nostoslabs/pdf-renamer/blob/main/CHANGELOG.md
|
9
|
+
Author-email: Nostos Labs <info@nostoslabs.com>
|
10
|
+
License: MIT
|
11
|
+
License-File: LICENSE
|
12
|
+
Keywords: academic-papers,ai,automation,document-management,doi,file-organization,llm,pdf,rename
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
14
|
+
Classifier: Environment :: Console
|
15
|
+
Classifier: Intended Audience :: Education
|
16
|
+
Classifier: Intended Audience :: End Users/Desktop
|
17
|
+
Classifier: Intended Audience :: Science/Research
|
18
|
+
Classifier: License :: OSI Approved :: MIT License
|
19
|
+
Classifier: Operating System :: OS Independent
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
23
|
+
Classifier: Topic :: Office/Business :: Office Suites
|
24
|
+
Classifier: Topic :: Scientific/Engineering
|
25
|
+
Classifier: Topic :: Text Processing :: General
|
26
|
+
Classifier: Topic :: Utilities
|
27
|
+
Classifier: Typing :: Typed
|
28
|
+
Requires-Python: >=3.11
|
29
|
+
Requires-Dist: docling-core>=2.0.0
|
30
|
+
Requires-Dist: docling-parse>=2.0.0
|
31
|
+
Requires-Dist: pdf2doi>=1.7
|
32
|
+
Requires-Dist: pydantic-ai>=1.0.17
|
33
|
+
Requires-Dist: pydantic-settings>=2.7.1
|
34
|
+
Requires-Dist: pydantic>=2.10.6
|
35
|
+
Requires-Dist: pymupdf>=1.26.5
|
36
|
+
Requires-Dist: python-dotenv>=1.1.1
|
37
|
+
Requires-Dist: rich>=14.2.0
|
38
|
+
Requires-Dist: tenacity>=9.0.0
|
39
|
+
Requires-Dist: typer>=0.19.2
|
40
|
+
Provides-Extra: dev
|
41
|
+
Requires-Dist: mypy>=1.14.1; extra == 'dev'
|
42
|
+
Requires-Dist: pytest-asyncio>=0.25.2; extra == 'dev'
|
43
|
+
Requires-Dist: pytest-cov>=6.0.0; extra == 'dev'
|
44
|
+
Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
|
45
|
+
Requires-Dist: pytest>=8.3.4; extra == 'dev'
|
46
|
+
Requires-Dist: ruff>=0.9.1; extra == 'dev'
|
47
|
+
Description-Content-Type: text/markdown
|
48
|
+
|
49
|
+
# PDF Renamer
|
50
|
+
|
51
|
+
[](https://pypi.org/project/pdf-file-renamer/)
|
52
|
+
[](https://pypi.org/project/pdf-file-renamer/)
|
53
|
+
[](https://pypi.org/project/pdf-file-renamer/)
|
54
|
+
[](https://opensource.org/licenses/MIT)
|
55
|
+
[](https://github.com/nostoslabs/pdf-renamer/actions)
|
56
|
+
[](https://github.com/astral-sh/ruff)
|
57
|
+
[](http://mypy-lang.org/)
|
58
|
+
|
59
|
+
**Intelligent PDF file renaming using LLMs and DOI metadata.** Automatically generate clean, descriptive filenames for your PDF library.
|
60
|
+
|
61
|
+
> 🚀 Works with **OpenAI**, **Ollama**, **LM Studio**, and any OpenAI-compatible API
|
62
|
+
> 📚 **DOI-first** approach for academic papers - no API costs!
|
63
|
+
> 🎯 **Interactive mode** with retry, edit, and skip options
|
64
|
+
|
65
|
+
## Table of Contents
|
66
|
+
|
67
|
+
- [Quick Example](#quick-example)
|
68
|
+
- [Features](#features)
|
69
|
+
- [Installation](#installation)
|
70
|
+
- [Configuration](#configuration)
|
71
|
+
- [Usage](#usage)
|
72
|
+
- [Interactive Mode](#interactive-mode)
|
73
|
+
- [How It Works](#how-it-works)
|
74
|
+
- [Cost Considerations](#cost-considerations)
|
75
|
+
- [Architecture](#architecture)
|
76
|
+
- [Development](#development)
|
77
|
+
- [Contributing](#contributing)
|
78
|
+
- [License](#license)
|
79
|
+
|
80
|
+
## Quick Example
|
81
|
+
|
82
|
+

|
83
|
+
|
84
|
+
Transform messy filenames into clean, organized ones:
|
85
|
+
|
86
|
+
```
|
87
|
+
Before: After:
|
88
|
+
📄 paper_final_v3.pdf → Leroux-Analog-In-memory-Computing-2025.pdf
|
89
|
+
📄 download (2).pdf → Ruiz-Why-Don-Trace-Requirements-2023.pdf
|
90
|
+
📄 document.pdf → Raspail-Camp_of_the_Saints.pdf
|
91
|
+
```
|
92
|
+
|
93
|
+
**Live Progress Display:**
|
94
|
+
```
|
95
|
+
Processing 3 PDFs with max 3 concurrent API calls and 10 concurrent extractions
|
96
|
+
|
97
|
+
╭─────────────────────────── 📊 Progress ───────────────────────────╮
|
98
|
+
│ Total: 3 | Pending: 0 | Extracting: 0 | Analyzing: 0 | Complete: 3 │
|
99
|
+
╰───────────────────────────────────────────────────────────────────╯
|
100
|
+
╭───────────────────────────────────────────────────────────────────╮
|
101
|
+
│ [██████████████████████████████████████████████] 100.0% │
|
102
|
+
╰───────────────────────────────────────────────────────────────────╯
|
103
|
+
Processing Status
|
104
|
+
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
|
105
|
+
┃ File ┃ Stage ┃ Status ┃ Details ┃
|
106
|
+
┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
|
107
|
+
│ paper_final_v3.pdf │ ✓ │ Complete │ very_high │
|
108
|
+
│ download (2).pdf │ ✓ │ Complete │ very_high (DOI) │
|
109
|
+
│ document.pdf │ ✓ │ Complete │ high │
|
110
|
+
└────────────────────┴───────┴──────────┴─────────────────────┘
|
111
|
+
```
|
112
|
+
|
113
|
+
## Features
|
114
|
+
|
115
|
+
- **🎓 DOI-based naming** - Automatically extracts DOI and fetches authoritative metadata for academic papers
|
116
|
+
- **🧠 Advanced PDF parsing** using docling-parse for better structure-aware extraction
|
117
|
+
- **👁️ OCR fallback** for scanned PDFs with low text content
|
118
|
+
- **🎯 Smart LLM prompting** with multi-pass analysis for improved accuracy
|
119
|
+
- **⚡ Hybrid approach** - Uses DOI metadata when available, falls back to LLM analysis for other documents
|
120
|
+
- **📝 Standardized format** - Generates filenames like `Author-Topic-Year.pdf`
|
121
|
+
- **🔍 Dry-run mode** to preview changes before applying
|
122
|
+
- **💬 Enhanced interactive mode** with options to accept, manually edit, retry, or skip each file
|
123
|
+
- **📊 Live progress display** with concurrent processing for speed
|
124
|
+
- **⚙️ Configurable concurrency** limits for API calls and PDF extraction
|
125
|
+
- **📦 Batch processing** of multiple PDFs with optional output directory
|
126
|
+
|
127
|
+
## Installation
|
128
|
+
|
129
|
+
### Quick Start (No Installation Required)
|
130
|
+
|
131
|
+
```bash
|
132
|
+
# Run directly with uvx
|
133
|
+
uvx pdf-renamer --dry-run /path/to/pdfs
|
134
|
+
```
|
135
|
+
|
136
|
+
### Install from PyPI
|
137
|
+
|
138
|
+
```bash
|
139
|
+
# Using pip
|
140
|
+
pip install pdf-file-renamer
|
141
|
+
|
142
|
+
# Using uv
|
143
|
+
uv pip install pdf-file-renamer
|
144
|
+
```
|
145
|
+
|
146
|
+
### Install from Source
|
147
|
+
|
148
|
+
```bash
|
149
|
+
# Clone and install
|
150
|
+
git clone https://github.com/nostoslabs/pdf-renamer.git
|
151
|
+
cd pdf-renamer
|
152
|
+
uv sync
|
153
|
+
```
|
154
|
+
|
155
|
+
## Configuration
|
156
|
+
|
157
|
+
Configure your LLM provider:
|
158
|
+
|
159
|
+
**Option A: OpenAI (Cloud)**
|
160
|
+
```bash
|
161
|
+
cp .env.example .env
|
162
|
+
# Edit .env and add your OPENAI_API_KEY
|
163
|
+
```
|
164
|
+
|
165
|
+
**Option B: Ollama or other local models**
|
166
|
+
```bash
|
167
|
+
# No API key needed for local models
|
168
|
+
# Either set LLM_BASE_URL in .env or use --url flag
|
169
|
+
echo "LLM_BASE_URL=http://patmos:11434/v1" > .env
|
170
|
+
```
|
171
|
+
|
172
|
+
## Usage
|
173
|
+
|
174
|
+
### Quick Start
|
175
|
+
|
176
|
+
```bash
|
177
|
+
# Preview renames (dry-run mode)
|
178
|
+
pdf-renamer --dry-run /path/to/pdf/directory
|
179
|
+
|
180
|
+
# Actually rename files
|
181
|
+
pdf-renamer --no-dry-run /path/to/pdf/directory
|
182
|
+
|
183
|
+
# Interactive mode - review each file
|
184
|
+
pdf-renamer --interactive --no-dry-run /path/to/pdf/directory
|
185
|
+
```
|
186
|
+
|
187
|
+
### Using uvx (No Installation)
|
188
|
+
|
189
|
+
```bash
|
190
|
+
# Run directly without installing
|
191
|
+
uvx pdf-renamer --dry-run /path/to/pdfs
|
192
|
+
|
193
|
+
# Run from GitHub
|
194
|
+
uvx https://github.com/nostoslabs/pdf-renamer --dry-run /path/to/pdfs
|
195
|
+
```
|
196
|
+
|
197
|
+
### Options
|
198
|
+
|
199
|
+
- `--dry-run/--no-dry-run`: Show suggestions without renaming (default: `--dry-run`)
|
200
|
+
- `--interactive, -i`: Interactive mode with rich options:
|
201
|
+
- **Accept** - Use the suggested filename
|
202
|
+
- **Edit** - Manually modify the filename
|
203
|
+
- **Retry** - Ask the LLM to generate a new suggestion
|
204
|
+
- **Skip** - Skip this file and move to the next
|
205
|
+
- `--model`: Model to use (default: llama3.2, works with any OpenAI-compatible API)
|
206
|
+
- `--url`: Custom base URL for OpenAI-compatible APIs (default: http://localhost:11434/v1)
|
207
|
+
- `--pattern`: Glob pattern for files (default: *.pdf)
|
208
|
+
- `--output-dir, -o`: Move renamed files to a different directory
|
209
|
+
- `--max-concurrent-api`: Maximum concurrent API calls (default: 3)
|
210
|
+
- `--max-concurrent-pdf`: Maximum concurrent PDF extractions (default: 10)
|
211
|
+
|
212
|
+
### Examples
|
213
|
+
|
214
|
+
**Using OpenAI:**
|
215
|
+
```bash
|
216
|
+
# Preview all PDFs in current directory
|
217
|
+
uvx pdf-renamer --dry-run .
|
218
|
+
|
219
|
+
# Rename PDFs in specific directory
|
220
|
+
uvx pdf-renamer --no-dry-run ~/Documents/Papers
|
221
|
+
|
222
|
+
# Use a different OpenAI model
|
223
|
+
uvx pdf-renamer --model gpt-4o --dry-run .
|
224
|
+
```
|
225
|
+
|
226
|
+
**Using Ollama (or other local models):**
|
227
|
+
```bash
|
228
|
+
# Using Ollama on patmos server with gemma model
|
229
|
+
uvx pdf-renamer --url http://patmos:11434/v1 --model gemma3:latest --dry-run .
|
230
|
+
|
231
|
+
# Using local Ollama with qwen model
|
232
|
+
uvx pdf-renamer --url http://localhost:11434/v1 --model qwen2.5 --dry-run .
|
233
|
+
|
234
|
+
# Set URL in environment and just use model flag
|
235
|
+
export LLM_BASE_URL=http://patmos:11434/v1
|
236
|
+
uvx pdf-renamer --model gemma3:latest --dry-run .
|
237
|
+
```
|
238
|
+
|
239
|
+
**Other examples:**
|
240
|
+
```bash
|
241
|
+
# Process only specific files
|
242
|
+
uvx pdf-renamer --pattern "*2020*.pdf" --dry-run .
|
243
|
+
|
244
|
+
# Interactive mode with local model
|
245
|
+
uvx pdf-renamer --url http://patmos:11434/v1 --model gemma3:latest --interactive --no-dry-run .
|
246
|
+
|
247
|
+
# Run directly from GitHub
|
248
|
+
uvx https://github.com/nostoslabs/pdf-renamer --no-dry-run ~/Documents/Papers
|
249
|
+
```
|
250
|
+
|
251
|
+
## Interactive Mode
|
252
|
+
|
253
|
+
When using `--interactive` mode, you'll be presented with each file one at a time with detailed options:
|
254
|
+
|
255
|
+
```
|
256
|
+
================================================================================
|
257
|
+
Original: 2024-research-paper.pdf
|
258
|
+
Suggested: Smith-Machine-Learning-Applications-2024.pdf
|
259
|
+
Confidence: high
|
260
|
+
Reasoning: Clear author and topic identified from abstract
|
261
|
+
================================================================================
|
262
|
+
|
263
|
+
Options:
|
264
|
+
y / yes / Enter - Accept suggested name
|
265
|
+
e / edit - Manually edit the filename
|
266
|
+
r / retry - Ask LLM to generate a new suggestion
|
267
|
+
n / no / skip - Skip this file
|
268
|
+
|
269
|
+
What would you like to do? [y]:
|
270
|
+
```
|
271
|
+
|
272
|
+
This mode is perfect for:
|
273
|
+
- **Reviewing suggestions** before applying them
|
274
|
+
- **Fine-tuning filenames** that are close but not quite right
|
275
|
+
- **Retrying** when the LLM suggestion isn't good enough
|
276
|
+
- **Building confidence** in the tool before batch processing
|
277
|
+
|
278
|
+
You can use interactive mode with `--dry-run` to preview without actually renaming files, or with `--no-dry-run` to apply changes immediately after confirmation.
|
279
|
+
|
280
|
+
## How It Works
|
281
|
+
|
282
|
+
### Intelligent Hybrid Approach
|
283
|
+
|
284
|
+
The tool uses a multi-strategy approach to generate accurate filenames:
|
285
|
+
|
286
|
+
1. **DOI Detection** (for academic papers)
|
287
|
+
- Searches PDF for DOI identifiers using [pdf2doi](https://github.com/MicheleCotrufo/pdf2doi)
|
288
|
+
- If found, queries authoritative metadata (title, authors, year, journal)
|
289
|
+
- Generates filename with **very high confidence** from validated metadata
|
290
|
+
- **Saves API costs** - no LLM call needed for papers with DOIs
|
291
|
+
|
292
|
+
2. **LLM Analysis** (fallback when no DOI is found)
|
293
|
+
- **Extract**: Uses docling-parse to read first 5 pages with structure-aware parsing, falls back to PyMuPDF if needed
|
294
|
+
- **OCR**: Automatically applies OCR for scanned PDFs with minimal text
|
295
|
+
- **Metadata Enhancement**: Extracts focused hints (years, emails, author sections) to supplement unreliable PDF metadata
|
296
|
+
- **Analyze**: Sends full content excerpt to LLM with enhanced metadata and detailed extraction instructions
|
297
|
+
- **Multi-pass Review**: Low-confidence results trigger a second analysis pass with focused prompts
|
298
|
+
- **Suggest**: LLM returns filename in `Author-Topic-Year` format with confidence level and reasoning
|
299
|
+
|
300
|
+
3. **Interactive Review** (optional): User can accept, edit, retry, or skip each suggestion
|
301
|
+
4. **Rename**: Applies suggestions (if not in dry-run mode)
|
302
|
+
|
303
|
+
### Benefits of DOI Integration
|
304
|
+
|
305
|
+
- **Accuracy**: DOI metadata is canonical and verified
|
306
|
+
- **Speed**: Instant lookup vs. LLM processing time
|
307
|
+
- **Cost**: Free DOI lookups save on API costs for academic papers
|
308
|
+
- **Reliability**: Works even when PDF text extraction is poor
|
309
|
+
|
310
|
+
## Cost Considerations
|
311
|
+
|
312
|
+
**DOI-based Naming (Academic Papers):**
|
313
|
+
- **Completely free** - No API costs
|
314
|
+
- **No LLM needed** - Direct metadata lookup
|
315
|
+
- Works for most academic papers with embedded DOIs
|
316
|
+
|
317
|
+
**OpenAI (Fallback):**
|
318
|
+
- Uses `gpt-4o-mini` by default (very cost-effective)
|
319
|
+
- Only called when DOI not found
|
320
|
+
- Processes first ~4500 characters per PDF
|
321
|
+
- Typical cost: ~$0.001-0.003 per PDF
|
322
|
+
|
323
|
+
**Ollama/Local Models:**
|
324
|
+
- Completely free (runs on your hardware)
|
325
|
+
- Works with any Ollama model (llama3, qwen2.5, mistral, etc.)
|
326
|
+
- Also compatible with LM Studio, vLLM, and other OpenAI-compatible endpoints
|
327
|
+
|
328
|
+
## Filename Format
|
329
|
+
|
330
|
+
The tool generates filenames in this format:
|
331
|
+
- `Smith-Kalman-Filtering-Applications-2020.pdf`
|
332
|
+
- `Adamy-Electronic-Warfare-Modeling-Techniques.pdf`
|
333
|
+
- `Blair-Monopulse-Processing-Unresolved-Targets.pdf`
|
334
|
+
|
335
|
+
Guidelines:
|
336
|
+
- First author's last name
|
337
|
+
- 3-6 word topic description (prioritizes clarity over brevity)
|
338
|
+
- Year (if identifiable)
|
339
|
+
- Hyphens between words
|
340
|
+
- Target ~80 characters (can be longer if needed for clarity)
|
341
|
+
|
342
|
+
## Architecture
|
343
|
+
|
344
|
+
This project follows **Clean Architecture** principles with clear separation of concerns:
|
345
|
+
|
346
|
+
```
|
347
|
+
src/pdf_file_renamer/
|
348
|
+
├── domain/ # Core business logic (models, ports)
|
349
|
+
├── application/ # Use cases and workflows
|
350
|
+
├── infrastructure/ # External integrations (PDF, LLM, DOI)
|
351
|
+
└── presentation/ # CLI and UI components
|
352
|
+
```
|
353
|
+
|
354
|
+
**Key Design Patterns:**
|
355
|
+
- **Ports and Adapters** - Clean interfaces for external dependencies
|
356
|
+
- **Dependency Injection** - Flexible component composition
|
357
|
+
- **Single Responsibility** - Each module has one clear purpose
|
358
|
+
- **Type Safety** - Full mypy strict mode compliance
|
359
|
+
|
360
|
+
See [REFACTORING_SUMMARY.md](REFACTORING_SUMMARY.md) for detailed architecture notes.
|
361
|
+
|
362
|
+
## Development
|
363
|
+
|
364
|
+
### Setup
|
365
|
+
|
366
|
+
```bash
|
367
|
+
# Clone repository
|
368
|
+
git clone https://github.com/nostoslabs/pdf-renamer.git
|
369
|
+
cd pdf-renamer
|
370
|
+
|
371
|
+
# Install dependencies with uv
|
372
|
+
uv sync
|
373
|
+
|
374
|
+
# Run tests
|
375
|
+
uv run pytest
|
376
|
+
|
377
|
+
# Run linting
|
378
|
+
uv run ruff check src/ tests/
|
379
|
+
|
380
|
+
# Run type checking
|
381
|
+
uv run mypy src/
|
382
|
+
```
|
383
|
+
|
384
|
+
### Code Quality
|
385
|
+
|
386
|
+
- **Tests**: pytest with async support and coverage reporting
|
387
|
+
- **Linting**: ruff for fast, comprehensive linting
|
388
|
+
- **Formatting**: ruff format for consistent code style
|
389
|
+
- **Type Checking**: mypy in strict mode
|
390
|
+
- **CI/CD**: GitHub Actions for automated testing and releases
|
391
|
+
|
392
|
+
### Running Locally
|
393
|
+
|
394
|
+
```bash
|
395
|
+
# Run with local changes
|
396
|
+
uv run pdf-file-renamer --dry-run /path/to/pdfs
|
397
|
+
|
398
|
+
# Run specific module
|
399
|
+
uv run python -m pdf_file_renamer.main --help
|
400
|
+
```
|
401
|
+
|
402
|
+
## Contributing
|
403
|
+
|
404
|
+
Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
|
405
|
+
|
406
|
+
### Development Workflow
|
407
|
+
|
408
|
+
1. Fork the repository
|
409
|
+
2. Create a feature branch (`git checkout -b feature/amazing-feature`)
|
410
|
+
3. Make your changes
|
411
|
+
4. Run tests and linting (`uv run pytest && uv run ruff check src/`)
|
412
|
+
5. Commit your changes (`git commit -m 'Add amazing feature'`)
|
413
|
+
6. Push to the branch (`git push origin feature/amazing-feature`)
|
414
|
+
7. Open a Pull Request
|
415
|
+
|
416
|
+
### Code Style
|
417
|
+
|
418
|
+
- Follow PEP 8 (enforced by ruff)
|
419
|
+
- Use type hints for all functions
|
420
|
+
- Write tests for new features
|
421
|
+
- Update documentation as needed
|
422
|
+
|
423
|
+
## License
|
424
|
+
|
425
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
426
|
+
|
427
|
+
## Acknowledgments
|
428
|
+
|
429
|
+
- [pdf2doi](https://github.com/MicheleCotrufo/pdf2doi) for DOI extraction
|
430
|
+
- [pydantic-ai](https://ai.pydantic.dev/) for LLM integration
|
431
|
+
- [docling-parse](https://github.com/DS4SD/docling-parse) for advanced PDF parsing
|
432
|
+
- [PyMuPDF](https://pymupdf.readthedocs.io/) for PDF text extraction
|
433
|
+
- [rich](https://rich.readthedocs.io/) for beautiful terminal UI
|
434
|
+
|
435
|
+
## Support
|
436
|
+
|
437
|
+
- **Issues**: [GitHub Issues](https://github.com/nostoslabs/pdf-renamer/issues)
|
438
|
+
- **Discussions**: [GitHub Discussions](https://github.com/nostoslabs/pdf-renamer/discussions)
|
439
|
+
- **Changelog**: [CHANGELOG.md](CHANGELOG.md)
|
440
|
+
|
441
|
+
---
|
442
|
+
|
443
|
+
**Made with ❤️ by [Nostos Labs](https://github.com/nostoslabs)**
|
@@ -1,8 +1,8 @@
|
|
1
|
-
pdf_file_renamer/__init__.py,sha256=
|
1
|
+
pdf_file_renamer/__init__.py,sha256=ag2NG1Rry9SOlQHvUnNzrgujU5GkDJZ8Fh7FKCuSRNk,85
|
2
2
|
pdf_file_renamer/main.py,sha256=FTEEb-9QmOOsN9SE8L1SZvFVIkVpQDy8xZ5a8t8CWUs,145
|
3
3
|
pdf_file_renamer/application/__init__.py,sha256=riSV7UXBenkDst7Nnf11N1_RuRtM7wpKdwugxOhumS4,363
|
4
4
|
pdf_file_renamer/application/filename_service.py,sha256=IbeCNBwyhFlCMCZveq16nmQ2qvyTdtgLmr6PDWPckOs,4868
|
5
|
-
pdf_file_renamer/application/pdf_rename_workflow.py,sha256=
|
5
|
+
pdf_file_renamer/application/pdf_rename_workflow.py,sha256=WLcGJ4ufEmAnGSxVQcOFDeGG8gXSccs11DaP521YDzo,6144
|
6
6
|
pdf_file_renamer/application/rename_service.py,sha256=vviNQolk_w-qDQvOKTKj8ZhqYyyNWL-VJMfuUnL6WLw,2357
|
7
7
|
pdf_file_renamer/domain/__init__.py,sha256=jxbH3h6xaCnSRuBxclFESl6ZE1pua_I1K4CRAaYxu_I,503
|
8
8
|
pdf_file_renamer/domain/models.py,sha256=QwN79TzWmqvQvz-m9ymebvAx3pWlVpSWXNdSEAk4qq0,3186
|
@@ -10,7 +10,7 @@ pdf_file_renamer/domain/ports.py,sha256=ebOcHptiOK119NCmIwM32_fbRK5xkZP9K67vjL-4
|
|
10
10
|
pdf_file_renamer/infrastructure/__init__.py,sha256=C3ZQ7WCPCa6PMfP00lu4wqb0r57GVyDdiD5EL2DhCeY,187
|
11
11
|
pdf_file_renamer/infrastructure/config.py,sha256=baNL5_6_NNiS50ZNdql7fDwQbeAwf6f58HGYIWFQxQQ,2464
|
12
12
|
pdf_file_renamer/infrastructure/doi/__init__.py,sha256=8N9ZEwfG7q5xomzh187YtP8t4CfEBHM334xNRblPeuI,153
|
13
|
-
pdf_file_renamer/infrastructure/doi/pdf2doi_extractor.py,sha256=
|
13
|
+
pdf_file_renamer/infrastructure/doi/pdf2doi_extractor.py,sha256=714WU8MRQF2mWFEdB9MSm2nexivIByKxciOyArgfkTs,5114
|
14
14
|
pdf_file_renamer/infrastructure/llm/__init__.py,sha256=ToB8__mHvXwaIukGKPEAQ8SeC4ZLiH4auZI1P1yH5PQ,159
|
15
15
|
pdf_file_renamer/infrastructure/llm/pydantic_ai_provider.py,sha256=kVsmj0NIawkj-1WWM0hZXbsNH09GabVZm9HPlYsxGuo,9217
|
16
16
|
pdf_file_renamer/infrastructure/pdf/__init__.py,sha256=uMHqxSXNLZH5WH_e1kXrp9m7uTqPkiI2hXjNo6rCRoo,368
|
@@ -19,9 +19,9 @@ pdf_file_renamer/infrastructure/pdf/docling_extractor.py,sha256=auZrJpK7mMg1mUXK
|
|
19
19
|
pdf_file_renamer/infrastructure/pdf/pymupdf_extractor.py,sha256=C61udZCqGqiVx7T0HWNyjvnhgv5AgMIcCYtrhgHOJwk,5465
|
20
20
|
pdf_file_renamer/presentation/__init__.py,sha256=1VR44GoPGTixk3hG5YzhGyQf7a4BTKsJBd2VP3rHcFM,211
|
21
21
|
pdf_file_renamer/presentation/cli.py,sha256=0t_59-utRWLNCYjFetU0ZHoF1DPTjdNiWM9Au0jFaOg,8013
|
22
|
-
pdf_file_renamer/presentation/formatters.py,sha256=
|
23
|
-
pdf_file_renamer-0.6.
|
24
|
-
pdf_file_renamer-0.6.
|
25
|
-
pdf_file_renamer-0.6.
|
26
|
-
pdf_file_renamer-0.6.
|
27
|
-
pdf_file_renamer-0.6.
|
22
|
+
pdf_file_renamer/presentation/formatters.py,sha256=8Vz95QupJKkPgPgRyMVmA_gxRWG5vfxdnSd7Czovlrg,8946
|
23
|
+
pdf_file_renamer-0.6.2.dist-info/METADATA,sha256=qwnly-Ce8cu-S1pNnj6-NwuPC3ZbdOxGTtLheEK0IKc,16851
|
24
|
+
pdf_file_renamer-0.6.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
25
|
+
pdf_file_renamer-0.6.2.dist-info/entry_points.txt,sha256=0fEGYy60chGE9rECWeCVPxjxzz6vMtIAYdFvmH7xzbw,63
|
26
|
+
pdf_file_renamer-0.6.2.dist-info/licenses/LICENSE,sha256=_w08V08WgoMpDMlGNlkIatC5QfQ_Ds_rXOBM8pl7ffE,1068
|
27
|
+
pdf_file_renamer-0.6.2.dist-info/RECORD,,
|
@@ -1,272 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: pdf-file-renamer
|
3
|
-
Version: 0.6.0
|
4
|
-
Summary: Intelligent PDF renaming using LLMs
|
5
|
-
License-File: LICENSE
|
6
|
-
Requires-Python: >=3.11
|
7
|
-
Requires-Dist: docling-core>=2.0.0
|
8
|
-
Requires-Dist: docling-parse>=2.0.0
|
9
|
-
Requires-Dist: pdf2doi>=1.7
|
10
|
-
Requires-Dist: pydantic-ai>=1.0.17
|
11
|
-
Requires-Dist: pydantic-settings>=2.7.1
|
12
|
-
Requires-Dist: pydantic>=2.10.6
|
13
|
-
Requires-Dist: pymupdf>=1.26.5
|
14
|
-
Requires-Dist: python-dotenv>=1.1.1
|
15
|
-
Requires-Dist: rich>=14.2.0
|
16
|
-
Requires-Dist: tenacity>=9.0.0
|
17
|
-
Requires-Dist: typer>=0.19.2
|
18
|
-
Provides-Extra: dev
|
19
|
-
Requires-Dist: mypy>=1.14.1; extra == 'dev'
|
20
|
-
Requires-Dist: pytest-asyncio>=0.25.2; extra == 'dev'
|
21
|
-
Requires-Dist: pytest-cov>=6.0.0; extra == 'dev'
|
22
|
-
Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
|
23
|
-
Requires-Dist: pytest>=8.3.4; extra == 'dev'
|
24
|
-
Requires-Dist: ruff>=0.9.1; extra == 'dev'
|
25
|
-
Description-Content-Type: text/markdown
|
26
|
-
|
27
|
-
# PDF Renamer
|
28
|
-
|
29
|
-
[PyPI](https://pypi.org/project/pdf-file-renamer/)
|
30
|
-
[Python](https://www.python.org/downloads/)
|
31
|
-
[uv](https://docs.astral.sh/uv/)
|
32
|
-
[License: MIT](https://opensource.org/licenses/MIT)
|
33
|
-
[Pydantic AI](https://ai.pydantic.dev/)
|
34
|
-
[GitHub](https://github.com/nostoslabs/pdf-renamer)
|
35
|
-
|
36
|
-
[GitHub](https://github.com/nostoslabs/pdf-renamer)
|
37
|
-
[Ruff](https://github.com/astral-sh/ruff)
|
38
|
-
[mypy](http://mypy-lang.org/)
|
39
|
-
[Ruff](https://github.com/astral-sh/ruff)
|
40
|
-
|
41
|
-
Intelligent PDF file renaming using LLMs. This tool analyzes PDF content and metadata to suggest descriptive, standardized filenames.
|
42
|
-
|
43
|
-
> 🚀 Works with **OpenAI**, **Ollama**, **LM Studio**, and any OpenAI-compatible API
|
44
|
-
|
45
|
-
## Features
|
46
|
-
|
47
|
-
- **DOI-based naming** - Automatically extracts DOI and fetches authoritative metadata for academic papers
|
48
|
-
- **Advanced PDF parsing** using docling-parse for better structure-aware extraction
|
49
|
-
- **OCR fallback** for scanned PDFs with low text content
|
50
|
-
- **Smart LLM prompting** with multi-pass analysis for improved accuracy
|
51
|
-
- **Hybrid approach** - Uses DOI metadata when available, falls back to LLM analysis for other documents
|
52
|
-
- Suggests filenames in format: `Author-Topic-Year.pdf`
|
53
|
-
- Dry-run mode to preview changes before applying
|
54
|
-
- **Enhanced interactive mode** with options to accept, manually edit, retry, or skip each file
|
55
|
-
- **Live progress display** with concurrent processing for speed
|
56
|
-
- **Configurable concurrency** limits for API calls and PDF extraction
|
57
|
-
- Batch processing of multiple PDFs with optional output directory
|
58
|
-
|
59
|
-
## Installation
|
60
|
-
|
61
|
-
### Quick Start (No Installation Required)
|
62
|
-
|
63
|
-
```bash
|
64
|
-
# Run directly with uvx
|
65
|
-
uvx pdf-renamer --dry-run /path/to/pdfs
|
66
|
-
```
|
67
|
-
|
68
|
-
### Install from PyPI
|
69
|
-
|
70
|
-
```bash
|
71
|
-
# Using pip
|
72
|
-
pip install pdf-file-renamer
|
73
|
-
|
74
|
-
# Using uv
|
75
|
-
uv pip install pdf-file-renamer
|
76
|
-
```
|
77
|
-
|
78
|
-
### Install from Source
|
79
|
-
|
80
|
-
```bash
|
81
|
-
# Clone and install
|
82
|
-
git clone https://github.com/nostoslabs/pdf-renamer.git
|
83
|
-
cd pdf-renamer
|
84
|
-
uv sync
|
85
|
-
```
|
86
|
-
|
87
|
-
## Configuration
|
88
|
-
|
89
|
-
Configure your LLM provider:
|
90
|
-
|
91
|
-
**Option A: OpenAI (Cloud)**
|
92
|
-
```bash
|
93
|
-
cp .env.example .env
|
94
|
-
# Edit .env and add your OPENAI_API_KEY
|
95
|
-
```
|
96
|
-
|
97
|
-
**Option B: Ollama or other local models**
|
98
|
-
```bash
|
99
|
-
# No API key needed for local models
|
100
|
-
# Either set LLM_BASE_URL in .env or use --url flag
|
101
|
-
echo "LLM_BASE_URL=http://patmos:11434/v1" > .env
|
102
|
-
```
|
103
|
-
|
104
|
-
## Usage
|
105
|
-
|
106
|
-
### Quick Start
|
107
|
-
|
108
|
-
```bash
|
109
|
-
# Preview renames (dry-run mode)
|
110
|
-
pdf-renamer --dry-run /path/to/pdf/directory
|
111
|
-
|
112
|
-
# Actually rename files
|
113
|
-
pdf-renamer --no-dry-run /path/to/pdf/directory
|
114
|
-
|
115
|
-
# Interactive mode - review each file
|
116
|
-
pdf-renamer --interactive --no-dry-run /path/to/pdf/directory
|
117
|
-
```
|
118
|
-
|
119
|
-
### Using uvx (No Installation)
|
120
|
-
|
121
|
-
```bash
|
122
|
-
# Run directly without installing
|
123
|
-
uvx pdf-renamer --dry-run /path/to/pdfs
|
124
|
-
|
125
|
-
# Run from GitHub
|
126
|
-
uvx https://github.com/nostoslabs/pdf-renamer --dry-run /path/to/pdfs
|
127
|
-
```
|
128
|
-
|
129
|
-
### Options
|
130
|
-
|
131
|
-
- `--dry-run/--no-dry-run`: Show suggestions without renaming (default: True)
|
132
|
-
- `--interactive, -i`: Interactive mode with rich options:
|
133
|
-
- **Accept** - Use the suggested filename
|
134
|
-
- **Edit** - Manually modify the filename
|
135
|
-
- **Retry** - Ask the LLM to generate a new suggestion
|
136
|
-
- **Skip** - Skip this file and move to the next
|
137
|
-
- `--model`: Model to use (default: llama3.2, works with any OpenAI-compatible API)
|
138
|
-
- `--url`: Custom base URL for OpenAI-compatible APIs (default: http://localhost:11434/v1)
|
139
|
-
- `--pattern`: Glob pattern for files (default: *.pdf)
|
140
|
-
- `--output-dir, -o`: Move renamed files to a different directory
|
141
|
-
- `--max-concurrent-api`: Maximum concurrent API calls (default: 3)
|
142
|
-
- `--max-concurrent-pdf`: Maximum concurrent PDF extractions (default: 10)
|
143
|
-
|
144
|
-
### Examples
|
145
|
-
|
146
|
-
**Using OpenAI:**
|
147
|
-
```bash
|
148
|
-
# Preview all PDFs in current directory
|
149
|
-
uvx pdf-renamer --dry-run .
|
150
|
-
|
151
|
-
# Rename PDFs in specific directory
|
152
|
-
uvx pdf-renamer --no-dry-run ~/Documents/Papers
|
153
|
-
|
154
|
-
# Use a different OpenAI model
|
155
|
-
uvx pdf-renamer --model gpt-4o --dry-run .
|
156
|
-
```
|
157
|
-
|
158
|
-
**Using Ollama (or other local models):**
|
159
|
-
```bash
|
160
|
-
# Using Ollama on patmos server with gemma model
|
161
|
-
uvx pdf-renamer --url http://patmos:11434/v1 --model gemma3:latest --dry-run .
|
162
|
-
|
163
|
-
# Using local Ollama with qwen model
|
164
|
-
uvx pdf-renamer --url http://localhost:11434/v1 --model qwen2.5 --dry-run .
|
165
|
-
|
166
|
-
# Set URL in environment and just use model flag
|
167
|
-
export LLM_BASE_URL=http://patmos:11434/v1
|
168
|
-
uvx pdf-renamer --model gemma3:latest --dry-run .
|
169
|
-
```
|
170
|
-
|
171
|
-
**Other examples:**
|
172
|
-
```bash
|
173
|
-
# Process only specific files
|
174
|
-
uvx pdf-renamer --pattern "*2020*.pdf" --dry-run .
|
175
|
-
|
176
|
-
# Interactive mode with local model
|
177
|
-
uvx pdf-renamer --url http://patmos:11434/v1 --model gemma3:latest --interactive --no-dry-run .
|
178
|
-
|
179
|
-
# Run directly from GitHub
|
180
|
-
uvx https://github.com/nostoslabs/pdf-renamer --no-dry-run ~/Documents/Papers
|
181
|
-
```
|
182
|
-
|
183
|
-
## Interactive Mode
|
184
|
-
|
185
|
-
When using `--interactive` mode, you'll be presented with each file one at a time with detailed options:
|
186
|
-
|
187
|
-
```
|
188
|
-
================================================================================
|
189
|
-
Original: 2024-research-paper.pdf
|
190
|
-
Suggested: Smith-Machine-Learning-Applications-2024.pdf
|
191
|
-
Confidence: high
|
192
|
-
Reasoning: Clear author and topic identified from abstract
|
193
|
-
================================================================================
|
194
|
-
|
195
|
-
Options:
|
196
|
-
y / yes / Enter - Accept suggested name
|
197
|
-
e / edit - Manually edit the filename
|
198
|
-
r / retry - Ask LLM to generate a new suggestion
|
199
|
-
n / no / skip - Skip this file
|
200
|
-
|
201
|
-
What would you like to do? [y]:
|
202
|
-
```
|
203
|
-
|
204
|
-
This mode is perfect for:
|
205
|
-
- **Reviewing suggestions** before applying them
|
206
|
-
- **Fine-tuning filenames** that are close but not quite right
|
207
|
-
- **Retrying** when the LLM suggestion isn't good enough
|
208
|
-
- **Building confidence** in the tool before batch processing
|
209
|
-
|
210
|
-
You can use interactive mode with `--dry-run` to preview without actually renaming files, or with `--no-dry-run` to apply changes immediately after confirmation.
|
211
|
-
|
212
|
-
## How It Works
|
213
|
-
|
214
|
-
### Intelligent Hybrid Approach
|
215
|
-
|
216
|
-
The tool uses a multi-strategy approach to generate accurate filenames:
|
217
|
-
|
218
|
-
1. **DOI Detection** (for academic papers)
|
219
|
-
- Searches PDF for DOI identifiers using [pdf2doi](https://github.com/MicheleCotrufo/pdf2doi)
|
220
|
-
- If found, queries authoritative metadata (title, authors, year, journal)
|
221
|
-
- Generates filename with **very high confidence** from validated metadata
|
222
|
-
- **Saves API costs** - no LLM call needed for papers with DOIs
|
223
|
-
|
224
|
-
2. **LLM Analysis** (fallback for non-academic PDFs)
|
225
|
-
- **Extract**: Uses docling-parse to read first 5 pages with structure-aware parsing, falls back to PyMuPDF if needed
|
226
|
-
- **OCR**: Automatically applies OCR for scanned PDFs with minimal text
|
227
|
-
- **Metadata Enhancement**: Extracts focused hints (years, emails, author sections) to supplement unreliable PDF metadata
|
228
|
-
- **Analyze**: Sends full content excerpt to LLM with enhanced metadata and detailed extraction instructions
|
229
|
-
- **Multi-pass Review**: Low-confidence results trigger a second analysis pass with focused prompts
|
230
|
-
- **Suggest**: LLM returns filename in `Author-Topic-Year` format with confidence level and reasoning
|
231
|
-
|
232
|
-
3. **Interactive Review** (optional): User can accept, edit, retry, or skip each suggestion
|
233
|
-
4. **Rename**: Applies suggestions (if not in dry-run mode)
|
234
|
-
|
235
|
-
### Benefits of DOI Integration
|
236
|
-
|
237
|
-
- **Accuracy**: DOI metadata is canonical and verified
|
238
|
-
- **Speed**: Instant lookup vs. LLM processing time
|
239
|
-
- **Cost**: Free DOI lookups save on API costs for academic papers
|
240
|
-
- **Reliability**: Works even when PDF text extraction is poor
|
241
|
-
|
242
|
-
## Cost Considerations
|
243
|
-
|
244
|
-
**DOI-based Naming (Academic Papers):**
|
245
|
-
- **Completely free** - No API costs
|
246
|
-
- **No LLM needed** - Direct metadata lookup
|
247
|
-
- Works for most academic papers with embedded DOIs
|
248
|
-
|
249
|
-
**OpenAI (Fallback):**
|
250
|
-
- `gpt-4o-mini` is a very cost-effective choice (select it with `--model gpt-4o-mini`)
|
251
|
-
- Only called when DOI not found
|
252
|
-
- Processes first ~4500 characters per PDF
|
253
|
-
- Typical cost: ~$0.001-0.003 per PDF
|
254
|
-
|
255
|
-
**Ollama/Local Models:**
|
256
|
-
- Completely free (runs on your hardware)
|
257
|
-
- Works with any Ollama model (llama3, qwen2.5, mistral, etc.)
|
258
|
-
- Also compatible with LM Studio, vLLM, and other OpenAI-compatible endpoints
|
259
|
-
|
260
|
-
## Filename Format
|
261
|
-
|
262
|
-
The tool generates filenames in this format:
|
263
|
-
- `Smith-Kalman-Filtering-Applications-2020.pdf`
|
264
|
-
- `Adamy-Electronic-Warfare-Modeling-Techniques.pdf`
|
265
|
-
- `Blair-Monopulse-Processing-Unresolved-Targets.pdf`
|
266
|
-
|
267
|
-
Guidelines:
|
268
|
-
- First author's last name
|
269
|
-
- 3-6 word topic description (prioritizes clarity over brevity)
|
270
|
-
- Year (if identifiable)
|
271
|
-
- Hyphens between words
|
272
|
-
- Target ~80 characters (can be longer if needed for clarity)
|
File without changes
|
File without changes
|
File without changes
|