pdf-file-renamer 0.6.1__py3-none-any.whl → 0.6.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,3 @@
1
1
  """PDF Renamer - Intelligent PDF file renaming using LLMs."""
2
2
 
3
- __version__ = "0.6.1"
3
# Must match the distribution version declared in the package metadata (0.6.3).
__version__ = "0.6.3"
@@ -1,7 +1,9 @@
1
1
  """DOI extraction using pdf2doi library."""
2
2
 
3
3
  import asyncio
4
+ import contextlib
4
5
  import re
6
+ from difflib import SequenceMatcher
5
7
  from pathlib import Path
6
8
 
7
9
  import pdf2doi
@@ -13,10 +15,18 @@ from pdf_file_renamer.domain.ports import DOIExtractor
13
15
  class PDF2DOIExtractor(DOIExtractor):
14
16
  """Extract DOI from PDF files using pdf2doi library."""
15
17
 
16
- def __init__(self) -> None:
17
- """Initialize the PDF2DOI extractor."""
18
+ def __init__(self, validate_match: bool = True, similarity_threshold: float = 0.3) -> None:
19
+ """
20
+ Initialize the PDF2DOI extractor.
21
+
22
+ Args:
23
+ validate_match: Whether to validate that DOI metadata matches PDF content
24
+ similarity_threshold: Minimum similarity score (0-1) for title validation
25
+ """
18
26
  # Suppress pdf2doi verbose output
19
27
  pdf2doi.config.set("verbose", False)
28
+ self.validate_match = validate_match
29
+ self.similarity_threshold = similarity_threshold
20
30
 
21
31
  async def extract_doi(self, pdf_path: Path) -> DOIMetadata | None:
22
32
  """
@@ -31,9 +41,7 @@ class PDF2DOIExtractor(DOIExtractor):
31
41
  try:
32
42
  # Run pdf2doi in executor to avoid blocking
33
43
  loop = asyncio.get_event_loop()
34
- result = await loop.run_in_executor(
35
- None, pdf2doi.pdf2doi, str(pdf_path)
36
- )
44
+ result = await loop.run_in_executor(None, pdf2doi.pdf2doi, str(pdf_path))
37
45
 
38
46
  # pdf2doi returns a dict (not a list)
39
47
  if not result or not isinstance(result, dict):
@@ -56,28 +64,26 @@ class PDF2DOIExtractor(DOIExtractor):
56
64
 
57
65
  metadata = {}
58
66
  if validation_info:
59
- try:
67
+ with contextlib.suppress(json.JSONDecodeError):
60
68
  metadata = json.loads(validation_info)
61
- except json.JSONDecodeError:
62
- pass
63
69
 
64
70
  # Extract title
65
71
  title = metadata.get("title")
66
72
 
67
73
  # Extract authors (list of dicts with 'given' and 'family' fields)
68
- authors = None
74
+ authors: list[str] | None = None
69
75
  if "author" in metadata:
70
76
  author_list = metadata["author"]
71
- authors = []
77
+ author_names: list[str] = []
72
78
  for author in author_list:
73
79
  if isinstance(author, dict):
74
80
  family = author.get("family", "")
75
81
  given = author.get("given", "")
76
82
  if family:
77
83
  full_name = f"{given} {family}".strip() if given else family
78
- authors.append(full_name)
79
- if not authors:
80
- authors = None
84
+ author_names.append(full_name)
85
+ if author_names:
86
+ authors = author_names
81
87
 
82
88
  # Extract year from published-online or published
83
89
  year = None
@@ -94,7 +100,7 @@ class PDF2DOIExtractor(DOIExtractor):
94
100
  # Extract publisher
95
101
  publisher = metadata.get("publisher")
96
102
 
97
- return DOIMetadata(
103
+ doi_metadata = DOIMetadata(
98
104
  doi=identifier,
99
105
  title=title,
100
106
  authors=authors,
@@ -104,6 +110,16 @@ class PDF2DOIExtractor(DOIExtractor):
104
110
  raw_bibtex=validation_info if validation_info else None,
105
111
  )
106
112
 
113
+ # Validate that the DOI metadata matches the PDF content
114
+ if self.validate_match:
115
+ # Extract first page text from PDF to check for title match
116
+ pdf_text = await self._extract_pdf_first_page(pdf_path)
117
+ if not self._validate_doi_matches_pdf(doi_metadata, pdf_text):
118
+ # DOI doesn't match - likely a citation DOI, not the paper's DOI
119
+ return None
120
+
121
+ return doi_metadata
122
+
107
123
  except Exception:
108
124
  # Silently fail - DOI extraction is opportunistic
109
125
  return None
@@ -161,3 +177,120 @@ class PDF2DOIExtractor(DOIExtractor):
161
177
  ]
162
178
 
163
179
  return authors if authors else None
180
+
181
+ async def _extract_pdf_first_page(self, pdf_path: Path) -> str:
182
+ """
183
+ Extract text from the first page of a PDF.
184
+
185
+ Args:
186
+ pdf_path: Path to PDF file
187
+
188
+ Returns:
189
+ Text from first page (empty string if extraction fails)
190
+ """
191
+ try:
192
+ import fitz # PyMuPDF
193
+
194
+ loop = asyncio.get_event_loop()
195
+
196
+ def extract() -> str:
197
+ with fitz.open(pdf_path) as doc:
198
+ if len(doc) > 0:
199
+ return doc[0].get_text()
200
+ return ""
201
+
202
+ return await loop.run_in_executor(None, extract)
203
+ except Exception:
204
+ return ""
205
+
206
+ def _validate_doi_matches_pdf(self, doi_metadata: DOIMetadata, pdf_text: str) -> bool:
207
+ """
208
+ Validate that DOI metadata matches the PDF content.
209
+
210
+ This checks if the title from the DOI metadata appears in the PDF text
211
+ (particularly the first page, where the title should be).
212
+
213
+ Args:
214
+ doi_metadata: DOI metadata to validate
215
+ pdf_text: Text from PDF first page (not full document!)
216
+
217
+ Returns:
218
+ True if metadata appears to match PDF, False otherwise
219
+ """
220
+ if not doi_metadata.title or not pdf_text:
221
+ # If we can't validate, assume it's valid (fail open)
222
+ return True
223
+
224
+ # Normalize text for comparison
225
+ pdf_text_lower = pdf_text.lower()
226
+ title_lower = doi_metadata.title.lower()
227
+
228
+ # Check if the full title appears in the PDF text
229
+ if title_lower in pdf_text_lower:
230
+ return True
231
+
232
+ # Check similarity using SequenceMatcher on first ~300 chars (title area)
233
+ # Most paper titles appear in the first few hundred characters
234
+ title_area = pdf_text_lower[:300]
235
+ similarity = SequenceMatcher(None, title_lower, title_area).ratio()
236
+
237
+ if similarity >= self.similarity_threshold:
238
+ return True
239
+
240
+ # Check if significant words from title appear in the title area ONLY
241
+ # This prevents matching citation DOIs from the references section
242
+ title_words = self._extract_significant_words(title_lower)
243
+ if not title_words:
244
+ return True # Can't validate, fail open
245
+
246
+ # Require at least 70% of significant words to appear in the title area
247
+ matches = sum(1 for word in title_words if word in title_area)
248
+ match_ratio = matches / len(title_words)
249
+
250
+ return match_ratio >= 0.7
251
+
252
+ def _extract_significant_words(self, text: str) -> list[str]:
253
+ """
254
+ Extract significant words from text (removing common words).
255
+
256
+ Args:
257
+ text: Input text
258
+
259
+ Returns:
260
+ List of significant words
261
+ """
262
+ # Common words to skip
263
+ stop_words = {
264
+ "a",
265
+ "an",
266
+ "the",
267
+ "and",
268
+ "or",
269
+ "but",
270
+ "in",
271
+ "on",
272
+ "at",
273
+ "to",
274
+ "for",
275
+ "of",
276
+ "with",
277
+ "by",
278
+ "from",
279
+ "as",
280
+ "is",
281
+ "was",
282
+ "are",
283
+ "were",
284
+ "been",
285
+ "be",
286
+ "this",
287
+ "that",
288
+ "these",
289
+ "those",
290
+ }
291
+
292
+ # Extract words (alphanumeric only)
293
+ words = re.findall(r"\b\w+\b", text.lower())
294
+
295
+ # Filter stop words and short words
296
+ return [w for w in words if w not in stop_words and len(w) > 3]
@@ -214,9 +214,7 @@ class ResultsTable:
214
214
  reasoning = reasoning[:100] + "..."
215
215
  # Handle both enum and string confidence
216
216
  conf_str = (
217
- op.confidence.value
218
- if isinstance(op.confidence, ConfidenceLevel)
219
- else op.confidence
217
+ op.confidence.value if isinstance(op.confidence, ConfidenceLevel) else op.confidence
220
218
  )
221
219
  table.add_row(
222
220
  op.original_path.name,
@@ -0,0 +1,444 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdf-file-renamer
3
+ Version: 0.6.3
4
+ Summary: Intelligent PDF renaming using LLMs with DOI-based naming and interactive workflow
5
+ Project-URL: Homepage, https://github.com/nostoslabs/pdf-renamer
6
+ Project-URL: Repository, https://github.com/nostoslabs/pdf-renamer
7
+ Project-URL: Issues, https://github.com/nostoslabs/pdf-renamer/issues
8
+ Project-URL: Changelog, https://github.com/nostoslabs/pdf-renamer/blob/main/CHANGELOG.md
9
+ Author-email: Nostos Labs <info@nostoslabs.com>
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Keywords: academic-papers,ai,automation,document-management,doi,file-organization,llm,pdf,rename
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Environment :: Console
15
+ Classifier: Intended Audience :: Education
16
+ Classifier: Intended Audience :: End Users/Desktop
17
+ Classifier: Intended Audience :: Science/Research
18
+ Classifier: License :: OSI Approved :: MIT License
19
+ Classifier: Operating System :: OS Independent
20
+ Classifier: Programming Language :: Python :: 3
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Office/Business :: Office Suites
24
+ Classifier: Topic :: Scientific/Engineering
25
+ Classifier: Topic :: Text Processing :: General
26
+ Classifier: Topic :: Utilities
27
+ Classifier: Typing :: Typed
28
+ Requires-Python: >=3.11
29
+ Requires-Dist: docling-core>=2.0.0
30
+ Requires-Dist: docling-parse>=2.0.0
31
+ Requires-Dist: pdf2doi>=1.7
32
+ Requires-Dist: pydantic-ai>=1.0.17
33
+ Requires-Dist: pydantic-settings>=2.7.1
34
+ Requires-Dist: pydantic>=2.10.6
35
+ Requires-Dist: pymupdf>=1.26.5
36
+ Requires-Dist: python-dotenv>=1.1.1
37
+ Requires-Dist: rich>=14.2.0
38
+ Requires-Dist: tenacity>=9.0.0
39
+ Requires-Dist: typer>=0.19.2
40
+ Provides-Extra: dev
41
+ Requires-Dist: mypy>=1.14.1; extra == 'dev'
42
+ Requires-Dist: pytest-asyncio>=0.25.2; extra == 'dev'
43
+ Requires-Dist: pytest-cov>=6.0.0; extra == 'dev'
44
+ Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
45
+ Requires-Dist: pytest>=8.3.4; extra == 'dev'
46
+ Requires-Dist: ruff>=0.9.1; extra == 'dev'
47
+ Description-Content-Type: text/markdown
48
+
49
+ # PDF Renamer
50
+
51
+ [![PyPI version](https://img.shields.io/pypi/v/pdf-file-renamer.svg)](https://pypi.org/project/pdf-file-renamer/)
52
+ [![PyPI downloads](https://img.shields.io/pypi/dm/pdf-file-renamer.svg)](https://pypi.org/project/pdf-file-renamer/)
53
+ [![Python](https://img.shields.io/pypi/pyversions/pdf-file-renamer.svg)](https://pypi.org/project/pdf-file-renamer/)
54
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
55
+ [![CI](https://github.com/nostoslabs/pdf-renamer/workflows/CI/badge.svg)](https://github.com/nostoslabs/pdf-renamer/actions)
56
+ [![Code style: ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
57
+ [![Type checked: mypy](https://img.shields.io/badge/type%20checked-mypy-blue.svg)](http://mypy-lang.org/)
58
+
59
+ **Intelligent PDF file renaming using LLMs and DOI metadata.** Automatically generate clean, descriptive filenames for your PDF library.
60
+
61
+ > 🚀 Works with **OpenAI**, **Ollama**, **LM Studio**, and any OpenAI-compatible API
62
+ > 📚 **DOI-first** approach for academic papers - no API costs!
63
+ > 🎯 **Interactive mode** with retry, edit, and skip options
64
+
65
+ ## Table of Contents
66
+
67
+ - [Quick Example](#quick-example)
68
+ - [Features](#features)
69
+ - [Installation](#installation)
70
+ - [Configuration](#configuration)
71
+ - [Usage](#usage)
72
+ - [Interactive Mode](#interactive-mode)
73
+ - [How It Works](#how-it-works)
74
+ - [Cost Considerations](#cost-considerations)
75
+ - [Architecture](#architecture)
76
+ - [Development](#development)
77
+ - [Contributing](#contributing)
78
+ - [License](#license)
79
+
80
+ ## Quick Example
81
+
82
+ ![Demo](demo.gif)
83
+
84
+ Transform messy filenames into clean, organized ones:
85
+
86
+ ```
87
+ Before: After:
88
+ 📄 paper_final_v3.pdf → Leroux-Analog-In-memory-Computing-2025.pdf
89
+ 📄 download (2).pdf → Ruiz-Why-Don-Trace-Requirements-2023.pdf
90
+ 📄 document.pdf → Raspail-Camp_of_the_Saints.pdf
91
+ ```
92
+
93
+ **Live Progress Display:**
94
+ ```
95
+ Processing 3 PDFs with max 3 concurrent API calls and 10 concurrent extractions
96
+
97
+ ╭─────────────────────────── 📊 Progress ───────────────────────────╮
98
+ │ Total: 3 | Pending: 0 | Extracting: 0 | Analyzing: 0 | Complete: 3 │
99
+ ╰───────────────────────────────────────────────────────────────────╯
100
+ ╭───────────────────────────────────────────────────────────────────╮
101
+ │ [██████████████████████████████████████████████] 100.0% │
102
+ ╰───────────────────────────────────────────────────────────────────╯
103
+ Processing Status
104
+ ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
105
+ ┃ File ┃ Stage ┃ Status ┃ Details ┃
106
+ ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
107
+ │ paper_final_v3.pdf │ ✓ │ Complete │ very_high │
108
+ │ download (2).pdf │ ✓ │ Complete │ very_high (DOI) │
109
+ │ document.pdf │ ✓ │ Complete │ high │
110
+ └────────────────────┴───────┴──────────┴─────────────────────┘
111
+ ```
112
+
113
+ ## Features
114
+
115
+ - **🎓 DOI-based naming** - Automatically extracts DOI and fetches authoritative metadata for academic papers
116
+ - **🧠 Advanced PDF parsing** using docling-parse for better structure-aware extraction
117
+ - **👁️ OCR fallback** for scanned PDFs with low text content
118
+ - **🎯 Smart LLM prompting** with multi-pass analysis for improved accuracy
119
+ - **⚡ Hybrid approach** - Uses DOI metadata when available, falls back to LLM analysis for other documents
120
+ - **📝 Standardized format** - Generates filenames like `Author-Topic-Year.pdf`
121
+ - **🔍 Dry-run mode** to preview changes before applying
122
+ - **💬 Enhanced interactive mode** with options to accept, manually edit, retry, or skip each file
123
+ - **📊 Live progress display** with concurrent processing for speed
124
+ - **⚙️ Configurable concurrency** limits for API calls and PDF extraction
125
+ - **📦 Batch processing** of multiple PDFs with optional output directory
126
+
127
+ ## Installation
128
+
129
+ ### Quick Start (No Installation Required)
130
+
131
+ ```bash
132
+ # Run directly with uvx
133
+ uvx pdf-renamer --dry-run /path/to/pdfs
134
+ ```
135
+
136
+ ### Install from PyPI
137
+
138
+ ```bash
139
+ # Using pip
140
+ pip install pdf-file-renamer
141
+
142
+ # Using uv
143
+ uv pip install pdf-file-renamer
144
+ ```
145
+
146
+ ### Install from Source
147
+
148
+ ```bash
149
+ # Clone and install
150
+ git clone https://github.com/nostoslabs/pdf-renamer.git
151
+ cd pdf-renamer
152
+ uv sync
153
+ ```
154
+
155
+ ## Configuration
156
+
157
+ Configure your LLM provider:
158
+
159
+ **Option A: OpenAI (Cloud)**
160
+ ```bash
161
+ cp .env.example .env
162
+ # Edit .env and add your OPENAI_API_KEY
163
+ ```
164
+
165
+ **Option B: Ollama or other local models**
166
+ ```bash
167
+ # No API key needed for local models
168
+ # Either set LLM_BASE_URL in .env or use --url flag
169
+ echo "LLM_BASE_URL=http://patmos:11434/v1" > .env
170
+ ```
171
+
172
+ ## Usage
173
+
174
+ ### Quick Start
175
+
176
+ ```bash
177
+ # Preview renames (dry-run mode)
178
+ pdf-renamer --dry-run /path/to/pdf/directory
179
+
180
+ # Actually rename files
181
+ pdf-renamer --no-dry-run /path/to/pdf/directory
182
+
183
+ # Interactive mode - review each file
184
+ pdf-renamer --interactive --no-dry-run /path/to/pdf/directory
185
+ ```
186
+
187
+ ### Using uvx (No Installation)
188
+
189
+ ```bash
190
+ # Run directly without installing
191
+ uvx pdf-renamer --dry-run /path/to/pdfs
192
+
193
+ # Run from GitHub
194
+ uvx https://github.com/nostoslabs/pdf-renamer --dry-run /path/to/pdfs
195
+ ```
196
+
197
+ ### Options
198
+
199
+ - `--dry-run/--no-dry-run`: Show suggestions without renaming (default: True)
200
+ - `--interactive, -i`: Interactive mode with rich options:
201
+ - **Accept** - Use the suggested filename
202
+ - **Edit** - Manually modify the filename
203
+ - **Retry** - Ask the LLM to generate a new suggestion
204
+ - **Skip** - Skip this file and move to the next
205
+ - `--model`: Model to use (default: llama3.2, works with any OpenAI-compatible API)
206
+ - `--url`: Custom base URL for OpenAI-compatible APIs (default: http://localhost:11434/v1)
207
+ - `--pattern`: Glob pattern for files (default: *.pdf)
208
+ - `--output-dir, -o`: Move renamed files to a different directory
209
+ - `--max-concurrent-api`: Maximum concurrent API calls (default: 3)
210
+ - `--max-concurrent-pdf`: Maximum concurrent PDF extractions (default: 10)
211
+
212
+ ### Examples
213
+
214
+ **Using OpenAI:**
215
+ ```bash
216
+ # Preview all PDFs in current directory
217
+ uvx pdf-renamer --dry-run .
218
+
219
+ # Rename PDFs in specific directory
220
+ uvx pdf-renamer --no-dry-run ~/Documents/Papers
221
+
222
+ # Use a different OpenAI model
223
+ uvx pdf-renamer --model gpt-4o --dry-run .
224
+ ```
225
+
226
+ **Using Ollama (or other local models):**
227
+ ```bash
228
+ # Using Ollama on patmos server with gemma model
229
+ uvx pdf-renamer --url http://patmos:11434/v1 --model gemma3:latest --dry-run .
230
+
231
+ # Using local Ollama with qwen model
232
+ uvx pdf-renamer --url http://localhost:11434/v1 --model qwen2.5 --dry-run .
233
+
234
+ # Set URL in environment and just use model flag
235
+ export LLM_BASE_URL=http://patmos:11434/v1
236
+ uvx pdf-renamer --model gemma3:latest --dry-run .
237
+ ```
238
+
239
+ **Other examples:**
240
+ ```bash
241
+ # Process only specific files
242
+ uvx pdf-renamer --pattern "*2020*.pdf" --dry-run .
243
+
244
+ # Interactive mode with local model
245
+ uvx pdf-renamer --url http://patmos:11434/v1 --model gemma3:latest --interactive --no-dry-run .
246
+
247
+ # Run directly from GitHub
248
+ uvx https://github.com/nostoslabs/pdf-renamer --no-dry-run ~/Documents/Papers
249
+ ```
250
+
251
+ ## Interactive Mode
252
+
253
+ When using `--interactive` mode, you'll be presented with each file one at a time with detailed options:
254
+
255
+ ```
256
+ ================================================================================
257
+ Original: 2024-research-paper.pdf
258
+ Suggested: Smith-Machine-Learning-Applications-2024.pdf
259
+ Confidence: high
260
+ Reasoning: Clear author and topic identified from abstract
261
+ ================================================================================
262
+
263
+ Options:
264
+ y / yes / Enter - Accept suggested name
265
+ e / edit - Manually edit the filename
266
+ r / retry - Ask LLM to generate a new suggestion
267
+ n / no / skip - Skip this file
268
+
269
+ What would you like to do? [y]:
270
+ ```
271
+
272
+ This mode is perfect for:
273
+ - **Reviewing suggestions** before applying them
274
+ - **Fine-tuning filenames** that are close but not quite right
275
+ - **Retrying** when the LLM suggestion isn't good enough
276
+ - **Building confidence** in the tool before batch processing
277
+
278
+ You can use interactive mode with `--dry-run` to preview without actually renaming files, or with `--no-dry-run` to apply changes immediately after confirmation.
279
+
280
+ ## How It Works
281
+
282
+ ### Intelligent Hybrid Approach
283
+
284
+ The tool uses a multi-strategy approach to generate accurate filenames:
285
+
286
+ 1. **DOI Detection** (for academic papers)
287
+ - Searches PDF for DOI identifiers using [pdf2doi](https://github.com/MicheleCotrufo/pdf2doi)
288
+ - **Validates DOI metadata** against PDF content to prevent citation DOI mismatches
289
+ - If found and validated, queries authoritative metadata (title, authors, year, journal)
290
+ - Generates filename with **very high confidence** from validated metadata
291
+ - **Saves API costs** - no LLM call needed for papers with DOIs
292
+
293
+ 2. **LLM Analysis** (fallback for non-academic PDFs)
294
+ - **Extract**: Uses docling-parse to read first 5 pages with structure-aware parsing, falls back to PyMuPDF if needed
295
+ - **OCR**: Automatically applies OCR for scanned PDFs with minimal text
296
+ - **Metadata Enhancement**: Extracts focused hints (years, emails, author sections) to supplement unreliable PDF metadata
297
+ - **Analyze**: Sends full content excerpt to LLM with enhanced metadata and detailed extraction instructions
298
+ - **Multi-pass Review**: Low-confidence results trigger a second analysis pass with focused prompts
299
+ - **Suggest**: LLM returns filename in `Author-Topic-Year` format with confidence level and reasoning
300
+
301
+ 3. **Interactive Review** (optional): User can accept, edit, retry, or skip each suggestion
302
+ 4. **Rename**: Applies suggestions (if not in dry-run mode)
303
+
304
+ ### Benefits of DOI Integration
305
+
306
+ - **Accuracy**: DOI metadata is canonical and verified
307
+ - **Speed**: Instant lookup vs. LLM processing time
308
+ - **Cost**: Free DOI lookups save on API costs for academic papers
309
+ - **Reliability**: Works even when PDF text extraction is poor
310
+
311
+ ## Cost Considerations
312
+
313
+ **DOI-based Naming (Academic Papers):**
314
+ - **Completely free** - No API costs
315
+ - **No LLM needed** - Direct metadata lookup
316
+ - Works for most academic papers with embedded DOIs
317
+
318
+ **OpenAI (Fallback):**
319
+ - Uses `gpt-4o-mini` by default (very cost-effective)
320
+ - Only called when DOI not found
321
+ - Processes first ~4500 characters per PDF
322
+ - Typical cost: ~$0.001-0.003 per PDF
323
+
324
+ **Ollama/Local Models:**
325
+ - Completely free (runs on your hardware)
326
+ - Works with any Ollama model (llama3, qwen2.5, mistral, etc.)
327
+ - Also compatible with LM Studio, vLLM, and other OpenAI-compatible endpoints
328
+
329
+ ## Filename Format
330
+
331
+ The tool generates filenames in this format:
332
+ - `Smith-Kalman-Filtering-Applications-2020.pdf`
333
+ - `Adamy-Electronic-Warfare-Modeling-Techniques.pdf`
334
+ - `Blair-Monopulse-Processing-Unresolved-Targets.pdf`
335
+
336
+ Guidelines:
337
+ - First author's last name
338
+ - 3-6 word topic description (prioritizes clarity over brevity)
339
+ - Year (if identifiable)
340
+ - Hyphens between words
341
+ - Target ~80 characters (can be longer if needed for clarity)
342
+
343
+ ## Architecture
344
+
345
+ This project follows **Clean Architecture** principles with clear separation of concerns:
346
+
347
+ ```
348
+ src/pdf_file_renamer/
349
+ ├── domain/ # Core business logic (models, ports)
350
+ ├── application/ # Use cases and workflows
351
+ ├── infrastructure/ # External integrations (PDF, LLM, DOI)
352
+ └── presentation/ # CLI and UI components
353
+ ```
354
+
355
+ **Key Design Patterns:**
356
+ - **Ports and Adapters** - Clean interfaces for external dependencies
357
+ - **Dependency Injection** - Flexible component composition
358
+ - **Single Responsibility** - Each module has one clear purpose
359
+ - **Type Safety** - Full mypy strict mode compliance
360
+
361
+ See [REFACTORING_SUMMARY.md](REFACTORING_SUMMARY.md) for detailed architecture notes.
362
+
363
+ ## Development
364
+
365
+ ### Setup
366
+
367
+ ```bash
368
+ # Clone repository
369
+ git clone https://github.com/nostoslabs/pdf-renamer.git
370
+ cd pdf-renamer
371
+
372
+ # Install dependencies with uv
373
+ uv sync
374
+
375
+ # Run tests
376
+ uv run pytest
377
+
378
+ # Run linting
379
+ uv run ruff check src/ tests/
380
+
381
+ # Run type checking
382
+ uv run mypy src/
383
+ ```
384
+
385
+ ### Code Quality
386
+
387
+ - **Tests**: pytest with async support and coverage reporting
388
+ - **Linting**: ruff for fast, comprehensive linting
389
+ - **Formatting**: ruff format for consistent code style
390
+ - **Type Checking**: mypy in strict mode
391
+ - **CI/CD**: GitHub Actions for automated testing and releases
392
+
393
+ ### Running Locally
394
+
395
+ ```bash
396
+ # Run with local changes
397
+ uv run pdf-file-renamer --dry-run /path/to/pdfs
398
+
399
+ # Run specific module
400
+ uv run python -m pdf_file_renamer.main --help
401
+ ```
402
+
403
+ ## Contributing
404
+
405
+ Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
406
+
407
+ ### Development Workflow
408
+
409
+ 1. Fork the repository
410
+ 2. Create a feature branch (`git checkout -b feature/amazing-feature`)
411
+ 3. Make your changes
412
+ 4. Run tests and linting (`uv run pytest && uv run ruff check src/`)
413
+ 5. Commit your changes (`git commit -m 'Add amazing feature'`)
414
+ 6. Push to the branch (`git push origin feature/amazing-feature`)
415
+ 7. Open a Pull Request
416
+
417
+ ### Code Style
418
+
419
+ - Follow PEP 8 (enforced by ruff)
420
+ - Use type hints for all functions
421
+ - Write tests for new features
422
+ - Update documentation as needed
423
+
424
+ ## License
425
+
426
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
427
+
428
+ ## Acknowledgments
429
+
430
+ - [pdf2doi](https://github.com/MicheleCotrufo/pdf2doi) for DOI extraction
431
+ - [pydantic-ai](https://ai.pydantic.dev/) for LLM integration
432
+ - [docling-parse](https://github.com/DS4SD/docling-parse) for advanced PDF parsing
433
+ - [PyMuPDF](https://pymupdf.readthedocs.io/) for PDF text extraction
434
+ - [rich](https://rich.readthedocs.io/) for beautiful terminal UI
435
+
436
+ ## Support
437
+
438
+ - **Issues**: [GitHub Issues](https://github.com/nostoslabs/pdf-renamer/issues)
439
+ - **Discussions**: [GitHub Discussions](https://github.com/nostoslabs/pdf-renamer/discussions)
440
+ - **Changelog**: [CHANGELOG.md](CHANGELOG.md)
441
+
442
+ ---
443
+
444
+ **Made with ❤️ by [Nostos Labs](https://github.com/nostoslabs)**
@@ -1,4 +1,4 @@
1
- pdf_file_renamer/__init__.py,sha256=1hyyq0EM6vqGG8Gxxdkg3MuLU_4Mwj3mc812ikutUB8,85
1
+ pdf_file_renamer/__init__.py,sha256=ag2NG1Rry9SOlQHvUnNzrgujU5GkDJZ8Fh7FKCuSRNk,85
2
2
  pdf_file_renamer/main.py,sha256=FTEEb-9QmOOsN9SE8L1SZvFVIkVpQDy8xZ5a8t8CWUs,145
3
3
  pdf_file_renamer/application/__init__.py,sha256=riSV7UXBenkDst7Nnf11N1_RuRtM7wpKdwugxOhumS4,363
4
4
  pdf_file_renamer/application/filename_service.py,sha256=IbeCNBwyhFlCMCZveq16nmQ2qvyTdtgLmr6PDWPckOs,4868
@@ -10,7 +10,7 @@ pdf_file_renamer/domain/ports.py,sha256=ebOcHptiOK119NCmIwM32_fbRK5xkZP9K67vjL-4
10
10
  pdf_file_renamer/infrastructure/__init__.py,sha256=C3ZQ7WCPCa6PMfP00lu4wqb0r57GVyDdiD5EL2DhCeY,187
11
11
  pdf_file_renamer/infrastructure/config.py,sha256=baNL5_6_NNiS50ZNdql7fDwQbeAwf6f58HGYIWFQxQQ,2464
12
12
  pdf_file_renamer/infrastructure/doi/__init__.py,sha256=8N9ZEwfG7q5xomzh187YtP8t4CfEBHM334xNRblPeuI,153
13
- pdf_file_renamer/infrastructure/doi/pdf2doi_extractor.py,sha256=mK2Z5oOwN-TgiEHLgoLM5yCSe_-G9kWXLr4Sw3nMkEM,5105
13
+ pdf_file_renamer/infrastructure/doi/pdf2doi_extractor.py,sha256=1tQ7fQF3TPxUZ7By9dzKz4LAfE8TPyjlvt8lACqGiLk,9551
14
14
  pdf_file_renamer/infrastructure/llm/__init__.py,sha256=ToB8__mHvXwaIukGKPEAQ8SeC4ZLiH4auZI1P1yH5PQ,159
15
15
  pdf_file_renamer/infrastructure/llm/pydantic_ai_provider.py,sha256=kVsmj0NIawkj-1WWM0hZXbsNH09GabVZm9HPlYsxGuo,9217
16
16
  pdf_file_renamer/infrastructure/pdf/__init__.py,sha256=uMHqxSXNLZH5WH_e1kXrp9m7uTqPkiI2hXjNo6rCRoo,368
@@ -19,9 +19,9 @@ pdf_file_renamer/infrastructure/pdf/docling_extractor.py,sha256=auZrJpK7mMg1mUXK
19
19
  pdf_file_renamer/infrastructure/pdf/pymupdf_extractor.py,sha256=C61udZCqGqiVx7T0HWNyjvnhgv5AgMIcCYtrhgHOJwk,5465
20
20
  pdf_file_renamer/presentation/__init__.py,sha256=1VR44GoPGTixk3hG5YzhGyQf7a4BTKsJBd2VP3rHcFM,211
21
21
  pdf_file_renamer/presentation/cli.py,sha256=0t_59-utRWLNCYjFetU0ZHoF1DPTjdNiWM9Au0jFaOg,8013
22
- pdf_file_renamer/presentation/formatters.py,sha256=Es7pZoHw5bEPtNfa_s43eHXa_m0yrTmX6S2aU78JUE0,8978
23
- pdf_file_renamer-0.6.1.dist-info/METADATA,sha256=OyZKW601xnQFXR-SDLakLEnasq5rtfP7YO6IYn6f-z4,9912
24
- pdf_file_renamer-0.6.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
25
- pdf_file_renamer-0.6.1.dist-info/entry_points.txt,sha256=0fEGYy60chGE9rECWeCVPxjxzz6vMtIAYdFvmH7xzbw,63
26
- pdf_file_renamer-0.6.1.dist-info/licenses/LICENSE,sha256=_w08V08WgoMpDMlGNlkIatC5QfQ_Ds_rXOBM8pl7ffE,1068
27
- pdf_file_renamer-0.6.1.dist-info/RECORD,,
22
+ pdf_file_renamer/presentation/formatters.py,sha256=8Vz95QupJKkPgPgRyMVmA_gxRWG5vfxdnSd7Czovlrg,8946
23
+ pdf_file_renamer-0.6.3.dist-info/METADATA,sha256=ywxT5kRE2VGcv1HUuwvqrAeaVw7ksYsn6Y6MTa5hShA,16952
24
+ pdf_file_renamer-0.6.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
25
+ pdf_file_renamer-0.6.3.dist-info/entry_points.txt,sha256=0fEGYy60chGE9rECWeCVPxjxzz6vMtIAYdFvmH7xzbw,63
26
+ pdf_file_renamer-0.6.3.dist-info/licenses/LICENSE,sha256=_w08V08WgoMpDMlGNlkIatC5QfQ_Ds_rXOBM8pl7ffE,1068
27
+ pdf_file_renamer-0.6.3.dist-info/RECORD,,
@@ -1,272 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: pdf-file-renamer
3
- Version: 0.6.1
4
- Summary: Intelligent PDF renaming using LLMs
5
- License-File: LICENSE
6
- Requires-Python: >=3.11
7
- Requires-Dist: docling-core>=2.0.0
8
- Requires-Dist: docling-parse>=2.0.0
9
- Requires-Dist: pdf2doi>=1.7
10
- Requires-Dist: pydantic-ai>=1.0.17
11
- Requires-Dist: pydantic-settings>=2.7.1
12
- Requires-Dist: pydantic>=2.10.6
13
- Requires-Dist: pymupdf>=1.26.5
14
- Requires-Dist: python-dotenv>=1.1.1
15
- Requires-Dist: rich>=14.2.0
16
- Requires-Dist: tenacity>=9.0.0
17
- Requires-Dist: typer>=0.19.2
18
- Provides-Extra: dev
19
- Requires-Dist: mypy>=1.14.1; extra == 'dev'
20
- Requires-Dist: pytest-asyncio>=0.25.2; extra == 'dev'
21
- Requires-Dist: pytest-cov>=6.0.0; extra == 'dev'
22
- Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
23
- Requires-Dist: pytest>=8.3.4; extra == 'dev'
24
- Requires-Dist: ruff>=0.9.1; extra == 'dev'
25
- Description-Content-Type: text/markdown
26
-
27
- # PDF Renamer
28
-
29
- [![PyPI version](https://img.shields.io/pypi/v/pdf-file-renamer.svg)](https://pypi.org/project/pdf-file-renamer/)
30
- [![Python](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
31
- [![uv](https://img.shields.io/badge/uv-0.5+-orange.svg)](https://docs.astral.sh/uv/)
32
- [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
33
- [![pydantic-ai](https://img.shields.io/badge/pydantic--ai-1.0+-green.svg)](https://ai.pydantic.dev/)
34
- [![GitHub](https://img.shields.io/badge/github-nostoslabs%2Fpdf--renamer-blue?logo=github)](https://github.com/nostoslabs/pdf-renamer)
35
-
36
- [![Tests](https://img.shields.io/badge/tests-passing-brightgreen.svg)](https://github.com/nostoslabs/pdf-renamer)
37
- [![Code style: ruff](https://img.shields.io/badge/code%20style-ruff-000000.svg)](https://github.com/astral-sh/ruff)
38
- [![Type checked: mypy](https://img.shields.io/badge/type%20checked-mypy-blue.svg)](http://mypy-lang.org/)
39
- [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
40
-
41
- Intelligent PDF file renaming using LLMs. This tool analyzes PDF content and metadata to suggest descriptive, standardized filenames.
42
-
43
- > 🚀 Works with **OpenAI**, **Ollama**, **LM Studio**, and any OpenAI-compatible API
44
-
45
- ## Features
46
-
47
- - **DOI-based naming** - Automatically extracts DOI and fetches authoritative metadata for academic papers
48
- - **Advanced PDF parsing** using docling-parse for better structure-aware extraction
49
- - **OCR fallback** for scanned PDFs with low text content
50
- - **Smart LLM prompting** with multi-pass analysis for improved accuracy
51
- - **Hybrid approach** - Uses DOI metadata when available, falls back to LLM analysis for other documents
52
- - Suggests filenames in format: `Author-Topic-Year.pdf`
53
- - Dry-run mode to preview changes before applying
54
- - **Enhanced interactive mode** with options to accept, manually edit, retry, or skip each file
55
- - **Live progress display** with concurrent processing for speed
56
- - **Configurable concurrency** limits for API calls and PDF extraction
57
- - Batch processing of multiple PDFs with optional output directory
58
-
59
- ## Installation
60
-
61
- ### Quick Start (No Installation Required)
62
-
63
- ```bash
64
- # Run directly with uvx
65
- uvx pdf-renamer --dry-run /path/to/pdfs
66
- ```
67
-
68
- ### Install from PyPI
69
-
70
- ```bash
71
- # Using pip
72
- pip install pdf-file-renamer
73
-
74
- # Using uv
75
- uv pip install pdf-file-renamer
76
- ```
77
-
78
- ### Install from Source
79
-
80
- ```bash
81
- # Clone and install
82
- git clone https://github.com/nostoslabs/pdf-renamer.git
83
- cd pdf-renamer
84
- uv sync
85
- ```
86
-
87
- ## Configuration
88
-
89
- Configure your LLM provider:
90
-
91
- **Option A: OpenAI (Cloud)**
92
- ```bash
93
- cp .env.example .env
94
- # Edit .env and add your OPENAI_API_KEY
95
- ```
96
-
97
- **Option B: Ollama or other local models**
98
- ```bash
99
- # No API key needed for local models
100
- # Either set LLM_BASE_URL in .env or use --url flag
101
- echo "LLM_BASE_URL=http://patmos:11434/v1" > .env
102
- ```
103
-
104
- ## Usage
105
-
106
- ### Quick Start
107
-
108
- ```bash
109
- # Preview renames (dry-run mode)
110
- pdf-renamer --dry-run /path/to/pdf/directory
111
-
112
- # Actually rename files
113
- pdf-renamer --no-dry-run /path/to/pdf/directory
114
-
115
- # Interactive mode - review each file
116
- pdf-renamer --interactive --no-dry-run /path/to/pdf/directory
117
- ```
118
-
119
- ### Using uvx (No Installation)
120
-
121
- ```bash
122
- # Run directly without installing
123
- uvx pdf-renamer --dry-run /path/to/pdfs
124
-
125
- # Run from GitHub
126
- uvx https://github.com/nostoslabs/pdf-renamer --dry-run /path/to/pdfs
127
- ```
128
-
129
- ### Options
130
-
131
- - `--dry-run/--no-dry-run`: Show suggestions without renaming (default: True)
132
- - `--interactive, -i`: Interactive mode with rich options:
133
- - **Accept** - Use the suggested filename
134
- - **Edit** - Manually modify the filename
135
- - **Retry** - Ask the LLM to generate a new suggestion
136
- - **Skip** - Skip this file and move to the next
137
- - `--model`: Model to use (default: llama3.2, works with any OpenAI-compatible API)
138
- - `--url`: Custom base URL for OpenAI-compatible APIs (default: http://localhost:11434/v1)
139
- - `--pattern`: Glob pattern for files (default: *.pdf)
140
- - `--output-dir, -o`: Move renamed files to a different directory
141
- - `--max-concurrent-api`: Maximum concurrent API calls (default: 3)
142
- - `--max-concurrent-pdf`: Maximum concurrent PDF extractions (default: 10)
143
-
144
- ### Examples
145
-
146
- **Using OpenAI:**
147
- ```bash
148
- # Preview all PDFs in current directory
149
- uvx pdf-renamer --dry-run .
150
-
151
- # Rename PDFs in specific directory
152
- uvx pdf-renamer --no-dry-run ~/Documents/Papers
153
-
154
- # Use a different OpenAI model
155
- uvx pdf-renamer --model gpt-4o --dry-run .
156
- ```
157
-
158
- **Using Ollama (or other local models):**
159
- ```bash
160
- # Using Ollama on patmos server with gemma model
161
- uvx pdf-renamer --url http://patmos:11434/v1 --model gemma3:latest --dry-run .
162
-
163
- # Using local Ollama with qwen model
164
- uvx pdf-renamer --url http://localhost:11434/v1 --model qwen2.5 --dry-run .
165
-
166
- # Set URL in environment and just use model flag
167
- export LLM_BASE_URL=http://patmos:11434/v1
168
- uvx pdf-renamer --model gemma3:latest --dry-run .
169
- ```
170
-
171
- **Other examples:**
172
- ```bash
173
- # Process only specific files
174
- uvx pdf-renamer --pattern "*2020*.pdf" --dry-run .
175
-
176
- # Interactive mode with local model
177
- uvx pdf-renamer --url http://patmos:11434/v1 --model gemma3:latest --interactive --no-dry-run .
178
-
179
- # Run directly from GitHub
180
- uvx https://github.com/nostoslabs/pdf-renamer --no-dry-run ~/Documents/Papers
181
- ```
182
-
183
- ## Interactive Mode
184
-
185
- When using `--interactive` mode, you'll be presented with each file one at a time with detailed options:
186
-
187
- ```
188
- ================================================================================
189
- Original: 2024-research-paper.pdf
190
- Suggested: Smith-Machine-Learning-Applications-2024.pdf
191
- Confidence: high
192
- Reasoning: Clear author and topic identified from abstract
193
- ================================================================================
194
-
195
- Options:
196
- y / yes / Enter - Accept suggested name
197
- e / edit - Manually edit the filename
198
- r / retry - Ask LLM to generate a new suggestion
199
- n / no / skip - Skip this file
200
-
201
- What would you like to do? [y]:
202
- ```
203
-
204
- This mode is perfect for:
205
- - **Reviewing suggestions** before applying them
206
- - **Fine-tuning filenames** that are close but not quite right
207
- - **Retrying** when the LLM suggestion isn't good enough
208
- - **Building confidence** in the tool before batch processing
209
-
210
- You can use interactive mode with `--dry-run` to preview without actually renaming files, or with `--no-dry-run` to apply changes immediately after confirmation.
211
-
212
- ## How It Works
213
-
214
- ### Intelligent Hybrid Approach
215
-
216
- The tool uses a multi-strategy approach to generate accurate filenames:
217
-
218
- 1. **DOI Detection** (for academic papers)
219
- - Searches the PDF for DOI identifiers using [pdf2doi](https://github.com/MicheleCotrufo/pdf2doi)
220
- - If found, queries authoritative metadata (title, authors, year, journal)
221
- - Generates filename with **very high confidence** from validated metadata
222
- - **Saves API costs** - no LLM call needed for papers with DOIs
223
-
224
- 2. **LLM Analysis** (fallback for non-academic PDFs)
225
- - **Extract**: Uses docling-parse to read the first 5 pages with structure-aware parsing, falling back to PyMuPDF if needed
226
- - **OCR**: Automatically applies OCR for scanned PDFs with minimal text
227
- - **Metadata Enhancement**: Extracts focused hints (years, emails, author sections) to supplement unreliable PDF metadata
228
- - **Analyze**: Sends full content excerpt to LLM with enhanced metadata and detailed extraction instructions
229
- - **Multi-pass Review**: Low-confidence results trigger a second analysis pass with focused prompts
230
- - **Suggest**: LLM returns filename in `Author-Topic-Year` format with confidence level and reasoning
231
-
232
- 3. **Interactive Review** (optional): User can accept, edit, retry, or skip each suggestion
233
- 4. **Rename**: Applies suggestions (if not in dry-run mode)
234
-
235
- ### Benefits of DOI Integration
236
-
237
- - **Accuracy**: DOI metadata is canonical and verified
238
- - **Speed**: Instant lookup vs. LLM processing time
239
- - **Cost**: Free DOI lookups save on API costs for academic papers
240
- - **Reliability**: Works even when PDF text extraction is poor
241
-
242
- ## Cost Considerations
243
-
244
- **DOI-based Naming (Academic Papers):**
245
- - **Completely free** - No API costs
246
- - **No LLM needed** - Direct metadata lookup
247
- - Works for most academic papers with embedded DOIs
248
-
249
- **OpenAI (Fallback):**
250
- - Uses `gpt-4o-mini` by default (very cost-effective)
251
- - Only called when DOI not found
252
- - Processes the first ~4500 characters per PDF
253
- - Typical cost: ~$0.001-0.003 per PDF
254
-
255
- **Ollama/Local Models:**
256
- - Completely free (runs on your hardware)
257
- - Works with any Ollama model (llama3, qwen2.5, mistral, etc.)
258
- - Also compatible with LM Studio, vLLM, and other OpenAI-compatible endpoints
259
-
260
- ## Filename Format
261
-
262
- The tool generates filenames in this format:
263
- - `Smith-Kalman-Filtering-Applications-2020.pdf`
264
- - `Adamy-Electronic-Warfare-Modeling-Techniques.pdf`
265
- - `Blair-Monopulse-Processing-Unresolved-Targets.pdf`
266
-
267
- Guidelines:
268
- - First author's last name
269
- - 3-6 word topic description (prioritizes clarity over brevity)
270
- - Year (if identifiable)
271
- - Hyphens between words
272
- - Target ~80 characters (can be longer if needed for clarity)