pdf-file-renamer 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,3 @@
1
1
  """PDF Renamer - Intelligent PDF file renaming using LLMs."""
2
2
 
3
- __version__ = "0.6.0"
3
+ __version__ = "0.6.2"
@@ -5,7 +5,7 @@ import contextlib
5
5
  from collections.abc import Callable
6
6
  from pathlib import Path
7
7
 
8
- from pdf_file_renamer.domain.models import FileRenameOperation
8
+ from pdf_file_renamer.domain.models import ConfidenceLevel, FileRenameOperation
9
9
  from pdf_file_renamer.domain.ports import (
10
10
  DOIExtractor,
11
11
  FilenameGenerator,
@@ -103,12 +103,18 @@ class PDFRenameWorkflow:
103
103
 
104
104
  # Mark complete
105
105
  if status_callback:
106
+ # result.confidence may already be a plain string (use_enum_values=True), so handle both enum and str
107
+ confidence_str = (
108
+ result.confidence.value
109
+ if isinstance(result.confidence, ConfidenceLevel)
110
+ else result.confidence
111
+ )
106
112
  status_callback(
107
113
  filename,
108
114
  {
109
115
  "status": "Complete",
110
116
  "stage": "✓",
111
- "confidence": result.confidence.value,
117
+ "confidence": confidence_str,
112
118
  },
113
119
  )
114
120
 
@@ -1,6 +1,7 @@
1
1
  """DOI extraction using pdf2doi library."""
2
2
 
3
3
  import asyncio
4
+ import contextlib
4
5
  import re
5
6
  from pathlib import Path
6
7
 
@@ -31,16 +32,12 @@ class PDF2DOIExtractor(DOIExtractor):
31
32
  try:
32
33
  # Run pdf2doi in executor to avoid blocking
33
34
  loop = asyncio.get_event_loop()
34
- results = await loop.run_in_executor(
35
- None, pdf2doi.pdf2doi, str(pdf_path)
36
- )
35
+ result = await loop.run_in_executor(None, pdf2doi.pdf2doi, str(pdf_path))
37
36
 
38
- if not results or len(results) == 0:
37
+ # pdf2doi returns a dict (not a list)
38
+ if not result or not isinstance(result, dict):
39
39
  return None
40
40
 
41
- # Get the first result
42
- result = results[0]
43
-
44
41
  # Check if DOI was found
45
42
  identifier = result.get("identifier")
46
43
  if not identifier:
@@ -50,15 +47,49 @@ class PDF2DOIExtractor(DOIExtractor):
50
47
  if identifier_type.lower() not in ("doi", "arxiv"):
51
48
  return None
52
49
 
53
- # Extract metadata from validation_info (bibtex)
50
+ # Extract metadata from validation_info (JSON string from CrossRef API)
54
51
  validation_info = result.get("validation_info", "")
55
52
 
56
- # Parse bibtex for metadata
57
- title = self._extract_bibtex_field(validation_info, "title")
58
- authors = self._extract_bibtex_authors(validation_info)
59
- year = self._extract_bibtex_field(validation_info, "year")
60
- journal = self._extract_bibtex_field(validation_info, "journal")
61
- publisher = self._extract_bibtex_field(validation_info, "publisher")
53
+ # Parse JSON metadata
54
+ import json
55
+
56
+ metadata = {}
57
+ if validation_info:
58
+ with contextlib.suppress(json.JSONDecodeError):
59
+ metadata = json.loads(validation_info)
60
+
61
+ # Extract title
62
+ title = metadata.get("title")
63
+
64
+ # Extract authors (list of dicts with 'given' and 'family' fields)
65
+ authors: list[str] | None = None
66
+ if "author" in metadata:
67
+ author_list = metadata["author"]
68
+ author_names: list[str] = []
69
+ for author in author_list:
70
+ if isinstance(author, dict):
71
+ family = author.get("family", "")
72
+ given = author.get("given", "")
73
+ if family:
74
+ full_name = f"{given} {family}".strip() if given else family
75
+ author_names.append(full_name)
76
+ if author_names:
77
+ authors = author_names
78
+
79
+ # Extract year from the first available of published-online, published, or created
80
+ year = None
81
+ for date_field in ["published-online", "published", "created"]:
82
+ if date_field in metadata and "date-parts" in metadata[date_field]:
83
+ date_parts = metadata[date_field]["date-parts"]
84
+ if date_parts and len(date_parts) > 0 and len(date_parts[0]) > 0:
85
+ year = str(date_parts[0][0])
86
+ break
87
+
88
+ # Extract journal (container-title)
89
+ journal = metadata.get("container-title")
90
+
91
+ # Extract publisher
92
+ publisher = metadata.get("publisher")
62
93
 
63
94
  return DOIMetadata(
64
95
  doi=identifier,
@@ -7,7 +7,7 @@ from rich.prompt import Prompt
7
7
  from rich.table import Table
8
8
  from rich.text import Text
9
9
 
10
- from pdf_file_renamer.domain.models import FileRenameOperation
10
+ from pdf_file_renamer.domain.models import ConfidenceLevel, FileRenameOperation
11
11
 
12
12
 
13
13
  class ProgressDisplay:
@@ -146,7 +146,13 @@ class InteractivePrompt:
146
146
  info_text.append("Suggested: ", style="bold green")
147
147
  info_text.append(f"{operation.new_filename}\n", style="green")
148
148
  info_text.append("Confidence: ", style="bold yellow")
149
- info_text.append(f"{operation.confidence.value}\n", style="yellow")
149
+ # Handle both enum and string confidence
150
+ conf_str = (
151
+ operation.confidence.value
152
+ if isinstance(operation.confidence, ConfidenceLevel)
153
+ else operation.confidence
154
+ )
155
+ info_text.append(f"{conf_str}\n", style="yellow")
150
156
  info_text.append("Reasoning: ", style="bold white")
151
157
  info_text.append(operation.reasoning, style="dim white")
152
158
 
@@ -206,10 +212,14 @@ class ResultsTable:
206
212
  reasoning = op.reasoning
207
213
  if len(reasoning) > 100:
208
214
  reasoning = reasoning[:100] + "..."
215
+ # Handle both enum and string confidence
216
+ conf_str = (
217
+ op.confidence.value if isinstance(op.confidence, ConfidenceLevel) else op.confidence
218
+ )
209
219
  table.add_row(
210
220
  op.original_path.name,
211
221
  op.new_filename,
212
- op.confidence.value,
222
+ conf_str,
213
223
  reasoning,
214
224
  )
215
225
 
@@ -0,0 +1,443 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdf-file-renamer
3
+ Version: 0.6.2
4
+ Summary: Intelligent PDF renaming using LLMs with DOI-based naming and interactive workflow
5
+ Project-URL: Homepage, https://github.com/nostoslabs/pdf-renamer
6
+ Project-URL: Repository, https://github.com/nostoslabs/pdf-renamer
7
+ Project-URL: Issues, https://github.com/nostoslabs/pdf-renamer/issues
8
+ Project-URL: Changelog, https://github.com/nostoslabs/pdf-renamer/blob/main/CHANGELOG.md
9
+ Author-email: Nostos Labs <info@nostoslabs.com>
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Keywords: academic-papers,ai,automation,document-management,doi,file-organization,llm,pdf,rename
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Environment :: Console
15
+ Classifier: Intended Audience :: Education
16
+ Classifier: Intended Audience :: End Users/Desktop
17
+ Classifier: Intended Audience :: Science/Research
18
+ Classifier: License :: OSI Approved :: MIT License
19
+ Classifier: Operating System :: OS Independent
20
+ Classifier: Programming Language :: Python :: 3
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Office/Business :: Office Suites
24
+ Classifier: Topic :: Scientific/Engineering
25
+ Classifier: Topic :: Text Processing :: General
26
+ Classifier: Topic :: Utilities
27
+ Classifier: Typing :: Typed
28
+ Requires-Python: >=3.11
29
+ Requires-Dist: docling-core>=2.0.0
30
+ Requires-Dist: docling-parse>=2.0.0
31
+ Requires-Dist: pdf2doi>=1.7
32
+ Requires-Dist: pydantic-ai>=1.0.17
33
+ Requires-Dist: pydantic-settings>=2.7.1
34
+ Requires-Dist: pydantic>=2.10.6
35
+ Requires-Dist: pymupdf>=1.26.5
36
+ Requires-Dist: python-dotenv>=1.1.1
37
+ Requires-Dist: rich>=14.2.0
38
+ Requires-Dist: tenacity>=9.0.0
39
+ Requires-Dist: typer>=0.19.2
40
+ Provides-Extra: dev
41
+ Requires-Dist: mypy>=1.14.1; extra == 'dev'
42
+ Requires-Dist: pytest-asyncio>=0.25.2; extra == 'dev'
43
+ Requires-Dist: pytest-cov>=6.0.0; extra == 'dev'
44
+ Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
45
+ Requires-Dist: pytest>=8.3.4; extra == 'dev'
46
+ Requires-Dist: ruff>=0.9.1; extra == 'dev'
47
+ Description-Content-Type: text/markdown
48
+
49
+ # PDF Renamer
50
+
51
+ [![PyPI version](https://img.shields.io/pypi/v/pdf-file-renamer.svg)](https://pypi.org/project/pdf-file-renamer/)
52
+ [![PyPI downloads](https://img.shields.io/pypi/dm/pdf-file-renamer.svg)](https://pypi.org/project/pdf-file-renamer/)
53
+ [![Python](https://img.shields.io/pypi/pyversions/pdf-file-renamer.svg)](https://pypi.org/project/pdf-file-renamer/)
54
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
55
+ [![CI](https://github.com/nostoslabs/pdf-renamer/workflows/CI/badge.svg)](https://github.com/nostoslabs/pdf-renamer/actions)
56
+ [![Code style: ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
57
+ [![Type checked: mypy](https://img.shields.io/badge/type%20checked-mypy-blue.svg)](http://mypy-lang.org/)
58
+
59
+ **Intelligent PDF file renaming using LLMs and DOI metadata.** Automatically generate clean, descriptive filenames for your PDF library.
60
+
61
+ > 🚀 Works with **OpenAI**, **Ollama**, **LM Studio**, and any OpenAI-compatible API
62
+ > 📚 **DOI-first** approach for academic papers - no API costs!
63
+ > 🎯 **Interactive mode** with retry, edit, and skip options
64
+
65
+ ## Table of Contents
66
+
67
+ - [Quick Example](#quick-example)
68
+ - [Features](#features)
69
+ - [Installation](#installation)
70
+ - [Configuration](#configuration)
71
+ - [Usage](#usage)
72
+ - [Interactive Mode](#interactive-mode)
73
+ - [How It Works](#how-it-works)
74
+ - [Cost Considerations](#cost-considerations)
75
+ - [Architecture](#architecture)
76
+ - [Development](#development)
77
+ - [Contributing](#contributing)
78
+ - [License](#license)
79
+
80
+ ## Quick Example
81
+
82
+ ![Demo](demo.gif)
83
+
84
+ Transform messy filenames into clean, organized ones:
85
+
86
+ ```
87
+ Before: After:
88
+ 📄 paper_final_v3.pdf → Leroux-Analog-In-memory-Computing-2025.pdf
89
+ 📄 download (2).pdf → Ruiz-Why-Don-Trace-Requirements-2023.pdf
90
+ 📄 document.pdf → Raspail-Camp_of_the_Saints.pdf
91
+ ```
92
+
93
+ **Live Progress Display:**
94
+ ```
95
+ Processing 3 PDFs with max 3 concurrent API calls and 10 concurrent extractions
96
+
97
+ ╭─────────────────────────── 📊 Progress ───────────────────────────╮
98
+ │ Total: 3 | Pending: 0 | Extracting: 0 | Analyzing: 0 | Complete: 3 │
99
+ ╰───────────────────────────────────────────────────────────────────╯
100
+ ╭───────────────────────────────────────────────────────────────────╮
101
+ │ [██████████████████████████████████████████████] 100.0% │
102
+ ╰───────────────────────────────────────────────────────────────────╯
103
+ Processing Status
104
+ ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
105
+ ┃ File ┃ Stage ┃ Status ┃ Details ┃
106
+ ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
107
+ │ paper_final_v3.pdf │ ✓ │ Complete │ very_high │
108
+ │ download (2).pdf │ ✓ │ Complete │ very_high (DOI) │
109
+ │ document.pdf │ ✓ │ Complete │ high │
110
+ └────────────────────┴───────┴──────────┴─────────────────────┘
111
+ ```
112
+
113
+ ## Features
114
+
115
+ - **🎓 DOI-based naming** - Automatically extracts DOI and fetches authoritative metadata for academic papers
116
+ - **🧠 Advanced PDF parsing** using docling-parse for better structure-aware extraction
117
+ - **👁️ OCR fallback** for scanned PDFs with low text content
118
+ - **🎯 Smart LLM prompting** with multi-pass analysis for improved accuracy
119
+ - **⚡ Hybrid approach** - Uses DOI metadata when available, falls back to LLM analysis for other documents
120
+ - **📝 Standardized format** - Generates filenames like `Author-Topic-Year.pdf`
121
+ - **🔍 Dry-run mode** to preview changes before applying
122
+ - **💬 Enhanced interactive mode** with options to accept, manually edit, retry, or skip each file
123
+ - **📊 Live progress display** with concurrent processing for speed
124
+ - **⚙️ Configurable concurrency** limits for API calls and PDF extraction
125
+ - **📦 Batch processing** of multiple PDFs with optional output directory
126
+
127
+ ## Installation
128
+
129
+ ### Quick Start (No Installation Required)
130
+
131
+ ```bash
132
+ # Run directly with uvx
133
+ uvx pdf-renamer --dry-run /path/to/pdfs
134
+ ```
135
+
136
+ ### Install from PyPI
137
+
138
+ ```bash
139
+ # Using pip
140
+ pip install pdf-file-renamer
141
+
142
+ # Using uv
143
+ uv pip install pdf-file-renamer
144
+ ```
145
+
146
+ ### Install from Source
147
+
148
+ ```bash
149
+ # Clone and install
150
+ git clone https://github.com/nostoslabs/pdf-renamer.git
151
+ cd pdf-renamer
152
+ uv sync
153
+ ```
154
+
155
+ ## Configuration
156
+
157
+ Configure your LLM provider:
158
+
159
+ **Option A: OpenAI (Cloud)**
160
+ ```bash
161
+ cp .env.example .env
162
+ # Edit .env and add your OPENAI_API_KEY
163
+ ```
164
+
165
+ **Option B: Ollama or other local models**
166
+ ```bash
167
+ # No API key needed for local models
168
+ # Either set LLM_BASE_URL in .env or use --url flag
169
+ echo "LLM_BASE_URL=http://patmos:11434/v1" > .env
170
+ ```
171
+
172
+ ## Usage
173
+
174
+ ### Quick Start
175
+
176
+ ```bash
177
+ # Preview renames (dry-run mode)
178
+ pdf-renamer --dry-run /path/to/pdf/directory
179
+
180
+ # Actually rename files
181
+ pdf-renamer --no-dry-run /path/to/pdf/directory
182
+
183
+ # Interactive mode - review each file
184
+ pdf-renamer --interactive --no-dry-run /path/to/pdf/directory
185
+ ```
186
+
187
+ ### Using uvx (No Installation)
188
+
189
+ ```bash
190
+ # Run directly without installing
191
+ uvx pdf-renamer --dry-run /path/to/pdfs
192
+
193
+ # Run from GitHub
194
+ uvx https://github.com/nostoslabs/pdf-renamer --dry-run /path/to/pdfs
195
+ ```
196
+
197
+ ### Options
198
+
199
+ - `--dry-run/--no-dry-run`: Show suggestions without renaming files (default: `--dry-run`; pass `--no-dry-run` to actually rename)
200
+ - `--interactive, -i`: Interactive mode with rich options:
201
+ - **Accept** - Use the suggested filename
202
+ - **Edit** - Manually modify the filename
203
+ - **Retry** - Ask the LLM to generate a new suggestion
204
+ - **Skip** - Skip this file and move to the next
205
+ - `--model`: Model to use (default: llama3.2, works with any OpenAI-compatible API)
206
+ - `--url`: Custom base URL for OpenAI-compatible APIs (default: http://localhost:11434/v1)
207
+ - `--pattern`: Glob pattern for files (default: *.pdf)
208
+ - `--output-dir, -o`: Move renamed files to a different directory
209
+ - `--max-concurrent-api`: Maximum concurrent API calls (default: 3)
210
+ - `--max-concurrent-pdf`: Maximum concurrent PDF extractions (default: 10)
211
+
212
+ ### Examples
213
+
214
+ **Using OpenAI:**
215
+ ```bash
216
+ # Preview all PDFs in current directory
217
+ uvx pdf-renamer --dry-run .
218
+
219
+ # Rename PDFs in specific directory
220
+ uvx pdf-renamer --no-dry-run ~/Documents/Papers
221
+
222
+ # Use a different OpenAI model
223
+ uvx pdf-renamer --model gpt-4o --dry-run .
224
+ ```
225
+
226
+ **Using Ollama (or other local models):**
227
+ ```bash
228
+ # Using Ollama on patmos server with gemma model
229
+ uvx pdf-renamer --url http://patmos:11434/v1 --model gemma3:latest --dry-run .
230
+
231
+ # Using local Ollama with qwen model
232
+ uvx pdf-renamer --url http://localhost:11434/v1 --model qwen2.5 --dry-run .
233
+
234
+ # Set URL in environment and just use model flag
235
+ export LLM_BASE_URL=http://patmos:11434/v1
236
+ uvx pdf-renamer --model gemma3:latest --dry-run .
237
+ ```
238
+
239
+ **Other examples:**
240
+ ```bash
241
+ # Process only specific files
242
+ uvx pdf-renamer --pattern "*2020*.pdf" --dry-run .
243
+
244
+ # Interactive mode with local model
245
+ uvx pdf-renamer --url http://patmos:11434/v1 --model gemma3:latest --interactive --no-dry-run .
246
+
247
+ # Run directly from GitHub
248
+ uvx https://github.com/nostoslabs/pdf-renamer --no-dry-run ~/Documents/Papers
249
+ ```
250
+
251
+ ## Interactive Mode
252
+
253
+ When using `--interactive` mode, you'll be presented with each file one at a time with detailed options:
254
+
255
+ ```
256
+ ================================================================================
257
+ Original: 2024-research-paper.pdf
258
+ Suggested: Smith-Machine-Learning-Applications-2024.pdf
259
+ Confidence: high
260
+ Reasoning: Clear author and topic identified from abstract
261
+ ================================================================================
262
+
263
+ Options:
264
+ y / yes / Enter - Accept suggested name
265
+ e / edit - Manually edit the filename
266
+ r / retry - Ask LLM to generate a new suggestion
267
+ n / no / skip - Skip this file
268
+
269
+ What would you like to do? [y]:
270
+ ```
271
+
272
+ This mode is perfect for:
273
+ - **Reviewing suggestions** before applying them
274
+ - **Fine-tuning filenames** that are close but not quite right
275
+ - **Retrying** when the LLM suggestion isn't good enough
276
+ - **Building confidence** in the tool before batch processing
277
+
278
+ You can use interactive mode with `--dry-run` to preview without actually renaming files, or with `--no-dry-run` to apply changes immediately after confirmation.
279
+
280
+ ## How It Works
281
+
282
+ ### Intelligent Hybrid Approach
283
+
284
+ The tool uses a multi-strategy approach to generate accurate filenames:
285
+
286
+ 1. **DOI Detection** (for academic papers)
287
+ - Searches PDF for DOI identifiers using [pdf2doi](https://github.com/MicheleCotrufo/pdf2doi)
288
+ - If found, queries authoritative metadata (title, authors, year, journal)
289
+ - Generates filename with **very high confidence** from validated metadata
290
+ - **Saves API costs** - no LLM call needed for papers with DOIs
291
+
292
+ 2. **LLM Analysis** (fallback when no DOI is found)
293
+ - **Extract**: Uses docling-parse to read first 5 pages with structure-aware parsing, falls back to PyMuPDF if needed
294
+ - **OCR**: Automatically applies OCR for scanned PDFs with minimal text
295
+ - **Metadata Enhancement**: Extracts focused hints (years, emails, author sections) to supplement unreliable PDF metadata
296
+ - **Analyze**: Sends full content excerpt to LLM with enhanced metadata and detailed extraction instructions
297
+ - **Multi-pass Review**: Low-confidence results trigger a second analysis pass with focused prompts
298
+ - **Suggest**: LLM returns filename in `Author-Topic-Year` format with confidence level and reasoning
299
+
300
+ 3. **Interactive Review** (optional): User can accept, edit, retry, or skip each suggestion
301
+ 4. **Rename**: Applies suggestions (if not in dry-run mode)
302
+
303
+ ### Benefits of DOI Integration
304
+
305
+ - **Accuracy**: DOI metadata is canonical and verified
306
+ - **Speed**: Instant lookup vs. LLM processing time
307
+ - **Cost**: Free DOI lookups save on API costs for academic papers
308
+ - **Reliability**: Works even when PDF text extraction is poor
309
+
310
+ ## Cost Considerations
311
+
312
+ **DOI-based Naming (Academic Papers):**
313
+ - **Completely free** - No API costs
314
+ - **No LLM needed** - Direct metadata lookup
315
+ - Works for most academic papers with embedded DOIs
316
+
317
+ **OpenAI (Fallback):**
318
+ - Uses `gpt-4o-mini` by default (very cost-effective)
319
+ - Only called when DOI not found
320
+ - Processes first ~4500 characters per PDF
321
+ - Typical cost: ~$0.001-0.003 per PDF
322
+
323
+ **Ollama/Local Models:**
324
+ - Completely free (runs on your hardware)
325
+ - Works with any Ollama model (llama3, qwen2.5, mistral, etc.)
326
+ - Also compatible with LM Studio, vLLM, and other OpenAI-compatible endpoints
327
+
328
+ ## Filename Format
329
+
330
+ The tool generates filenames in this format:
331
+ - `Smith-Kalman-Filtering-Applications-2020.pdf`
332
+ - `Adamy-Electronic-Warfare-Modeling-Techniques.pdf`
333
+ - `Blair-Monopulse-Processing-Unresolved-Targets.pdf`
334
+
335
+ Guidelines:
336
+ - First author's last name
337
+ - 3-6 word topic description (prioritizes clarity over brevity)
338
+ - Year (if identifiable)
339
+ - Hyphens between words
340
+ - Target ~80 characters (can be longer if needed for clarity)
341
+
342
+ ## Architecture
343
+
344
+ This project follows **Clean Architecture** principles with clear separation of concerns:
345
+
346
+ ```
347
+ src/pdf_file_renamer/
348
+ ├── domain/ # Core business logic (models, ports)
349
+ ├── application/ # Use cases and workflows
350
+ ├── infrastructure/ # External integrations (PDF, LLM, DOI)
351
+ └── presentation/ # CLI and UI components
352
+ ```
353
+
354
+ **Key Design Patterns:**
355
+ - **Ports and Adapters** - Clean interfaces for external dependencies
356
+ - **Dependency Injection** - Flexible component composition
357
+ - **Single Responsibility** - Each module has one clear purpose
358
+ - **Type Safety** - Full mypy strict mode compliance
359
+
360
+ See [REFACTORING_SUMMARY.md](REFACTORING_SUMMARY.md) for detailed architecture notes.
361
+
362
+ ## Development
363
+
364
+ ### Setup
365
+
366
+ ```bash
367
+ # Clone repository
368
+ git clone https://github.com/nostoslabs/pdf-renamer.git
369
+ cd pdf-renamer
370
+
371
+ # Install dependencies with uv
372
+ uv sync
373
+
374
+ # Run tests
375
+ uv run pytest
376
+
377
+ # Run linting
378
+ uv run ruff check src/ tests/
379
+
380
+ # Run type checking
381
+ uv run mypy src/
382
+ ```
383
+
384
+ ### Code Quality
385
+
386
+ - **Tests**: pytest with async support and coverage reporting
387
+ - **Linting**: ruff for fast, comprehensive linting
388
+ - **Formatting**: ruff format for consistent code style
389
+ - **Type Checking**: mypy in strict mode
390
+ - **CI/CD**: GitHub Actions for automated testing and releases
391
+
392
+ ### Running Locally
393
+
394
+ ```bash
395
+ # Run with local changes
396
+ uv run pdf-file-renamer --dry-run /path/to/pdfs
397
+
398
+ # Run specific module
399
+ uv run python -m pdf_file_renamer.main --help
400
+ ```
401
+
402
+ ## Contributing
403
+
404
+ Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
405
+
406
+ ### Development Workflow
407
+
408
+ 1. Fork the repository
409
+ 2. Create a feature branch (`git checkout -b feature/amazing-feature`)
410
+ 3. Make your changes
411
+ 4. Run tests and linting (`uv run pytest && uv run ruff check src/`)
412
+ 5. Commit your changes (`git commit -m 'Add amazing feature'`)
413
+ 6. Push to the branch (`git push origin feature/amazing-feature`)
414
+ 7. Open a Pull Request
415
+
416
+ ### Code Style
417
+
418
+ - Follow PEP 8 (enforced by ruff)
419
+ - Use type hints for all functions
420
+ - Write tests for new features
421
+ - Update documentation as needed
422
+
423
+ ## License
424
+
425
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
426
+
427
+ ## Acknowledgments
428
+
429
+ - [pdf2doi](https://github.com/MicheleCotrufo/pdf2doi) for DOI extraction
430
+ - [pydantic-ai](https://ai.pydantic.dev/) for LLM integration
431
+ - [docling-parse](https://github.com/DS4SD/docling-parse) for advanced PDF parsing
432
+ - [PyMuPDF](https://pymupdf.readthedocs.io/) for PDF text extraction
433
+ - [rich](https://rich.readthedocs.io/) for beautiful terminal UI
434
+
435
+ ## Support
436
+
437
+ - **Issues**: [GitHub Issues](https://github.com/nostoslabs/pdf-renamer/issues)
438
+ - **Discussions**: [GitHub Discussions](https://github.com/nostoslabs/pdf-renamer/discussions)
439
+ - **Changelog**: [CHANGELOG.md](CHANGELOG.md)
440
+
441
+ ---
442
+
443
+ **Made with ❤️ by [Nostos Labs](https://github.com/nostoslabs)**
@@ -1,8 +1,8 @@
1
- pdf_file_renamer/__init__.py,sha256=yCEfy0jblhbUMNTSjJKPuW4zADXoI6IfICx8XvB4R8Q,85
1
+ pdf_file_renamer/__init__.py,sha256=ag2NG1Rry9SOlQHvUnNzrgujU5GkDJZ8Fh7FKCuSRNk,85
2
2
  pdf_file_renamer/main.py,sha256=FTEEb-9QmOOsN9SE8L1SZvFVIkVpQDy8xZ5a8t8CWUs,145
3
3
  pdf_file_renamer/application/__init__.py,sha256=riSV7UXBenkDst7Nnf11N1_RuRtM7wpKdwugxOhumS4,363
4
4
  pdf_file_renamer/application/filename_service.py,sha256=IbeCNBwyhFlCMCZveq16nmQ2qvyTdtgLmr6PDWPckOs,4868
5
- pdf_file_renamer/application/pdf_rename_workflow.py,sha256=gd53KoR1aFrK__6TArm7Rtn1yNxylEI2ikmubDOByF4,5842
5
+ pdf_file_renamer/application/pdf_rename_workflow.py,sha256=WLcGJ4ufEmAnGSxVQcOFDeGG8gXSccs11DaP521YDzo,6144
6
6
  pdf_file_renamer/application/rename_service.py,sha256=vviNQolk_w-qDQvOKTKj8ZhqYyyNWL-VJMfuUnL6WLw,2357
7
7
  pdf_file_renamer/domain/__init__.py,sha256=jxbH3h6xaCnSRuBxclFESl6ZE1pua_I1K4CRAaYxu_I,503
8
8
  pdf_file_renamer/domain/models.py,sha256=QwN79TzWmqvQvz-m9ymebvAx3pWlVpSWXNdSEAk4qq0,3186
@@ -10,7 +10,7 @@ pdf_file_renamer/domain/ports.py,sha256=ebOcHptiOK119NCmIwM32_fbRK5xkZP9K67vjL-4
10
10
  pdf_file_renamer/infrastructure/__init__.py,sha256=C3ZQ7WCPCa6PMfP00lu4wqb0r57GVyDdiD5EL2DhCeY,187
11
11
  pdf_file_renamer/infrastructure/config.py,sha256=baNL5_6_NNiS50ZNdql7fDwQbeAwf6f58HGYIWFQxQQ,2464
12
12
  pdf_file_renamer/infrastructure/doi/__init__.py,sha256=8N9ZEwfG7q5xomzh187YtP8t4CfEBHM334xNRblPeuI,153
13
- pdf_file_renamer/infrastructure/doi/pdf2doi_extractor.py,sha256=rAo0q5HTCqVTyyIXzW3D6riGS5Q9xlXhbT2AY0Hb7nk,3820
13
+ pdf_file_renamer/infrastructure/doi/pdf2doi_extractor.py,sha256=714WU8MRQF2mWFEdB9MSm2nexivIByKxciOyArgfkTs,5114
14
14
  pdf_file_renamer/infrastructure/llm/__init__.py,sha256=ToB8__mHvXwaIukGKPEAQ8SeC4ZLiH4auZI1P1yH5PQ,159
15
15
  pdf_file_renamer/infrastructure/llm/pydantic_ai_provider.py,sha256=kVsmj0NIawkj-1WWM0hZXbsNH09GabVZm9HPlYsxGuo,9217
16
16
  pdf_file_renamer/infrastructure/pdf/__init__.py,sha256=uMHqxSXNLZH5WH_e1kXrp9m7uTqPkiI2hXjNo6rCRoo,368
@@ -19,9 +19,9 @@ pdf_file_renamer/infrastructure/pdf/docling_extractor.py,sha256=auZrJpK7mMg1mUXK
19
19
  pdf_file_renamer/infrastructure/pdf/pymupdf_extractor.py,sha256=C61udZCqGqiVx7T0HWNyjvnhgv5AgMIcCYtrhgHOJwk,5465
20
20
  pdf_file_renamer/presentation/__init__.py,sha256=1VR44GoPGTixk3hG5YzhGyQf7a4BTKsJBd2VP3rHcFM,211
21
21
  pdf_file_renamer/presentation/cli.py,sha256=0t_59-utRWLNCYjFetU0ZHoF1DPTjdNiWM9Au0jFaOg,8013
22
- pdf_file_renamer/presentation/formatters.py,sha256=Yl-Et7OKMfthyLqTA5qEtSAqh0PfHKp3lNNBA_dn01c,8519
23
- pdf_file_renamer-0.6.0.dist-info/METADATA,sha256=6XmqT7jtJuqkWlks3FlPWCpNP_tKOGokhxy2Yju5R7k,9912
24
- pdf_file_renamer-0.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
25
- pdf_file_renamer-0.6.0.dist-info/entry_points.txt,sha256=0fEGYy60chGE9rECWeCVPxjxzz6vMtIAYdFvmH7xzbw,63
26
- pdf_file_renamer-0.6.0.dist-info/licenses/LICENSE,sha256=_w08V08WgoMpDMlGNlkIatC5QfQ_Ds_rXOBM8pl7ffE,1068
27
- pdf_file_renamer-0.6.0.dist-info/RECORD,,
22
+ pdf_file_renamer/presentation/formatters.py,sha256=8Vz95QupJKkPgPgRyMVmA_gxRWG5vfxdnSd7Czovlrg,8946
23
+ pdf_file_renamer-0.6.2.dist-info/METADATA,sha256=qwnly-Ce8cu-S1pNnj6-NwuPC3ZbdOxGTtLheEK0IKc,16851
24
+ pdf_file_renamer-0.6.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
25
+ pdf_file_renamer-0.6.2.dist-info/entry_points.txt,sha256=0fEGYy60chGE9rECWeCVPxjxzz6vMtIAYdFvmH7xzbw,63
26
+ pdf_file_renamer-0.6.2.dist-info/licenses/LICENSE,sha256=_w08V08WgoMpDMlGNlkIatC5QfQ_Ds_rXOBM8pl7ffE,1068
27
+ pdf_file_renamer-0.6.2.dist-info/RECORD,,
@@ -1,272 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: pdf-file-renamer
3
- Version: 0.6.0
4
- Summary: Intelligent PDF renaming using LLMs
5
- License-File: LICENSE
6
- Requires-Python: >=3.11
7
- Requires-Dist: docling-core>=2.0.0
8
- Requires-Dist: docling-parse>=2.0.0
9
- Requires-Dist: pdf2doi>=1.7
10
- Requires-Dist: pydantic-ai>=1.0.17
11
- Requires-Dist: pydantic-settings>=2.7.1
12
- Requires-Dist: pydantic>=2.10.6
13
- Requires-Dist: pymupdf>=1.26.5
14
- Requires-Dist: python-dotenv>=1.1.1
15
- Requires-Dist: rich>=14.2.0
16
- Requires-Dist: tenacity>=9.0.0
17
- Requires-Dist: typer>=0.19.2
18
- Provides-Extra: dev
19
- Requires-Dist: mypy>=1.14.1; extra == 'dev'
20
- Requires-Dist: pytest-asyncio>=0.25.2; extra == 'dev'
21
- Requires-Dist: pytest-cov>=6.0.0; extra == 'dev'
22
- Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
23
- Requires-Dist: pytest>=8.3.4; extra == 'dev'
24
- Requires-Dist: ruff>=0.9.1; extra == 'dev'
25
- Description-Content-Type: text/markdown
26
-
27
- # PDF Renamer
28
-
29
- [![PyPI version](https://img.shields.io/pypi/v/pdf-file-renamer.svg)](https://pypi.org/project/pdf-file-renamer/)
30
- [![Python](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
31
- [![uv](https://img.shields.io/badge/uv-0.5+-orange.svg)](https://docs.astral.sh/uv/)
32
- [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
33
- [![pydantic-ai](https://img.shields.io/badge/pydantic--ai-1.0+-green.svg)](https://ai.pydantic.dev/)
34
- [![GitHub](https://img.shields.io/badge/github-nostoslabs%2Fpdf--renamer-blue?logo=github)](https://github.com/nostoslabs/pdf-renamer)
35
-
36
- [![Tests](https://img.shields.io/badge/tests-passing-brightgreen.svg)](https://github.com/nostoslabs/pdf-renamer)
37
- [![Code style: ruff](https://img.shields.io/badge/code%20style-ruff-000000.svg)](https://github.com/astral-sh/ruff)
38
- [![Type checked: mypy](https://img.shields.io/badge/type%20checked-mypy-blue.svg)](http://mypy-lang.org/)
39
- [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
40
-
41
- Intelligent PDF file renaming using LLMs. This tool analyzes PDF content and metadata to suggest descriptive, standardized filenames.
42
-
43
- > 🚀 Works with **OpenAI**, **Ollama**, **LM Studio**, and any OpenAI-compatible API
44
-
45
- ## Features
46
-
47
- - **DOI-based naming** - Automatically extracts the DOI and fetches authoritative metadata for academic papers
48
- - **Advanced PDF parsing** using docling-parse for better structure-aware extraction
49
- - **OCR fallback** for scanned PDFs with low text content
50
- - **Smart LLM prompting** with multi-pass analysis for improved accuracy
51
- - **Hybrid approach** - Uses DOI metadata when available, falls back to LLM analysis for other documents
52
- - Suggests filenames in format: `Author-Topic-Year.pdf`
53
- - Dry-run mode to preview changes before applying
54
- - **Enhanced interactive mode** with options to accept, manually edit, retry, or skip each file
55
- - **Live progress display** with concurrent processing for speed
56
- - **Configurable concurrency** limits for API calls and PDF extraction
57
- - Batch processing of multiple PDFs with optional output directory
58
-
59
- ## Installation
60
-
61
- ### Quick Start (No Installation Required)
62
-
63
- ```bash
64
- # Run directly with uvx
65
- uvx pdf-renamer --dry-run /path/to/pdfs
66
- ```
67
-
68
- ### Install from PyPI
69
-
70
- ```bash
71
- # Using pip
72
- pip install pdf-file-renamer
73
-
74
- # Using uv
75
- uv pip install pdf-file-renamer
76
- ```
77
-
78
- ### Install from Source
79
-
80
- ```bash
81
- # Clone and install
82
- git clone https://github.com/nostoslabs/pdf-renamer.git
83
- cd pdf-renamer
84
- uv sync
85
- ```
86
-
87
- ## Configuration
88
-
89
- Configure your LLM provider:
90
-
91
- **Option A: OpenAI (Cloud)**
92
- ```bash
93
- cp .env.example .env
94
- # Edit .env and add your OPENAI_API_KEY
95
- ```
96
-
97
- **Option B: Ollama or other local models**
98
- ```bash
99
- # No API key needed for local models
100
- # Either set LLM_BASE_URL in .env or use the --url flag
101
- echo "LLM_BASE_URL=http://patmos:11434/v1" > .env
102
- ```
103
-
104
- ## Usage
105
-
106
- ### Quick Start
107
-
108
- ```bash
109
- # Preview renames (dry-run mode)
110
- pdf-renamer --dry-run /path/to/pdf/directory
111
-
112
- # Actually rename files
113
- pdf-renamer --no-dry-run /path/to/pdf/directory
114
-
115
- # Interactive mode - review each file
116
- pdf-renamer --interactive --no-dry-run /path/to/pdf/directory
117
- ```
118
-
119
- ### Using uvx (No Installation)
120
-
121
- ```bash
122
- # Run directly without installing
123
- uvx pdf-renamer --dry-run /path/to/pdfs
124
-
125
- # Run from GitHub
126
- uvx https://github.com/nostoslabs/pdf-renamer --dry-run /path/to/pdfs
127
- ```
128
-
129
- ### Options
130
-
131
- - `--dry-run/--no-dry-run`: Show suggestions without renaming (default: True)
132
- - `--interactive, -i`: Interactive mode with rich options:
133
- - **Accept** - Use the suggested filename
134
- - **Edit** - Manually modify the filename
135
- - **Retry** - Ask the LLM to generate a new suggestion
136
- - **Skip** - Skip this file and move to the next
137
- - `--model`: Model to use (default: llama3.2, works with any OpenAI-compatible API)
138
- - `--url`: Custom base URL for OpenAI-compatible APIs (default: http://localhost:11434/v1)
139
- - `--pattern`: Glob pattern for files (default: *.pdf)
140
- - `--output-dir, -o`: Move renamed files to a different directory
141
- - `--max-concurrent-api`: Maximum concurrent API calls (default: 3)
142
- - `--max-concurrent-pdf`: Maximum concurrent PDF extractions (default: 10)
143
-
144
- ### Examples
145
-
146
- **Using OpenAI:**
147
- ```bash
148
- # Preview all PDFs in current directory
149
- uvx pdf-renamer --dry-run .
150
-
151
- # Rename PDFs in specific directory
152
- uvx pdf-renamer --no-dry-run ~/Documents/Papers
153
-
154
- # Use a different OpenAI model
155
- uvx pdf-renamer --model gpt-4o --dry-run .
156
- ```
157
-
158
- **Using Ollama (or other local models):**
159
- ```bash
160
- # Using Ollama on patmos server with gemma model
161
- uvx pdf-renamer --url http://patmos:11434/v1 --model gemma3:latest --dry-run .
162
-
163
- # Using local Ollama with qwen model
164
- uvx pdf-renamer --url http://localhost:11434/v1 --model qwen2.5 --dry-run .
165
-
166
- # Set URL in environment and just use model flag
167
- export LLM_BASE_URL=http://patmos:11434/v1
168
- uvx pdf-renamer --model gemma3:latest --dry-run .
169
- ```
170
-
171
- **Other examples:**
172
- ```bash
173
- # Process only specific files
174
- uvx pdf-renamer --pattern "*2020*.pdf" --dry-run .
175
-
176
- # Interactive mode with local model
177
- uvx pdf-renamer --url http://patmos:11434/v1 --model gemma3:latest --interactive --no-dry-run .
178
-
179
- # Run directly from GitHub
180
- uvx https://github.com/nostoslabs/pdf-renamer --no-dry-run ~/Documents/Papers
181
- ```
182
-
183
- ## Interactive Mode
184
-
185
- When using `--interactive` mode, you'll be presented with each file one at a time with detailed options:
186
-
187
- ```
188
- ================================================================================
189
- Original: 2024-research-paper.pdf
190
- Suggested: Smith-Machine-Learning-Applications-2024.pdf
191
- Confidence: high
192
- Reasoning: Clear author and topic identified from abstract
193
- ================================================================================
194
-
195
- Options:
196
- y / yes / Enter - Accept suggested name
197
- e / edit - Manually edit the filename
198
- r / retry - Ask LLM to generate a new suggestion
199
- n / no / skip - Skip this file
200
-
201
- What would you like to do? [y]:
202
- ```
203
-
204
- This mode is perfect for:
205
- - **Reviewing suggestions** before applying them
206
- - **Fine-tuning filenames** that are close but not quite right
207
- - **Retrying** when the LLM suggestion isn't good enough
208
- - **Building confidence** in the tool before batch processing
209
-
210
- You can use interactive mode with `--dry-run` to preview without actually renaming files, or with `--no-dry-run` to apply changes immediately after confirmation.
211
-
212
- ## How It Works
213
-
214
- ### Intelligent Hybrid Approach
215
-
216
- The tool uses a multi-strategy approach to generate accurate filenames:
217
-
218
- 1. **DOI Detection** (for academic papers)
219
- - Searches PDF for DOI identifiers using [pdf2doi](https://github.com/MicheleCotrufo/pdf2doi)
220
- - If found, queries authoritative metadata (title, authors, year, journal)
221
- - Generates filename with **very high confidence** from validated metadata
222
- - **Saves API costs** - no LLM call needed for papers with DOIs
223
-
224
- 2. **LLM Analysis** (fallback for non-academic PDFs)
225
- - **Extract**: Uses docling-parse to read the first 5 pages with structure-aware parsing, falling back to PyMuPDF if needed
226
- - **OCR**: Automatically applies OCR for scanned PDFs with minimal text
227
- - **Metadata Enhancement**: Extracts focused hints (years, emails, author sections) to supplement unreliable PDF metadata
228
- - **Analyze**: Sends full content excerpt to LLM with enhanced metadata and detailed extraction instructions
229
- - **Multi-pass Review**: Low-confidence results trigger a second analysis pass with focused prompts
230
- - **Suggest**: LLM returns filename in `Author-Topic-Year` format with confidence level and reasoning
231
-
232
- 3. **Interactive Review** (optional): User can accept, edit, retry, or skip each suggestion
233
- 4. **Rename**: Applies suggestions (if not in dry-run mode)
234
-
235
- ### Benefits of DOI Integration
236
-
237
- - **Accuracy**: DOI metadata is canonical and verified
238
- - **Speed**: Instant lookup vs. LLM processing time
239
- - **Cost**: Free DOI lookups save on API costs for academic papers
240
- - **Reliability**: Works even when PDF text extraction is poor
241
-
242
- ## Cost Considerations
243
-
244
- **DOI-based Naming (Academic Papers):**
245
- - **Completely free** - No API costs
246
- - **No LLM needed** - Direct metadata lookup
247
- - Works for most academic papers with embedded DOIs
248
-
249
- **OpenAI (Fallback):**
250
- - Works well with `gpt-4o-mini` (very cost-effective); select it with `--model gpt-4o-mini`, since the documented `--model` default is `llama3.2`
251
- - Only called when DOI not found
252
- - Processes first ~4500 characters per PDF
253
- - Typical cost: ~$0.001-0.003 per PDF
254
-
255
- **Ollama/Local Models:**
256
- - Completely free (runs on your hardware)
257
- - Works with any Ollama model (llama3, qwen2.5, mistral, etc.)
258
- - Also compatible with LM Studio, vLLM, and other OpenAI-compatible endpoints
259
-
260
- ## Filename Format
261
-
262
- The tool generates filenames in this format:
263
- - `Smith-Kalman-Filtering-Applications-2020.pdf`
264
- - `Adamy-Electronic-Warfare-Modeling-Techniques.pdf`
265
- - `Blair-Monopulse-Processing-Unresolved-Targets.pdf`
266
-
267
- Guidelines:
268
- - First author's last name
269
- - 3-6 word topic description (prioritizes clarity over brevity)
270
- - Year (if identifiable)
271
- - Hyphens between words
272
- - Target ~80 characters (can be longer if needed for clarity)