pdf-file-renamer 0.4.2__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf_file_renamer-0.6.0/.env.example +9 -0
- pdf_file_renamer-0.6.0/.github/workflows/ci.yml +78 -0
- pdf_file_renamer-0.6.0/.github/workflows/release.yml +69 -0
- pdf_file_renamer-0.6.0/.gitignore +55 -0
- pdf_file_renamer-0.6.0/.python-version +1 -0
- {pdf_file_renamer-0.4.2/pdf_file_renamer.egg-info → pdf_file_renamer-0.6.0}/PKG-INFO +50 -23
- pdf_file_renamer-0.4.2/PKG-INFO → pdf_file_renamer-0.6.0/README.md +36 -35
- pdf_file_renamer-0.6.0/REFACTORING_SUMMARY.md +288 -0
- pdf_file_renamer-0.6.0/coverage.xml +854 -0
- {pdf_file_renamer-0.4.2 → pdf_file_renamer-0.6.0}/pyproject.toml +13 -4
- {pdf_file_renamer-0.4.2/pdf_renamer → pdf_file_renamer-0.6.0/src/pdf_file_renamer}/__init__.py +1 -1
- pdf_file_renamer-0.6.0/src/pdf_file_renamer/application/__init__.py +7 -0
- pdf_file_renamer-0.6.0/src/pdf_file_renamer/application/filename_service.py +172 -0
- {pdf_file_renamer-0.4.2/pdf_renamer → pdf_file_renamer-0.6.0/src/pdf_file_renamer}/application/pdf_rename_workflow.py +29 -4
- {pdf_file_renamer-0.4.2/pdf_renamer → pdf_file_renamer-0.6.0/src/pdf_file_renamer}/application/rename_service.py +1 -1
- {pdf_file_renamer-0.4.2/pdf_renamer → pdf_file_renamer-0.6.0/src/pdf_file_renamer}/domain/__init__.py +2 -2
- {pdf_file_renamer-0.4.2/pdf_renamer → pdf_file_renamer-0.6.0/src/pdf_file_renamer}/domain/models.py +29 -0
- {pdf_file_renamer-0.4.2/pdf_renamer → pdf_file_renamer-0.6.0/src/pdf_file_renamer}/domain/ports.py +18 -1
- {pdf_file_renamer-0.4.2/pdf_renamer → pdf_file_renamer-0.6.0/src/pdf_file_renamer}/infrastructure/__init__.py +1 -1
- pdf_file_renamer-0.6.0/src/pdf_file_renamer/infrastructure/doi/__init__.py +5 -0
- pdf_file_renamer-0.6.0/src/pdf_file_renamer/infrastructure/doi/pdf2doi_extractor.py +129 -0
- pdf_file_renamer-0.6.0/src/pdf_file_renamer/infrastructure/llm/__init__.py +5 -0
- {pdf_file_renamer-0.4.2/pdf_renamer → pdf_file_renamer-0.6.0/src/pdf_file_renamer}/infrastructure/llm/pydantic_ai_provider.py +2 -2
- pdf_file_renamer-0.6.0/src/pdf_file_renamer/infrastructure/pdf/__init__.py +7 -0
- {pdf_file_renamer-0.4.2/pdf_renamer → pdf_file_renamer-0.6.0/src/pdf_file_renamer}/infrastructure/pdf/composite.py +2 -2
- {pdf_file_renamer-0.4.2/pdf_renamer → pdf_file_renamer-0.6.0/src/pdf_file_renamer}/infrastructure/pdf/docling_extractor.py +2 -2
- {pdf_file_renamer-0.4.2/pdf_renamer → pdf_file_renamer-0.6.0/src/pdf_file_renamer}/infrastructure/pdf/pymupdf_extractor.py +2 -2
- {pdf_file_renamer-0.4.2/pdf_renamer → pdf_file_renamer-0.6.0/src/pdf_file_renamer}/main.py +1 -1
- pdf_file_renamer-0.6.0/src/pdf_file_renamer/presentation/__init__.py +6 -0
- {pdf_file_renamer-0.4.2/pdf_renamer → pdf_file_renamer-0.6.0/src/pdf_file_renamer}/presentation/cli.py +10 -5
- {pdf_file_renamer-0.4.2/pdf_renamer → pdf_file_renamer-0.6.0/src/pdf_file_renamer}/presentation/formatters.py +1 -1
- pdf_file_renamer-0.6.0/tests/__init__.py +1 -0
- pdf_file_renamer-0.6.0/tests/data/2025-dennis-managing-complexity.pdf +0 -0
- pdf_file_renamer-0.6.0/tests/data/Camp_of_the_Saints.pdf +0 -0
- pdf_file_renamer-0.6.0/tests/data/s43588-025-00854-1.pdf +13838 -22
- {pdf_file_renamer-0.4.2 → pdf_file_renamer-0.6.0}/tests/test_domain_models.py +1 -1
- {pdf_file_renamer-0.4.2 → pdf_file_renamer-0.6.0}/tests/test_filename_service.py +3 -3
- {pdf_file_renamer-0.4.2 → pdf_file_renamer-0.6.0}/tests/test_rename_service.py +1 -1
- pdf_file_renamer-0.4.2/README.md +0 -219
- pdf_file_renamer-0.4.2/pdf_file_renamer.egg-info/SOURCES.txt +0 -32
- pdf_file_renamer-0.4.2/pdf_file_renamer.egg-info/dependency_links.txt +0 -1
- pdf_file_renamer-0.4.2/pdf_file_renamer.egg-info/entry_points.txt +0 -2
- pdf_file_renamer-0.4.2/pdf_file_renamer.egg-info/requires.txt +0 -18
- pdf_file_renamer-0.4.2/pdf_file_renamer.egg-info/top_level.txt +0 -1
- pdf_file_renamer-0.4.2/pdf_renamer/application/__init__.py +0 -7
- pdf_file_renamer-0.4.2/pdf_renamer/application/filename_service.py +0 -70
- pdf_file_renamer-0.4.2/pdf_renamer/infrastructure/llm/__init__.py +0 -5
- pdf_file_renamer-0.4.2/pdf_renamer/infrastructure/pdf/__init__.py +0 -7
- pdf_file_renamer-0.4.2/pdf_renamer/presentation/__init__.py +0 -6
- pdf_file_renamer-0.4.2/setup.cfg +0 -4
- {pdf_file_renamer-0.4.2 → pdf_file_renamer-0.6.0}/LICENSE +0 -0
- {pdf_file_renamer-0.4.2/pdf_renamer → pdf_file_renamer-0.6.0/src/pdf_file_renamer}/infrastructure/config.py +0 -0
@@ -0,0 +1,9 @@
|
|
1
|
+
# OpenAI API Key (required for OpenAI, optional for custom endpoints)
|
2
|
+
OPENAI_API_KEY=your_api_key_here
|
3
|
+
|
4
|
+
# Optional: Custom base URL for OpenAI-compatible APIs
|
5
|
+
# Examples:
|
6
|
+
# - Ollama: http://localhost:11434/v1
|
7
|
+
# - LM Studio: http://localhost:1234/v1
|
8
|
+
# - vLLM: http://your-server:8000/v1
|
9
|
+
# LLM_BASE_URL=http://localhost:11434/v1
|
@@ -0,0 +1,78 @@
|
|
1
|
+
name: CI
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
branches: [main, develop]
|
6
|
+
pull_request:
|
7
|
+
branches: [main, develop]
|
8
|
+
|
9
|
+
jobs:
|
10
|
+
test:
|
11
|
+
name: Test Python ${{ matrix.python-version }}
|
12
|
+
runs-on: ubuntu-latest
|
13
|
+
strategy:
|
14
|
+
matrix:
|
15
|
+
python-version: ["3.11", "3.12"]
|
16
|
+
|
17
|
+
steps:
|
18
|
+
- uses: actions/checkout@v4
|
19
|
+
|
20
|
+
- name: Install uv
|
21
|
+
uses: astral-sh/setup-uv@v4
|
22
|
+
with:
|
23
|
+
version: "latest"
|
24
|
+
|
25
|
+
- name: Set up Python ${{ matrix.python-version }}
|
26
|
+
run: uv python install ${{ matrix.python-version }}
|
27
|
+
|
28
|
+
- name: Install dependencies
|
29
|
+
run: uv sync --all-extras
|
30
|
+
|
31
|
+
- name: Run ruff linting
|
32
|
+
run: uv run ruff check src/pdf_file_renamer tests
|
33
|
+
|
34
|
+
- name: Run ruff formatting check
|
35
|
+
run: uv run ruff format --check src/pdf_file_renamer tests
|
36
|
+
|
37
|
+
- name: Run mypy type checking
|
38
|
+
run: uv run mypy src/pdf_file_renamer
|
39
|
+
|
40
|
+
- name: Run tests with coverage
|
41
|
+
run: uv run pytest tests/ --cov=pdf_file_renamer --cov-report=xml --cov-report=term
|
42
|
+
|
43
|
+
- name: Upload coverage to Codecov
|
44
|
+
uses: codecov/codecov-action@v4
|
45
|
+
if: matrix.python-version == '3.11'
|
46
|
+
with:
|
47
|
+
file: ./coverage.xml
|
48
|
+
fail_ci_if_error: false
|
49
|
+
|
50
|
+
build:
|
51
|
+
name: Build distribution
|
52
|
+
runs-on: ubuntu-latest
|
53
|
+
needs: test
|
54
|
+
|
55
|
+
steps:
|
56
|
+
- uses: actions/checkout@v4
|
57
|
+
|
58
|
+
- name: Install uv
|
59
|
+
uses: astral-sh/setup-uv@v4
|
60
|
+
with:
|
61
|
+
version: "latest"
|
62
|
+
|
63
|
+
- name: Set up Python
|
64
|
+
run: uv python install 3.11
|
65
|
+
|
66
|
+
- name: Build package
|
67
|
+
run: uv build
|
68
|
+
|
69
|
+
- name: Check build
|
70
|
+
run: |
|
71
|
+
ls -lh dist/
|
72
|
+
uv run twine check dist/*
|
73
|
+
|
74
|
+
- name: Upload artifacts
|
75
|
+
uses: actions/upload-artifact@v4
|
76
|
+
with:
|
77
|
+
name: dist
|
78
|
+
path: dist/
|
@@ -0,0 +1,69 @@
|
|
1
|
+
name: Release
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
tags:
|
6
|
+
- "v*"
|
7
|
+
|
8
|
+
permissions:
|
9
|
+
contents: write
|
10
|
+
|
11
|
+
jobs:
|
12
|
+
build-and-release:
|
13
|
+
name: Build and Release
|
14
|
+
runs-on: ubuntu-latest
|
15
|
+
|
16
|
+
steps:
|
17
|
+
- uses: actions/checkout@v4
|
18
|
+
|
19
|
+
- name: Install uv
|
20
|
+
uses: astral-sh/setup-uv@v4
|
21
|
+
with:
|
22
|
+
version: "latest"
|
23
|
+
|
24
|
+
- name: Set up Python
|
25
|
+
run: uv python install 3.11
|
26
|
+
|
27
|
+
- name: Install dependencies
|
28
|
+
run: uv sync --all-extras
|
29
|
+
|
30
|
+
- name: Run tests
|
31
|
+
run: uv run pytest tests/
|
32
|
+
|
33
|
+
- name: Build package
|
34
|
+
run: uv build
|
35
|
+
|
36
|
+
- name: Extract version from tag
|
37
|
+
id: get_version
|
38
|
+
run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT
|
39
|
+
|
40
|
+
- name: Publish to PyPI
|
41
|
+
env:
|
42
|
+
TWINE_USERNAME: __token__
|
43
|
+
TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
|
44
|
+
run: |
|
45
|
+
uv run twine upload dist/*
|
46
|
+
|
47
|
+
- name: Create Release
|
48
|
+
uses: softprops/action-gh-release@v1
|
49
|
+
with:
|
50
|
+
files: dist/*
|
51
|
+
generate_release_notes: true
|
52
|
+
body: |
|
53
|
+
## What's Changed
|
54
|
+
|
55
|
+
Release version ${{ steps.get_version.outputs.VERSION }}
|
56
|
+
|
57
|
+
See the [REFACTORING_SUMMARY.md](https://github.com/${{ github.repository }}/blob/${{ github.ref_name }}/REFACTORING_SUMMARY.md) for architecture details.
|
58
|
+
|
59
|
+
### Installation
|
60
|
+
|
61
|
+
**From PyPI:**
|
62
|
+
```bash
|
63
|
+
pip install pdf-file-renamer==${{ steps.get_version.outputs.VERSION }}
|
64
|
+
```
|
65
|
+
|
66
|
+
**Using uvx (no installation required):**
|
67
|
+
```bash
|
68
|
+
uvx --from pdf-file-renamer==${{ steps.get_version.outputs.VERSION }} pdf-renamer
|
69
|
+
```
|
@@ -0,0 +1,55 @@
|
|
1
|
+
.claude
|
2
|
+
# Python
|
3
|
+
__pycache__/
|
4
|
+
*.py[cod]
|
5
|
+
*$py.class
|
6
|
+
*.so
|
7
|
+
.Python
|
8
|
+
build/
|
9
|
+
develop-eggs/
|
10
|
+
dist/
|
11
|
+
downloads/
|
12
|
+
eggs/
|
13
|
+
.eggs/
|
14
|
+
lib/
|
15
|
+
lib64/
|
16
|
+
parts/
|
17
|
+
sdist/
|
18
|
+
var/
|
19
|
+
wheels/
|
20
|
+
*.egg-info/
|
21
|
+
.installed.cfg
|
22
|
+
*.egg
|
23
|
+
|
24
|
+
# Virtual environments
|
25
|
+
venv/
|
26
|
+
ENV/
|
27
|
+
env/
|
28
|
+
.venv/
|
29
|
+
|
30
|
+
# uv
|
31
|
+
uv.lock
|
32
|
+
|
33
|
+
# IDEs
|
34
|
+
.vscode/
|
35
|
+
.idea/
|
36
|
+
*.swp
|
37
|
+
*.swo
|
38
|
+
*~
|
39
|
+
.DS_Store
|
40
|
+
|
41
|
+
# Environment variables
|
42
|
+
.env
|
43
|
+
.env.local
|
44
|
+
|
45
|
+
# Testing
|
46
|
+
.pytest_cache/
|
47
|
+
.coverage
|
48
|
+
htmlcov/
|
49
|
+
|
50
|
+
# Logs
|
51
|
+
*.log
|
52
|
+
|
53
|
+
# Temporary files
|
54
|
+
*.tmp
|
55
|
+
.cache/
|
@@ -0,0 +1 @@
|
|
1
|
+
3.11
|
@@ -1,28 +1,28 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: pdf-file-renamer
|
3
|
-
Version: 0.4.2
|
3
|
+
Version: 0.6.0
|
4
4
|
Summary: Intelligent PDF renaming using LLMs
|
5
|
-
Requires-Python: >=3.11
|
6
|
-
Description-Content-Type: text/markdown
|
7
5
|
License-File: LICENSE
|
8
|
-
Requires-Dist: pydantic>=2.10.6
|
6
|
+
Requires-Python: >=3.11
|
7
|
+
Requires-Dist: docling-core>=2.0.0
|
8
|
+
Requires-Dist: docling-parse>=2.0.0
|
9
|
+
Requires-Dist: pdf2doi>=1.7
|
9
10
|
Requires-Dist: pydantic-ai>=1.0.17
|
10
11
|
Requires-Dist: pydantic-settings>=2.7.1
|
12
|
+
Requires-Dist: pydantic>=2.10.6
|
11
13
|
Requires-Dist: pymupdf>=1.26.5
|
12
|
-
Requires-Dist: docling-parse>=2.0.0
|
13
|
-
Requires-Dist: docling-core>=2.0.0
|
14
14
|
Requires-Dist: python-dotenv>=1.1.1
|
15
15
|
Requires-Dist: rich>=14.2.0
|
16
|
-
Requires-Dist: typer>=0.19.2
|
17
16
|
Requires-Dist: tenacity>=9.0.0
|
17
|
+
Requires-Dist: typer>=0.19.2
|
18
18
|
Provides-Extra: dev
|
19
|
-
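Requires-Dist: pytest>=8.3.4; extra == "dev"
|
20
|
-
Requires-Dist: pytest-cov>=6.0.0; extra == "dev"
|
21
|
-
Requires-Dist: pytest-asyncio>=0.25.2; extra == "dev"
|
22
|
-
Requires-Dist: pytest-mock>=3.14.0; extra == "dev"
|
23
|
-
Requires-Dist: ruff>=0.9.1; extra == "dev"
|
24
|
-
Requires-Dist: mypy>=1.14.1; extra == "dev"
|
25
|
-
Dynamic: license-file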
|
19
|
+
Requires-Dist: mypy>=1.14.1; extra == 'dev'
|
20
|
+
Requires-Dist: pytest-asyncio>=0.25.2; extra == 'dev'
|
21
|
+
Requires-Dist: pytest-cov>=6.0.0; extra == 'dev'
|
22
|
+
Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
|
23
|
+
Requires-Dist: pytest>=8.3.4; extra == 'dev'
|
24
|
+
Requires-Dist: ruff>=0.9.1; extra == 'dev'
|
25
|
+
Description-Content-Type: text/markdown
|
26
26
|
|
27
27
|
# PDF Renamer
|
28
28
|
|
@@ -44,9 +44,11 @@ Intelligent PDF file renaming using LLMs. This tool analyzes PDF content and met
|
|
44
44
|
|
45
45
|
## Features
|
46
46
|
|
47
|
+
- **DOI-based naming** - Automatically extracts DOI and fetches authoritative metadata for academic papers
|
47
48
|
- **Advanced PDF parsing** using docling-parse for better structure-aware extraction
|
48
49
|
- **OCR fallback** for scanned PDFs with low text content
|
49
50
|
- **Smart LLM prompting** with multi-pass analysis for improved accuracy
|
51
|
+
- **Hybrid approach** - Uses DOI metadata when available, falls back to LLM analysis for other documents
|
50
52
|
- Suggests filenames in format: `Author-Topic-Year.pdf`
|
51
53
|
- Dry-run mode to preview changes before applying
|
52
54
|
- **Enhanced interactive mode** with options to accept, manually edit, retry, or skip each file
|
@@ -209,19 +211,44 @@ You can use interactive mode with `--dry-run` to preview without actually renami
|
|
209
211
|
|
210
212
|
## How It Works
|
211
213
|
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
214
|
+
### Intelligent Hybrid Approach
|
215
|
+
|
216
|
+
The tool uses a multi-strategy approach to generate accurate filenames:
|
217
|
+
|
218
|
+
1. **DOI Detection** (for academic papers)
|
219
|
+
- Searches PDF for DOI identifiers using [pdf2doi](https://github.com/MicheleCotrufo/pdf2doi)
|
220
|
+
- If found, queries authoritative metadata (title, authors, year, journal)
|
221
|
+
- Generates filename with **very high confidence** from validated metadata
|
222
|
+
- **Saves API costs** - no LLM call needed for papers with DOIs
|
223
|
+
|
224
|
+
2. **LLM Analysis** (fallback when no DOI is found)
|
225
|
+
- **Extract**: Uses docling-parse to read first 5 pages with structure-aware parsing, falls back to PyMuPDF if needed
|
226
|
+
- **OCR**: Automatically applies OCR for scanned PDFs with minimal text
|
227
|
+
- **Metadata Enhancement**: Extracts focused hints (years, emails, author sections) to supplement unreliable PDF metadata
|
228
|
+
- **Analyze**: Sends full content excerpt to LLM with enhanced metadata and detailed extraction instructions
|
229
|
+
- **Multi-pass Review**: Low-confidence results trigger a second analysis pass with focused prompts
|
230
|
+
- **Suggest**: LLM returns filename in `Author-Topic-Year` format with confidence level and reasoning
|
231
|
+
|
232
|
+
3. **Interactive Review** (optional): User can accept, edit, retry, or skip each suggestion
|
233
|
+
4. **Rename**: Applies suggestions (if not in dry-run mode)
|
234
|
+
|
235
|
+
### Benefits of DOI Integration
|
236
|
+
|
237
|
+
- **Accuracy**: DOI metadata is canonical and verified
|
238
|
+
- **Speed**: Instant lookup vs. LLM processing time
|
239
|
+
- **Cost**: Free DOI lookups save on API costs for academic papers
|
240
|
+
- **Reliability**: Works even when PDF text extraction is poor
|
220
241
|
|
221
242
|
## Cost Considerations
|
222
243
|
|
223
|
-
**
|
244
|
+
**DOI-based Naming (Academic Papers):**
|
245
|
+
- **Completely free** - No API costs
|
246
|
+
- **No LLM needed** - Direct metadata lookup
|
247
|
+
- Works for most academic papers with embedded DOIs
|
248
|
+
|
249
|
+
**OpenAI (Fallback):**
|
224
250
|
- Uses `gpt-4o-mini` by default (very cost-effective)
|
251
|
+
- Only called when DOI not found
|
225
252
|
- Processes first ~4500 characters per PDF
|
226
253
|
- Typical cost: ~$0.001-0.003 per PDF
|
227
254
|
|
@@ -1,29 +1,3 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: pdf-file-renamer
|
3
|
-
Version: 0.4.2
|
4
|
-
Summary: Intelligent PDF renaming using LLMs
|
5
|
-
Requires-Python: >=3.11
|
6
|
-
Description-Content-Type: text/markdown
|
7
|
-
License-File: LICENSE
|
8
|
-
Requires-Dist: pydantic>=2.10.6
|
9
|
-
Requires-Dist: pydantic-ai>=1.0.17
|
10
|
-
Requires-Dist: pydantic-settings>=2.7.1
|
11
|
-
Requires-Dist: pymupdf>=1.26.5
|
12
|
-
Requires-Dist: docling-parse>=2.0.0
|
13
|
-
Requires-Dist: docling-core>=2.0.0
|
14
|
-
Requires-Dist: python-dotenv>=1.1.1
|
15
|
-
Requires-Dist: rich>=14.2.0
|
16
|
-
Requires-Dist: typer>=0.19.2
|
17
|
-
Requires-Dist: tenacity>=9.0.0
|
18
|
-
Provides-Extra: dev
|
19
|
-
Requires-Dist: pytest>=8.3.4; extra == "dev"
|
20
|
-
Requires-Dist: pytest-cov>=6.0.0; extra == "dev"
|
21
|
-
Requires-Dist: pytest-asyncio>=0.25.2; extra == "dev"
|
22
|
-
Requires-Dist: pytest-mock>=3.14.0; extra == "dev"
|
23
|
-
Requires-Dist: ruff>=0.9.1; extra == "dev"
|
24
|
-
Requires-Dist: mypy>=1.14.1; extra == "dev"
|
25
|
-
Dynamic: license-file
|
26
|
-
|
27
1
|
# PDF Renamer
|
28
2
|
|
29
3
|
[pdf-file-renamer on PyPI](https://pypi.org/project/pdf-file-renamer/)
|
@@ -44,9 +18,11 @@ Intelligent PDF file renaming using LLMs. This tool analyzes PDF content and met
|
|
44
18
|
|
45
19
|
## Features
|
46
20
|
|
21
|
+
- **DOI-based naming** - Automatically extracts DOI and fetches authoritative metadata for academic papers
|
47
22
|
- **Advanced PDF parsing** using docling-parse for better structure-aware extraction
|
48
23
|
- **OCR fallback** for scanned PDFs with low text content
|
49
24
|
- **Smart LLM prompting** with multi-pass analysis for improved accuracy
|
25
|
+
- **Hybrid approach** - Uses DOI metadata when available, falls back to LLM analysis for other documents
|
50
26
|
- Suggests filenames in format: `Author-Topic-Year.pdf`
|
51
27
|
- Dry-run mode to preview changes before applying
|
52
28
|
- **Enhanced interactive mode** with options to accept, manually edit, retry, or skip each file
|
@@ -209,19 +185,44 @@ You can use interactive mode with `--dry-run` to preview without actually renami
|
|
209
185
|
|
210
186
|
## How It Works
|
211
187
|
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
188
|
+
### Intelligent Hybrid Approach
|
189
|
+
|
190
|
+
The tool uses a multi-strategy approach to generate accurate filenames:
|
191
|
+
|
192
|
+
1. **DOI Detection** (for academic papers)
|
193
|
+
- Searches PDF for DOI identifiers using [pdf2doi](https://github.com/MicheleCotrufo/pdf2doi)
|
194
|
+
- If found, queries authoritative metadata (title, authors, year, journal)
|
195
|
+
- Generates filename with **very high confidence** from validated metadata
|
196
|
+
- **Saves API costs** - no LLM call needed for papers with DOIs
|
197
|
+
|
198
|
+
2. **LLM Analysis** (fallback when no DOI is found)
|
199
|
+
- **Extract**: Uses docling-parse to read first 5 pages with structure-aware parsing, falls back to PyMuPDF if needed
|
200
|
+
- **OCR**: Automatically applies OCR for scanned PDFs with minimal text
|
201
|
+
- **Metadata Enhancement**: Extracts focused hints (years, emails, author sections) to supplement unreliable PDF metadata
|
202
|
+
- **Analyze**: Sends full content excerpt to LLM with enhanced metadata and detailed extraction instructions
|
203
|
+
- **Multi-pass Review**: Low-confidence results trigger a second analysis pass with focused prompts
|
204
|
+
- **Suggest**: LLM returns filename in `Author-Topic-Year` format with confidence level and reasoning
|
205
|
+
|
206
|
+
3. **Interactive Review** (optional): User can accept, edit, retry, or skip each suggestion
|
207
|
+
4. **Rename**: Applies suggestions (if not in dry-run mode)
|
208
|
+
|
209
|
+
### Benefits of DOI Integration
|
210
|
+
|
211
|
+
- **Accuracy**: DOI metadata is canonical and verified
|
212
|
+
- **Speed**: Instant lookup vs. LLM processing time
|
213
|
+
- **Cost**: Free DOI lookups save on API costs for academic papers
|
214
|
+
- **Reliability**: Works even when PDF text extraction is poor
|
220
215
|
|
221
216
|
## Cost Considerations
|
222
217
|
|
223
|
-
**
|
218
|
+
**DOI-based Naming (Academic Papers):**
|
219
|
+
- **Completely free** - No API costs
|
220
|
+
- **No LLM needed** - Direct metadata lookup
|
221
|
+
- Works for most academic papers with embedded DOIs
|
222
|
+
|
223
|
+
**OpenAI (Fallback):**
|
224
224
|
- Uses `gpt-4o-mini` by default (very cost-effective)
|
225
|
+
- Only called when DOI not found
|
225
226
|
- Processes first ~4500 characters per PDF
|
226
227
|
- Typical cost: ~$0.001-0.003 per PDF
|
227
228
|
|