meaningful-pdf-names 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of meaningful-pdf-names might be problematic. Click here for more details.
- meaningful_pdf_names/__init__.py +1 -1
- meaningful_pdf_names/cli.py +26 -10
- meaningful_pdf_names-0.1.2.dist-info/METADATA +151 -0
- meaningful_pdf_names-0.1.2.dist-info/RECORD +9 -0
- meaningful_pdf_names-0.1.0.dist-info/METADATA +0 -86
- meaningful_pdf_names-0.1.0.dist-info/RECORD +0 -9
- {meaningful_pdf_names-0.1.0.dist-info → meaningful_pdf_names-0.1.2.dist-info}/WHEEL +0 -0
- {meaningful_pdf_names-0.1.0.dist-info → meaningful_pdf_names-0.1.2.dist-info}/entry_points.txt +0 -0
- {meaningful_pdf_names-0.1.0.dist-info → meaningful_pdf_names-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {meaningful_pdf_names-0.1.0.dist-info → meaningful_pdf_names-0.1.2.dist-info}/top_level.txt +0 -0
meaningful_pdf_names/__init__.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
__all__ = ["__version__"]
|
|
2
|
-
__version__ = "0.1.0"
|
|
2
|
+
__version__ = "0.1.1"
|
meaningful_pdf_names/cli.py
CHANGED
|
@@ -75,18 +75,27 @@ def summarize_text(text: str, max_chars: int = 4000) -> str:
|
|
|
75
75
|
return text
|
|
76
76
|
|
|
77
77
|
|
|
78
|
-
def extract_text_keywords(pdf_path: Path, max_keywords: int = 5):
|
|
78
|
+
def extract_text_keywords(pdf_path: Path, max_keywords: int = 5, pages_to_read: int = 2):
|
|
79
79
|
"""
|
|
80
|
-
Extract up to `max_keywords` from
|
|
80
|
+
Extract up to `max_keywords` from the first `pages_to_read` pages of the PDF.
|
|
81
|
+
If `pages_to_read` exceeds total pages, reads all available pages.
|
|
81
82
|
"""
|
|
82
83
|
text = ""
|
|
83
84
|
|
|
84
85
|
if PdfReader is not None:
|
|
85
86
|
try:
|
|
86
87
|
reader = PdfReader(str(pdf_path))
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
88
|
+
total_pages = len(reader.pages)
|
|
89
|
+
if total_pages > 0:
|
|
90
|
+
# Determine how many pages to actually read
|
|
91
|
+
pages_to_extract = min(pages_to_read, total_pages)
|
|
92
|
+
|
|
93
|
+
# Extract text from the first N pages
|
|
94
|
+
for i in range(pages_to_extract):
|
|
95
|
+
page = reader.pages[i]
|
|
96
|
+
page_text = (page.extract_text() or "").strip()
|
|
97
|
+
if page_text:
|
|
98
|
+
text += page_text + " "
|
|
90
99
|
except Exception:
|
|
91
100
|
text = ""
|
|
92
101
|
|
|
@@ -205,9 +214,9 @@ def unique_target_path(folder: Path, base_slug: str, suffix_len: int = 3) -> Pat
|
|
|
205
214
|
return candidate
|
|
206
215
|
|
|
207
216
|
|
|
208
|
-
def rename_pdfs(folder: Path, dry_run: bool = False, verbose: bool = True):
|
|
217
|
+
def rename_pdfs(folder: Path, dry_run: bool = False, verbose: bool = True, pages_to_read: int = 2):
|
|
209
218
|
"""
|
|
210
|
-
Rename all PDFs in the folder using first
|
|
219
|
+
Rename all PDFs in the folder using text from the first `pages_to_read` pages.
|
|
211
220
|
"""
|
|
212
221
|
if not folder.is_dir():
|
|
213
222
|
raise ValueError(f"{folder} is not a directory")
|
|
@@ -221,7 +230,7 @@ def rename_pdfs(folder: Path, dry_run: bool = False, verbose: bool = True):
|
|
|
221
230
|
print(f"Found {len(pdf_files)} PDF(s) in {folder}")
|
|
222
231
|
|
|
223
232
|
for pdf in pdf_files:
|
|
224
|
-
keywords = extract_text_keywords(pdf)
|
|
233
|
+
keywords = extract_text_keywords(pdf, pages_to_read=pages_to_read)
|
|
225
234
|
base_slug = build_new_name(keywords)
|
|
226
235
|
target = unique_target_path(folder, base_slug)
|
|
227
236
|
|
|
@@ -240,7 +249,7 @@ def rename_pdfs(folder: Path, dry_run: bool = False, verbose: bool = True):
|
|
|
240
249
|
def main():
|
|
241
250
|
parser = argparse.ArgumentParser(
|
|
242
251
|
description=(
|
|
243
|
-
"Rename PDFs using
|
|
252
|
+
"Rename PDFs using text-derived keywords plus a short suffix "
|
|
244
253
|
"for clean, meaningful filenames."
|
|
245
254
|
)
|
|
246
255
|
)
|
|
@@ -249,6 +258,13 @@ def main():
|
|
|
249
258
|
type=str,
|
|
250
259
|
help="Path to folder containing PDFs."
|
|
251
260
|
)
|
|
261
|
+
parser.add_argument(
|
|
262
|
+
"-p", "--pages",
|
|
263
|
+
type=int,
|
|
264
|
+
default=2,
|
|
265
|
+
help="Number of pages to read from each PDF (default: 2). "
|
|
266
|
+
"If larger than total pages, reads all available pages."
|
|
267
|
+
)
|
|
252
268
|
parser.add_argument(
|
|
253
269
|
"--dry-run",
|
|
254
270
|
action="store_true",
|
|
@@ -263,7 +279,7 @@ def main():
|
|
|
263
279
|
args = parser.parse_args()
|
|
264
280
|
folder = Path(args.folder).expanduser().resolve()
|
|
265
281
|
|
|
266
|
-
rename_pdfs(folder, dry_run=args.dry_run, verbose=not args.quiet)
|
|
282
|
+
rename_pdfs(folder, dry_run=args.dry_run, verbose=not args.quiet, pages_to_read=args.pages)
|
|
267
283
|
|
|
268
284
|
|
|
269
285
|
if __name__ == "__main__":
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: meaningful-pdf-names
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: Offline-friendly PDF renamer that generates meaningful, keyword-rich filenames from PDF content.
|
|
5
|
+
Author-email: Nishant Kumar <abcnishant007@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/abcnishant007/meaningful-pdf-names
|
|
8
|
+
Project-URL: Source, https://github.com/abcnishant007/meaningful-pdf-names
|
|
9
|
+
Project-URL: Issues, https://github.com/abcnishant007/meaningful-pdf-names/issues
|
|
10
|
+
Keywords: pdf,rename,keywords,offline,cli
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Topic :: Utilities
|
|
15
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
16
|
+
Requires-Python: >=3.9
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: pypdf>=5.0.0
|
|
20
|
+
Provides-Extra: summarizer
|
|
21
|
+
Requires-Dist: transformers>=4.45.0; extra == "summarizer"
|
|
22
|
+
Requires-Dist: torch>=2.0.0; extra == "summarizer"
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
|
|
25
|
+
# meaningful-pdf-names
|
|
26
|
+
|
|
27
|
+
[](https://www.python.org)
|
|
28
|
+
[](https://pypi.org/project/meaningful-pdf-names/)
|
|
29
|
+
[](https://codecov.io/gh/abcnishant007/meaningful-pdf-names)
|
|
30
|
+
[](https://github.com/psf/black)
|
|
31
|
+
[](https://pepy.tech/projects/meaningful-pdf-names)
|
|
32
|
+
|
|
33
|
+
Offline-friendly CLI to turn your messy paper filenames into **compact, keyword-rich names** based on the PDF's first pages (the first 2 by default).
|
|
34
|
+
|
|
35
|
+
Example:
|
|
36
|
+
|
|
37
|
+
`final_v3_really_final.pdf` → `urban-resilience-transport-inequality-policy-a9f.pdf`
|
|
38
|
+
|
|
39
|
+
## Features
|
|
40
|
+
|
|
41
|
+
- Uses the **first 2 pages** by default (title, authors, abstract, introduction) for better context
|
|
42
|
+
- Configurable page count with `-p` flag (e.g., `-p 4` for 4 pages)
|
|
43
|
+
- Up to **5 meaningful keywords** per file
|
|
44
|
+
- Adds a **3-character [a-z0-9] suffix** to avoid collisions
|
|
45
|
+
- Works fully **offline** with `pypdf`
|
|
46
|
+
- Optional: use a small local Hugging Face summarizer
|
|
47
|
+
(`sshleifer/distilbart-cnn-12-6`) via `transformers` + `torch`
|
|
48
|
+
|
|
49
|
+
## Prerequisites
|
|
50
|
+
|
|
51
|
+
- **Python 3.9+** installed on your system
|
|
52
|
+
- **pip** (Python package manager) - usually comes with Python
|
|
53
|
+
|
|
54
|
+
## Quick Install
|
|
55
|
+
|
|
56
|
+
### From PyPI (Recommended)
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pip install meaningful-pdf-names
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Quick Start Guide
|
|
63
|
+
|
|
64
|
+
### For Mac Users
|
|
65
|
+
|
|
66
|
+
1. **Install the package** (see above)
|
|
67
|
+
2. **Navigate to your PDF folder**:
|
|
68
|
+
- Open Finder and go to the folder containing your PDFs
|
|
69
|
+
- Right-click on the folder and select "New Terminal at Folder"
|
|
70
|
+
- This opens Terminal directly in that folder
|
|
71
|
+
3. **Run the command**:
|
|
72
|
+
```bash
|
|
73
|
+
mpn .
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### For Linux Users
|
|
77
|
+
|
|
78
|
+
1. **Install the package** (see above)
|
|
79
|
+
2. **Navigate to your PDF folder**:
|
|
80
|
+
```bash
|
|
81
|
+
cd /path/to/your/pdf/folder
|
|
82
|
+
```
|
|
83
|
+
3. **Run the command**:
|
|
84
|
+
```bash
|
|
85
|
+
mpn .
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### For Any Folder Location
|
|
89
|
+
|
|
90
|
+
If you want to rename PDFs in a different folder without navigating there:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
mpn /full/path/to/your/pdf/folder
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Usage Examples
|
|
97
|
+
|
|
98
|
+
**Basic usage (current folder):**
|
|
99
|
+
```bash
|
|
100
|
+
mpn .
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
**Specific folder:**
|
|
104
|
+
```bash
|
|
105
|
+
mpn ~/Downloads/research_papers
|
|
106
|
+
mpn /Users/username/Documents/PDFs
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
**Dry run (preview changes without renaming):**
|
|
110
|
+
```bash
|
|
111
|
+
mpn . --dry-run
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
**Quiet mode (minimal output):**
|
|
115
|
+
```bash
|
|
116
|
+
mpn . --quiet
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
**Custom page count (read more pages for better context):**
|
|
120
|
+
```bash
|
|
121
|
+
mpn . -p 4 # Read first 4 pages
|
|
122
|
+
mpn . -p 10 # Read up to 10 pages (or all if PDF has fewer)
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## What It Does
|
|
126
|
+
|
|
127
|
+
- Scans all PDF files in the specified folder
|
|
128
|
+
- Extracts text from the first 2 pages by default (fast!); adjustable with `-p`
|
|
129
|
+
- Identifies meaningful keywords from titles, authors, abstracts
|
|
130
|
+
- Generates clean, readable filenames like:
|
|
131
|
+
- `climate-change-urban-planning-sustainability-a9f.pdf`
|
|
132
|
+
- `machine-learning-neural-networks-research-4x2.pdf`
|
|
133
|
+
- `healthcare-policy-digital-transformation-b7c.pdf`
|
|
134
|
+
|
|
135
|
+
## Why Not Existing Tools?
|
|
136
|
+
|
|
137
|
+
Other tools often:
|
|
138
|
+
|
|
139
|
+
* Depend on **OpenAI / web APIs** (requires internet, API keys)
|
|
140
|
+
* Require DOIs or external metadata (not always available)
|
|
141
|
+
* Use long `Author - Title - Year` patterns (hard to read)
|
|
142
|
+
|
|
143
|
+
`meaningful-pdf-names` is:
|
|
144
|
+
|
|
145
|
+
* **Local-only** (no API keys, no network required)
|
|
146
|
+
* **Fast** (reads only the first few pages — 2 by default)
|
|
147
|
+
* **Slug-based**: short, grep- and git-friendly names
|
|
148
|
+
|
|
149
|
+
## License
|
|
150
|
+
|
|
151
|
+
MIT
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
meaningful_pdf_names/__init__.py,sha256=mxfnxTtjjT0RlBl5L1-W0AT-IdYIc_KQVhB5cOlylEw,48
|
|
2
|
+
meaningful_pdf_names/__main__.py,sha256=MSmt_5Xg84uHqzTN38JwgseJK8rsJn_11A8WD99VtEo,61
|
|
3
|
+
meaningful_pdf_names/cli.py,sha256=EtW6J53ywyQvKeOla3iwzhRtnEYB1j-PCPWc7FfDFpI,8040
|
|
4
|
+
meaningful_pdf_names-0.1.2.dist-info/licenses/LICENSE,sha256=OphKV48tcMv6ep-7j-8T6nycykPT0g8ZlMJ9zbGvdPs,1066
|
|
5
|
+
meaningful_pdf_names-0.1.2.dist-info/METADATA,sha256=OUJ-XTJnd7t7ERHC2JYryI3-I44iMW1nfvllFldDjZw,4530
|
|
6
|
+
meaningful_pdf_names-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
7
|
+
meaningful_pdf_names-0.1.2.dist-info/entry_points.txt,sha256=EtPEkZe_yMNP99BJDtBPI2DL20GO3E5ELmOm2F4aPO4,107
|
|
8
|
+
meaningful_pdf_names-0.1.2.dist-info/top_level.txt,sha256=TD_BuniRNpBdNggGi-6B8WQ4CxkYxzEgTSm2DfY4khw,21
|
|
9
|
+
meaningful_pdf_names-0.1.2.dist-info/RECORD,,
|
|
@@ -1,86 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: meaningful-pdf-names
|
|
3
|
-
Version: 0.1.0
|
|
4
|
-
Summary: Offline-friendly PDF renamer that generates meaningful, keyword-rich filenames from first-page content.
|
|
5
|
-
Author-email: Nishant Kumar <abcnishant007@gmail.com>
|
|
6
|
-
License: MIT
|
|
7
|
-
Project-URL: Homepage, https://github.com/abcnishant007/meaningful-pdf-names
|
|
8
|
-
Project-URL: Source, https://github.com/abcnishant007/meaningful-pdf-names
|
|
9
|
-
Project-URL: Issues, https://github.com/abcnishant007/meaningful-pdf-names/issues
|
|
10
|
-
Keywords: pdf,rename,keywords,offline,cli
|
|
11
|
-
Classifier: Programming Language :: Python :: 3
|
|
12
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
-
Classifier: Operating System :: OS Independent
|
|
14
|
-
Classifier: Topic :: Utilities
|
|
15
|
-
Classifier: Topic :: Text Processing :: Linguistic
|
|
16
|
-
Requires-Python: >=3.9
|
|
17
|
-
Description-Content-Type: text/markdown
|
|
18
|
-
License-File: LICENSE
|
|
19
|
-
Requires-Dist: pypdf>=5.0.0
|
|
20
|
-
Provides-Extra: summarizer
|
|
21
|
-
Requires-Dist: transformers>=4.45.0; extra == "summarizer"
|
|
22
|
-
Requires-Dist: torch>=2.0.0; extra == "summarizer"
|
|
23
|
-
Dynamic: license-file
|
|
24
|
-
|
|
25
|
-
# meaningful-pdf-names
|
|
26
|
-
|
|
27
|
-
Offline-friendly CLI to turn your messy paper filenames into **compact, keyword-rich names** based on the PDF's first page.
|
|
28
|
-
|
|
29
|
-
Example:
|
|
30
|
-
|
|
31
|
-
`final_v3_really_final.pdf` → `urban-resilience-transport-inequality-policy-a9f.pdf`
|
|
32
|
-
|
|
33
|
-
## Features
|
|
34
|
-
|
|
35
|
-
- Uses only the **first page** (title, authors, abstract region) for speed.
|
|
36
|
-
- Up to **5 meaningful keywords** per file.
|
|
37
|
-
- Adds a **3-character [a-z0-9] suffix** to avoid collisions.
|
|
38
|
-
- Works fully **offline** with `pypdf`.
|
|
39
|
-
- Optional: use a small local Hugging Face summarizer
|
|
40
|
-
(`sshleifer/distilbart-cnn-12-6`) via `transformers` + `torch`.
|
|
41
|
-
|
|
42
|
-
## Install
|
|
43
|
-
|
|
44
|
-
From source / Git:
|
|
45
|
-
|
|
46
|
-
```bash
|
|
47
|
-
pip install git+https://github.com/yourname/meaningful-pdf-names.git
|
|
48
|
-
```
|
|
49
|
-
|
|
50
|
-
(When published to PyPI:)
|
|
51
|
-
|
|
52
|
-
```bash
|
|
53
|
-
pip install meaningful-pdf-names
|
|
54
|
-
```
|
|
55
|
-
|
|
56
|
-
With optional local summarizer:
|
|
57
|
-
|
|
58
|
-
```bash
|
|
59
|
-
pip install "meaningful-pdf-names[summarizer]"
|
|
60
|
-
```
|
|
61
|
-
|
|
62
|
-
## Usage
|
|
63
|
-
|
|
64
|
-
```bash
|
|
65
|
-
meaningful-pdf-names /path/to/pdfs
|
|
66
|
-
meaningful-pdf-names /path/to/pdfs --dry-run
|
|
67
|
-
mpn /path/to/pdfs
|
|
68
|
-
```
|
|
69
|
-
|
|
70
|
-
## Why not existing tools?
|
|
71
|
-
|
|
72
|
-
Other tools often:
|
|
73
|
-
|
|
74
|
-
* Depend on **OpenAI / web APIs**.
|
|
75
|
-
* Require DOIs or external metadata.
|
|
76
|
-
* Use long `Author - Title - Year` patterns.
|
|
77
|
-
|
|
78
|
-
`meaningful-pdf-names` is:
|
|
79
|
-
|
|
80
|
-
* **Local-only** (no API keys, no network).
|
|
81
|
-
* **Fast** (first-page only).
|
|
82
|
-
* **Slug-based**: short, grep- and git-friendly names.
|
|
83
|
-
|
|
84
|
-
## License
|
|
85
|
-
|
|
86
|
-
MIT
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
meaningful_pdf_names/__init__.py,sha256=tXbRXsO0NE_UV1kIHiZTTQQH0fj0U2KoxxNusu_gzrM,48
|
|
2
|
-
meaningful_pdf_names/__main__.py,sha256=MSmt_5Xg84uHqzTN38JwgseJK8rsJn_11A8WD99VtEo,61
|
|
3
|
-
meaningful_pdf_names/cli.py,sha256=C5eYS9ZTBfkf9urzKrN8G85b9-Kt0JN8qabs0CizWAs,7236
|
|
4
|
-
meaningful_pdf_names-0.1.0.dist-info/licenses/LICENSE,sha256=OphKV48tcMv6ep-7j-8T6nycykPT0g8ZlMJ9zbGvdPs,1066
|
|
5
|
-
meaningful_pdf_names-0.1.0.dist-info/METADATA,sha256=cIZjWGIsHtS-bkAf6tOL_J7YnwOXFit7_pytGZqt0q4,2365
|
|
6
|
-
meaningful_pdf_names-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
7
|
-
meaningful_pdf_names-0.1.0.dist-info/entry_points.txt,sha256=EtPEkZe_yMNP99BJDtBPI2DL20GO3E5ELmOm2F4aPO4,107
|
|
8
|
-
meaningful_pdf_names-0.1.0.dist-info/top_level.txt,sha256=TD_BuniRNpBdNggGi-6B8WQ4CxkYxzEgTSm2DfY4khw,21
|
|
9
|
-
meaningful_pdf_names-0.1.0.dist-info/RECORD,,
|
|
File without changes
|
{meaningful_pdf_names-0.1.0.dist-info → meaningful_pdf_names-0.1.2.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{meaningful_pdf_names-0.1.0.dist-info → meaningful_pdf_names-0.1.2.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|