meaningful-pdf-names 0.1.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of meaningful-pdf-names might be problematic.

@@ -0,0 +1,2 @@ meaningful_pdf_names/__init__.py
+ __all__ = ["__version__"]
+ __version__ = "0.1.0"
@@ -0,0 +1,4 @@ meaningful_pdf_names/__main__.py
+ from .cli import main
+
+ if __name__ == "__main__":
+     main()
@@ -0,0 +1,270 @@ meaningful_pdf_names/cli.py
+ import argparse
+ import random
+ import string
+ import re
+ from pathlib import Path
+
+ try:
+     from pypdf import PdfReader
+ except ImportError:
+     PdfReader = None
+
+ try:
+     from transformers import pipeline
+ except ImportError:
+     pipeline = None
+
+ STOPWORDS = {
+     "the", "and", "for", "with", "that", "this", "from", "into", "onto",
+     "but", "not", "are", "were", "was", "have", "has", "had", "their",
+     "its", "our", "your", "about", "using", "use", "based", "on", "of",
+     "in", "to", "by", "an", "a", "as", "is", "be", "or", "we", "can",
+     "such", "these", "those", "also", "it", "at"
+ }
+
+ _SUMMARIZER = None
+
+
+ def get_summarizer():
+     """
+     Lazy-load a small local HF summarization model, if transformers is installed.
+     """
+     global _SUMMARIZER
+     if _SUMMARIZER is not None:
+         return _SUMMARIZER
+
+     if pipeline is None:
+         return None
+
+     try:
+         _SUMMARIZER = pipeline(
+             "summarization",
+             model="sshleifer/distilbart-cnn-12-6",
+             device=-1,  # CPU; user can change if needed
+         )
+     except Exception:
+         _SUMMARIZER = None
+
+     return _SUMMARIZER
+
+
+ def summarize_text(text: str, max_chars: int = 4000) -> str:
+     """
+     Summarize text using a local HF model if available.
+     Falls back to original text on any failure.
+     """
+     text = text.strip()
+     if not text:
+         return ""
+
+     summarizer = get_summarizer()
+     if summarizer is None:
+         return text
+
+     chunk = text[:max_chars]
+     try:
+         out = summarizer(
+             chunk,
+             max_length=80,
+             min_length=20,
+             do_sample=False,
+         )
+         summary = (out[0].get("summary_text") or "").strip()
+         return summary or text
+     except Exception:
+         return text
+
+
+ def extract_text_keywords(pdf_path: Path, max_keywords: int = 5):
+     """
+     Extract up to `max_keywords` from ONLY the first page of the PDF.
+     """
+     text = ""
+
+     if PdfReader is not None:
+         try:
+             reader = PdfReader(str(pdf_path))
+             if len(reader.pages) > 0:
+                 first_page = reader.pages[0]
+                 text = (first_page.extract_text() or "")
+         except Exception:
+             text = ""
+
+     if not text.strip():
+         # fallback: original filename as text source
+         text = pdf_path.stem
+
+     # Clean the text - remove URLs, DOIs, and other noise
+     cleaned_text = clean_extracted_text(text)
+
+     summarized = summarize_text(cleaned_text) or cleaned_text
+
+     # Basic tokenization on summary
+     tokens = re.findall(r"[A-Za-z][A-Za-z\-]{1,}", summarized.lower())
+     filtered = [
+         t for t in tokens
+         if t not in STOPWORDS and len(t) > 2
+     ]
+
+     if not filtered:
+         # fallback to filename-derived tokens
+         fallback_tokens = re.findall(r"[A-Za-z0-9]+", pdf_path.stem.lower())
+         filtered = [t for t in fallback_tokens if len(t) > 1] or ["paper"]
+
+     # Deduplicate in order
+     seen = set()
+     ordered = []
+     for t in filtered:
+         if t not in seen:
+             seen.add(t)
+             ordered.append(t)
+
+     return ordered[:max_keywords]
+
+
+ def clean_extracted_text(text: str) -> str:
+     """
+     Clean extracted PDF text by removing URLs, DOIs, emails, and other noise.
+     """
+     if not text:
+         return ""
+
+     # Remove URLs
+     text = re.sub(r'https?://[^\s]+', '', text)
+     text = re.sub(r'www\.[^\s]+', '', text)
+
+     # Remove DOIs
+     text = re.sub(r'doi:[^\s]+', '', text, flags=re.IGNORECASE)
+     text = re.sub(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', '', text, flags=re.IGNORECASE)
+
+     # Remove emails
+     text = re.sub(r'\S+@\S+', '', text)
+
+     # Remove common PDF metadata patterns
+     text = re.sub(r'received:\s*\d{1,2}\s+\w+\s+\d{4}', '', text, flags=re.IGNORECASE)
+     text = re.sub(r'accepted:\s*\d{1,2}\s+\w+\s+\d{4}', '', text, flags=re.IGNORECASE)
+     text = re.sub(r'published:\s*\d{1,2}\s+\w+\s+\d{4}', '', text, flags=re.IGNORECASE)
+
+     # Remove page numbers and headers/footers
+     text = re.sub(r'\b\d{1,3}\b', '', text)  # Remove standalone numbers (likely page numbers)
+
+     # Remove excessive whitespace
+     text = re.sub(r'\s+', ' ', text).strip()
+
+     return text
+
+
+ def slugify_keywords(keywords):
+     """
+     Turn keywords into a filesystem-safe slug.
+     """
+     cleaned = []
+     for kw in keywords:
+         kw = re.sub(r"[^a-z0-9\-]+", "-", kw.lower())
+         kw = re.sub(r"-{2,}", "-", kw).strip("-")
+         if kw:
+             cleaned.append(kw)
+     if not cleaned:
+         cleaned = ["paper"]
+     return "-".join(cleaned)
+
+
+ def random_suffix(length: int = 3):
+     """
+     Generate a short random suffix to minimize collisions.
+     """
+     alphabet = string.ascii_lowercase + string.digits
+     return "".join(random.choices(alphabet, k=length))
+
+
+ def build_new_name(keywords, suffix_len=3, max_len=120):
+     """
+     Build the base part of the new filename (without .pdf).
+     """
+     base_slug = slugify_keywords(keywords)
+
+     # Reserve for "-xyz.pdf"
+     max_base_len = max_len - (1 + suffix_len + 4)
+     if max_base_len < 10:
+         max_base_len = 10
+
+     if len(base_slug) > max_base_len:
+         base_slug = base_slug[:max_base_len].rstrip("-")
+
+     return base_slug
+
+
+ def unique_target_path(folder: Path, base_slug: str, suffix_len: int = 3) -> Path:
+     """
+     Generate a unique filename in `folder` using base_slug + random suffix.
+     """
+     while True:
+         suffix = random_suffix(suffix_len)
+         candidate = folder / f"{base_slug}-{suffix}.pdf"
+         if not candidate.exists():
+             return candidate
+
+
+ def rename_pdfs(folder: Path, dry_run: bool = False, verbose: bool = True):
+     """
+     Rename all PDFs in the folder using first-page-based keywords.
+     """
+     if not folder.is_dir():
+         raise ValueError(f"{folder} is not a directory")
+
+     pdf_files = sorted(
+         p for p in folder.iterdir()
+         if p.is_file() and p.suffix.lower() == ".pdf"
+     )
+
+     if verbose:
+         print(f"Found {len(pdf_files)} PDF(s) in {folder}")
+
+     for pdf in pdf_files:
+         keywords = extract_text_keywords(pdf)
+         base_slug = build_new_name(keywords)
+         target = unique_target_path(folder, base_slug)
+
+         if target.name == pdf.name:
+             if verbose:
+                 print(f"Skip (already well-named): {pdf.name}")
+             continue
+
+         if verbose:
+             print(f"{pdf.name} -> {target.name}")
+
+         if not dry_run:
+             pdf.rename(target)
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description=(
+             "Rename PDFs using first-page-derived keywords plus a short suffix "
+             "for clean, meaningful filenames."
+         )
+     )
+     parser.add_argument(
+         "folder",
+         type=str,
+         help="Path to folder containing PDFs."
+     )
+     parser.add_argument(
+         "--dry-run",
+         action="store_true",
+         help="Show planned renames without changing files."
+     )
+     parser.add_argument(
+         "--quiet",
+         action="store_true",
+         help="Minimal output."
+     )
+
+     args = parser.parse_args()
+     folder = Path(args.folder).expanduser().resolve()
+
+     rename_pdfs(folder, dry_run=args.dry_run, verbose=not args.quiet)
+
+
+ if __name__ == "__main__":
+     main()
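
The cli module above is importable as well as script-driven. As a quick illustration (not part of the packaged files), a minimal sketch of previewing renames from Python, assuming the wheel and pypdf are installed; the `~/papers` path is hypothetical:

```python
from pathlib import Path

from meaningful_pdf_names.cli import rename_pdfs

# Dry run: prints "old.pdf -> new-slug-abc.pdf" lines as rename_pdfs does above,
# but leaves every file untouched.
rename_pdfs(Path("~/papers").expanduser(), dry_run=True, verbose=True)
```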
@@ -0,0 +1,86 @@ meaningful_pdf_names-0.1.0.dist-info/METADATA
+ Metadata-Version: 2.4
+ Name: meaningful-pdf-names
+ Version: 0.1.0
+ Summary: Offline-friendly PDF renamer that generates meaningful, keyword-rich filenames from first-page content.
+ Author-email: Nishant Kumar <abcnishant007@gmail.com>
+ License: MIT
+ Project-URL: Homepage, https://github.com/abcnishant007/meaningful-pdf-names
+ Project-URL: Source, https://github.com/abcnishant007/meaningful-pdf-names
+ Project-URL: Issues, https://github.com/abcnishant007/meaningful-pdf-names/issues
+ Keywords: pdf,rename,keywords,offline,cli
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Topic :: Utilities
+ Classifier: Topic :: Text Processing :: Linguistic
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: pypdf>=5.0.0
+ Provides-Extra: summarizer
+ Requires-Dist: transformers>=4.45.0; extra == "summarizer"
+ Requires-Dist: torch>=2.0.0; extra == "summarizer"
+ Dynamic: license-file
+
+ # meaningful-pdf-names
+
+ Offline-friendly CLI to turn your messy paper filenames into **compact, keyword-rich names** based on the PDF's first page.
+
+ Example:
+
+ `final_v3_really_final.pdf` → `urban-resilience-transport-inequality-policy-a9f.pdf`
+
+ ## Features
+
+ - Uses only the **first page** (title, authors, abstract region) for speed.
+ - Up to **5 meaningful keywords** per file.
+ - Adds a **3-character [a-z0-9] suffix** to avoid collisions.
+ - Works fully **offline** with `pypdf`.
+ - Optional: use a small local Hugging Face summarizer
+   (`sshleifer/distilbart-cnn-12-6`) via `transformers` + `torch`.
+
+ ## Install
+
+ From source / Git:
+
+ ```bash
+ pip install git+https://github.com/yourname/meaningful-pdf-names.git
+ ```
+
+ (When published to PyPI:)
+
+ ```bash
+ pip install meaningful-pdf-names
+ ```
+
+ With optional local summarizer:
+
+ ```bash
+ pip install "meaningful-pdf-names[summarizer]"
+ ```
+
+ ## Usage
+
+ ```bash
+ meaningful-pdf-names /path/to/pdfs
+ meaningful-pdf-names /path/to/pdfs --dry-run
+ mpn /path/to/pdfs
+ ```
+
+ ## Why not existing tools?
+
+ Other tools often:
+
+ * Depend on **OpenAI / web APIs**.
+ * Require DOIs or external metadata.
+ * Use long `Author - Title - Year` patterns.
+
+ `meaningful-pdf-names` is:
+
+ * **Local-only** (no API keys, no network).
+ * **Fast** (first-page only).
+ * **Slug-based**: short, grep- and git-friendly names.
+
+ ## License
+
+ MIT
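
To make the README's Features list concrete, here is a small sketch (illustrative only, not shipped in the wheel) of how a name is composed from the helpers in cli.py. The input filename reuses the README's own example; the suffix will differ on each run:

```python
from pathlib import Path

from meaningful_pdf_names.cli import build_new_name, extract_text_keywords, random_suffix

# Up to five keywords from the first page (or the filename stem as a fallback),
# slugified and length-capped, followed by a 3-character [a-z0-9] suffix.
pdf = Path("final_v3_really_final.pdf")
keywords = extract_text_keywords(pdf)
print(f"{build_new_name(keywords)}-{random_suffix()}.pdf")
```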
@@ -0,0 +1,9 @@ meaningful_pdf_names-0.1.0.dist-info/RECORD
+ meaningful_pdf_names/__init__.py,sha256=tXbRXsO0NE_UV1kIHiZTTQQH0fj0U2KoxxNusu_gzrM,48
+ meaningful_pdf_names/__main__.py,sha256=MSmt_5Xg84uHqzTN38JwgseJK8rsJn_11A8WD99VtEo,61
+ meaningful_pdf_names/cli.py,sha256=C5eYS9ZTBfkf9urzKrN8G85b9-Kt0JN8qabs0CizWAs,7236
+ meaningful_pdf_names-0.1.0.dist-info/licenses/LICENSE,sha256=OphKV48tcMv6ep-7j-8T6nycykPT0g8ZlMJ9zbGvdPs,1066
+ meaningful_pdf_names-0.1.0.dist-info/METADATA,sha256=cIZjWGIsHtS-bkAf6tOL_J7YnwOXFit7_pytGZqt0q4,2365
+ meaningful_pdf_names-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ meaningful_pdf_names-0.1.0.dist-info/entry_points.txt,sha256=EtPEkZe_yMNP99BJDtBPI2DL20GO3E5ELmOm2F4aPO4,107
+ meaningful_pdf_names-0.1.0.dist-info/top_level.txt,sha256=TD_BuniRNpBdNggGi-6B8WQ4CxkYxzEgTSm2DfY4khw,21
+ meaningful_pdf_names-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@ meaningful_pdf_names-0.1.0.dist-info/WHEEL
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
@@ -0,0 +1,3 @@ meaningful_pdf_names-0.1.0.dist-info/entry_points.txt
+ [console_scripts]
+ meaningful-pdf-names = meaningful_pdf_names.cli:main
+ mpn = meaningful_pdf_names.cli:main
@@ -0,0 +1,21 @@ meaningful_pdf_names-0.1.0.dist-info/licenses/LICENSE
+ MIT License
+
+ Copyright (c) 2025 Your Name
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1 @@ meaningful_pdf_names-0.1.0.dist-info/top_level.txt
+ meaningful_pdf_names