meaningful-pdf-names 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of meaningful-pdf-names might be problematic. Click here for more details.

@@ -1,2 +1,2 @@
1
1
  __all__ = ["__version__"]
2
- __version__ = "0.1.0"
2
+ __version__ = "0.1.1"
@@ -75,18 +75,27 @@ def summarize_text(text: str, max_chars: int = 4000) -> str:
75
75
  return text
76
76
 
77
77
 
78
- def extract_text_keywords(pdf_path: Path, max_keywords: int = 5):
78
+ def extract_text_keywords(pdf_path: Path, max_keywords: int = 5, pages_to_read: int = 2):
79
79
  """
80
- Extract up to `max_keywords` from ONLY the first page of the PDF.
80
+ Extract up to `max_keywords` from the first `pages_to_read` pages of the PDF.
81
+ If `pages_to_read` exceeds total pages, reads all available pages.
81
82
  """
82
83
  text = ""
83
84
 
84
85
  if PdfReader is not None:
85
86
  try:
86
87
  reader = PdfReader(str(pdf_path))
87
- if len(reader.pages) > 0:
88
- first_page = reader.pages[0]
89
- text = (first_page.extract_text() or "")
88
+ total_pages = len(reader.pages)
89
+ if total_pages > 0:
90
+ # Determine how many pages to actually read
91
+ pages_to_extract = min(pages_to_read, total_pages)
92
+
93
+ # Extract text from the first N pages
94
+ for i in range(pages_to_extract):
95
+ page = reader.pages[i]
96
+ page_text = (page.extract_text() or "").strip()
97
+ if page_text:
98
+ text += page_text + " "
90
99
  except Exception:
91
100
  text = ""
92
101
 
@@ -205,9 +214,9 @@ def unique_target_path(folder: Path, base_slug: str, suffix_len: int = 3) -> Pat
205
214
  return candidate
206
215
 
207
216
 
208
- def rename_pdfs(folder: Path, dry_run: bool = False, verbose: bool = True):
217
+ def rename_pdfs(folder: Path, dry_run: bool = False, verbose: bool = True, pages_to_read: int = 2):
209
218
  """
210
- Rename all PDFs in the folder using first-page-based keywords.
219
+ Rename all PDFs in the folder using text from the first `pages_to_read` pages.
211
220
  """
212
221
  if not folder.is_dir():
213
222
  raise ValueError(f"{folder} is not a directory")
@@ -221,7 +230,7 @@ def rename_pdfs(folder: Path, dry_run: bool = False, verbose: bool = True):
221
230
  print(f"Found {len(pdf_files)} PDF(s) in {folder}")
222
231
 
223
232
  for pdf in pdf_files:
224
- keywords = extract_text_keywords(pdf)
233
+ keywords = extract_text_keywords(pdf, pages_to_read=pages_to_read)
225
234
  base_slug = build_new_name(keywords)
226
235
  target = unique_target_path(folder, base_slug)
227
236
 
@@ -240,7 +249,7 @@ def rename_pdfs(folder: Path, dry_run: bool = False, verbose: bool = True):
240
249
  def main():
241
250
  parser = argparse.ArgumentParser(
242
251
  description=(
243
- "Rename PDFs using first-page-derived keywords plus a short suffix "
252
+ "Rename PDFs using text-derived keywords plus a short suffix "
244
253
  "for clean, meaningful filenames."
245
254
  )
246
255
  )
@@ -249,6 +258,13 @@ def main():
249
258
  type=str,
250
259
  help="Path to folder containing PDFs."
251
260
  )
261
+ parser.add_argument(
262
+ "-p", "--pages",
263
+ type=int,
264
+ default=2,
265
+ help="Number of pages to read from each PDF (default: 2). "
266
+ "If larger than total pages, reads all available pages."
267
+ )
252
268
  parser.add_argument(
253
269
  "--dry-run",
254
270
  action="store_true",
@@ -263,7 +279,7 @@ def main():
263
279
  args = parser.parse_args()
264
280
  folder = Path(args.folder).expanduser().resolve()
265
281
 
266
- rename_pdfs(folder, dry_run=args.dry_run, verbose=not args.quiet)
282
+ rename_pdfs(folder, dry_run=args.dry_run, verbose=not args.quiet, pages_to_read=args.pages)
267
283
 
268
284
 
269
285
  if __name__ == "__main__":
@@ -0,0 +1,151 @@
1
+ Metadata-Version: 2.4
2
+ Name: meaningful-pdf-names
3
+ Version: 0.1.2
4
+ Summary: Offline-friendly PDF renamer that generates meaningful, keyword-rich filenames from PDF content.
5
+ Author-email: Nishant Kumar <abcnishant007@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/abcnishant007/meaningful-pdf-names
8
+ Project-URL: Source, https://github.com/abcnishant007/meaningful-pdf-names
9
+ Project-URL: Issues, https://github.com/abcnishant007/meaningful-pdf-names/issues
10
+ Keywords: pdf,rename,keywords,offline,cli
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Topic :: Utilities
15
+ Classifier: Topic :: Text Processing :: Linguistic
16
+ Requires-Python: >=3.9
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: pypdf>=5.0.0
20
+ Provides-Extra: summarizer
21
+ Requires-Dist: transformers>=4.45.0; extra == "summarizer"
22
+ Requires-Dist: torch>=2.0.0; extra == "summarizer"
23
+ Dynamic: license-file
24
+
25
+ # meaningful-pdf-names
26
+
27
+ [![Python application](https://img.shields.io/badge/Python-3.9+-blue.svg)](https://www.python.org)
28
+ [![PyPI version](https://img.shields.io/pypi/v/meaningful-pdf-names.svg)](https://pypi.org/project/meaningful-pdf-names/)
29
+ [![codecov](https://codecov.io/gh/abcnishant007/meaningful-pdf-names/branch/main/graph/badge.svg)](https://codecov.io/gh/abcnishant007/meaningful-pdf-names)
30
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
31
+ [![Downloads](https://static.pepy.tech/badge/meaningful-pdf-names)](https://pepy.tech/projects/meaningful-pdf-names)
32
+
33
+ Offline-friendly CLI to turn your messy paper filenames into **compact, keyword-rich names** based on the PDF's first pages (the first 2 by default).
34
+
35
+ Example:
36
+
37
+ `final_v3_really_final.pdf` → `urban-resilience-transport-inequality-policy-a9f.pdf`
38
+
39
+ ## Features
40
+
41
+ - Uses the **first 2 pages** by default (title, authors, abstract, introduction) for better context
42
+ - Configurable page count with `-p` flag (e.g., `-p 4` for 4 pages)
43
+ - Up to **5 meaningful keywords** per file
44
+ - Adds a **3-character [a-z0-9] suffix** to avoid collisions
45
+ - Works fully **offline** with `pypdf`
46
+ - Optional: use a small local Hugging Face summarizer
47
+ (`sshleifer/distilbart-cnn-12-6`) via `transformers` + `torch`
48
+
49
+ ## Prerequisites
50
+
51
+ - **Python 3.9+** installed on your system
52
+ - **pip** (Python package manager) - usually comes with Python
53
+
54
+ ## Quick Install
55
+
56
+ ### From PyPI (Recommended)
57
+
58
+ ```bash
59
+ pip install meaningful-pdf-names
60
+ ```
61
+
62
+ ## Quick Start Guide
63
+
64
+ ### For Mac Users
65
+
66
+ 1. **Install the package** (see above)
67
+ 2. **Navigate to your PDF folder**:
68
+ - Open Finder and go to the folder containing your PDFs
69
+ - Right-click on the folder and select "New Terminal at Folder"
70
+ - This opens Terminal directly in that folder
71
+ 3. **Run the command**:
72
+ ```bash
73
+ mpn .
74
+ ```
75
+
76
+ ### For Linux Users
77
+
78
+ 1. **Install the package** (see above)
79
+ 2. **Navigate to your PDF folder**:
80
+ ```bash
81
+ cd /path/to/your/pdf/folder
82
+ ```
83
+ 3. **Run the command**:
84
+ ```bash
85
+ mpn .
86
+ ```
87
+
88
+ ### For Any Folder Location
89
+
90
+ If you want to rename PDFs in a different folder without navigating there:
91
+
92
+ ```bash
93
+ mpn /full/path/to/your/pdf/folder
94
+ ```
95
+
96
+ ## Usage Examples
97
+
98
+ **Basic usage (current folder):**
99
+ ```bash
100
+ mpn .
101
+ ```
102
+
103
+ **Specific folder:**
104
+ ```bash
105
+ mpn ~/Downloads/research_papers
106
+ mpn /Users/username/Documents/PDFs
107
+ ```
108
+
109
+ **Dry run (preview changes without renaming):**
110
+ ```bash
111
+ mpn . --dry-run
112
+ ```
113
+
114
+ **Quiet mode (minimal output):**
115
+ ```bash
116
+ mpn . --quiet
117
+ ```
118
+
119
+ **Custom page count (read more pages for better context):**
120
+ ```bash
121
+ mpn . -p 4 # Read first 4 pages
122
+ mpn . -p 10 # Read up to 10 pages (or all if PDF has fewer)
123
+ ```
124
+
125
+ ## What It Does
126
+
127
+ - Scans all PDF files in the specified folder
128
+ - Extracts text from the first 2 pages by default (fast, and configurable with `-p`)
129
+ - Identifies meaningful keywords from titles, authors, abstracts
130
+ - Generates clean, readable filenames like:
131
+ - `climate-change-urban-planning-sustainability-a9f.pdf`
132
+ - `machine-learning-neural-networks-research-4x2.pdf`
133
+ - `healthcare-policy-digital-transformation-b7c.pdf`
134
+
135
+ ## Why Not Existing Tools?
136
+
137
+ Other tools often:
138
+
139
+ * Depend on **OpenAI / web APIs** (requires internet, API keys)
140
+ * Require DOIs or external metadata (not always available)
141
+ * Use long `Author - Title - Year` patterns (hard to read)
142
+
143
+ `meaningful-pdf-names` is:
144
+
145
+ * **Local-only** (no API keys, no network required)
146
+ * **Fast** (reads only the first few pages of each PDF)
147
+ * **Slug-based**: short, grep- and git-friendly names
148
+
149
+ ## License
150
+
151
+ MIT
@@ -0,0 +1,9 @@
1
+ meaningful_pdf_names/__init__.py,sha256=mxfnxTtjjT0RlBl5L1-W0AT-IdYIc_KQVhB5cOlylEw,48
2
+ meaningful_pdf_names/__main__.py,sha256=MSmt_5Xg84uHqzTN38JwgseJK8rsJn_11A8WD99VtEo,61
3
+ meaningful_pdf_names/cli.py,sha256=EtW6J53ywyQvKeOla3iwzhRtnEYB1j-PCPWc7FfDFpI,8040
4
+ meaningful_pdf_names-0.1.2.dist-info/licenses/LICENSE,sha256=OphKV48tcMv6ep-7j-8T6nycykPT0g8ZlMJ9zbGvdPs,1066
5
+ meaningful_pdf_names-0.1.2.dist-info/METADATA,sha256=OUJ-XTJnd7t7ERHC2JYryI3-I44iMW1nfvllFldDjZw,4530
6
+ meaningful_pdf_names-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
+ meaningful_pdf_names-0.1.2.dist-info/entry_points.txt,sha256=EtPEkZe_yMNP99BJDtBPI2DL20GO3E5ELmOm2F4aPO4,107
8
+ meaningful_pdf_names-0.1.2.dist-info/top_level.txt,sha256=TD_BuniRNpBdNggGi-6B8WQ4CxkYxzEgTSm2DfY4khw,21
9
+ meaningful_pdf_names-0.1.2.dist-info/RECORD,,
@@ -1,86 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: meaningful-pdf-names
3
- Version: 0.1.0
4
- Summary: Offline-friendly PDF renamer that generates meaningful, keyword-rich filenames from first-page content.
5
- Author-email: Nishant Kumar <abcnishant007@gmail.com>
6
- License: MIT
7
- Project-URL: Homepage, https://github.com/abcnishant007/meaningful-pdf-names
8
- Project-URL: Source, https://github.com/abcnishant007/meaningful-pdf-names
9
- Project-URL: Issues, https://github.com/abcnishant007/meaningful-pdf-names/issues
10
- Keywords: pdf,rename,keywords,offline,cli
11
- Classifier: Programming Language :: Python :: 3
12
- Classifier: License :: OSI Approved :: MIT License
13
- Classifier: Operating System :: OS Independent
14
- Classifier: Topic :: Utilities
15
- Classifier: Topic :: Text Processing :: Linguistic
16
- Requires-Python: >=3.9
17
- Description-Content-Type: text/markdown
18
- License-File: LICENSE
19
- Requires-Dist: pypdf>=5.0.0
20
- Provides-Extra: summarizer
21
- Requires-Dist: transformers>=4.45.0; extra == "summarizer"
22
- Requires-Dist: torch>=2.0.0; extra == "summarizer"
23
- Dynamic: license-file
24
-
25
- # meaningful-pdf-names
26
-
27
- Offline-friendly CLI to turn your messy paper filenames into **compact, keyword-rich names** based on the PDF's first page.
28
-
29
- Example:
30
-
31
- `final_v3_really_final.pdf` → `urban-resilience-transport-inequality-policy-a9f.pdf`
32
-
33
- ## Features
34
-
35
- - Uses only the **first page** (title, authors, abstract region) for speed.
36
- - Up to **5 meaningful keywords** per file.
37
- - Adds a **3-character [a-z0-9] suffix** to avoid collisions.
38
- - Works fully **offline** with `pypdf`.
39
- - Optional: use a small local Hugging Face summarizer
40
- (`sshleifer/distilbart-cnn-12-6`) via `transformers` + `torch`.
41
-
42
- ## Install
43
-
44
- From source / Git:
45
-
46
- ```bash
47
- pip install git+https://github.com/yourname/meaningful-pdf-names.git
48
- ```
49
-
50
- (When published to PyPI:)
51
-
52
- ```bash
53
- pip install meaningful-pdf-names
54
- ```
55
-
56
- With optional local summarizer:
57
-
58
- ```bash
59
- pip install "meaningful-pdf-names[summarizer]"
60
- ```
61
-
62
- ## Usage
63
-
64
- ```bash
65
- meaningful-pdf-names /path/to/pdfs
66
- meaningful-pdf-names /path/to/pdfs --dry-run
67
- mpn /path/to/pdfs
68
- ```
69
-
70
- ## Why not existing tools?
71
-
72
- Other tools often:
73
-
74
- * Depend on **OpenAI / web APIs**.
75
- * Require DOIs or external metadata.
76
- * Use long `Author - Title - Year` patterns.
77
-
78
- `meaningful-pdf-names` is:
79
-
80
- * **Local-only** (no API keys, no network).
81
- * **Fast** (first-page only).
82
- * **Slug-based**: short, grep- and git-friendly names.
83
-
84
- ## License
85
-
86
- MIT
@@ -1,9 +0,0 @@
1
- meaningful_pdf_names/__init__.py,sha256=tXbRXsO0NE_UV1kIHiZTTQQH0fj0U2KoxxNusu_gzrM,48
2
- meaningful_pdf_names/__main__.py,sha256=MSmt_5Xg84uHqzTN38JwgseJK8rsJn_11A8WD99VtEo,61
3
- meaningful_pdf_names/cli.py,sha256=C5eYS9ZTBfkf9urzKrN8G85b9-Kt0JN8qabs0CizWAs,7236
4
- meaningful_pdf_names-0.1.0.dist-info/licenses/LICENSE,sha256=OphKV48tcMv6ep-7j-8T6nycykPT0g8ZlMJ9zbGvdPs,1066
5
- meaningful_pdf_names-0.1.0.dist-info/METADATA,sha256=cIZjWGIsHtS-bkAf6tOL_J7YnwOXFit7_pytGZqt0q4,2365
6
- meaningful_pdf_names-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
- meaningful_pdf_names-0.1.0.dist-info/entry_points.txt,sha256=EtPEkZe_yMNP99BJDtBPI2DL20GO3E5ELmOm2F4aPO4,107
8
- meaningful_pdf_names-0.1.0.dist-info/top_level.txt,sha256=TD_BuniRNpBdNggGi-6B8WQ4CxkYxzEgTSm2DfY4khw,21
9
- meaningful_pdf_names-0.1.0.dist-info/RECORD,,