refren 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- refren-0.1.0.dist-info/METADATA +26 -0
- refren-0.1.0.dist-info/RECORD +6 -0
- refren-0.1.0.dist-info/WHEEL +4 -0
- refren-0.1.0.dist-info/entry_points.txt +2 -0
- refren-0.1.0.dist-info/licenses/LICENSE +21 -0
- refren.py +247 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: refren
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Scientific manuscript PDF file renamer
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Python: >=3.12
|
|
8
|
+
Requires-Dist: anthropic>=0.86.0
|
|
9
|
+
Requires-Dist: pdfplumber>=0.11
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
# refren: scientific manuscript PDF file renamer
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
$ ./refren.py 1758-2946-6-10.pdf
|
|
16
|
+
(direct parsing incomplete — calling Claude API...)
|
|
17
|
+
First author last name : Krstajic
|
|
18
|
+
Second author last name: Buturovic
|
|
19
|
+
Journal : Journal of Cheminformatics -> J Cheminform
|
|
20
|
+
Year : 2014
|
|
21
|
+
|
|
22
|
+
1758-2946-6-10.pdf -> Krstajic_Buturovic_JCheminform_2014.pdf
|
|
23
|
+
Rename? [y/N] y
|
|
24
|
+
Renamed to: Krstajic_Buturovic_JCheminform_2014.pdf
|
|
25
|
+
```
|
|
26
|
+
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
refren.py,sha256=jqJe8l2Qdjz0fcKWQwyxeHpUyda_cvZggJdgot6ta8A,8461
|
|
2
|
+
refren-0.1.0.dist-info/METADATA,sha256=qU5yYrlaxr1Es_9To5YFu7Nd5I-v2cHAh5YQZBfg8OY,726
|
|
3
|
+
refren-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
4
|
+
refren-0.1.0.dist-info/entry_points.txt,sha256=MkoRFSweBva1PR16nAHJV12blb97lxLbF1KZ4UTqIfg,39
|
|
5
|
+
refren-0.1.0.dist-info/licenses/LICENSE,sha256=KFDwGS4OaI3VKJPFDHSmj8CakM5sD6WPDJN0_FN4MEQ,1068
|
|
6
|
+
refren-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 ljbuturovic
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
refren.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Rename a scientific PDF as: FirstAuthorLastName_SecondAuthorLastName_JournalAbbrev_Year.pdf
|
|
4
|
+
Usage: ./refren.py <pdf_file>
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
import sys
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
import anthropic
|
|
12
|
+
import pdfplumber
|
|
13
|
+
from pydantic import BaseModel
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# --- Journal abbreviation lookup ---
|
|
17
|
+
# Maps a lowercase full journal title to the compact abbreviation used in the
# generated filename.
#
# ORDER MATTERS: the table is scanned in insertion order with substring
# matching, so any title that contains another title ("molecular cell" contains
# "cell", "clinical cancer research" contains "cancer research") must be listed
# BEFORE the shorter one, otherwise the generic entry wins.
JOURNAL_ABBREVIATIONS = {
    "diagnostic and prognostic research": "DiagProgRes",
    "nature medicine": "NatMed",
    "nature communications": "NatCommun",
    "nature": "Nature",
    "science": "Science",
    # Cell-family titles first, the bare "cell" last.
    "molecular cell": "MolCell",
    "cell reports": "CellRep",
    "cell systems": "CellSyst",
    "cancer cell": "CancerCell",
    "cell": "Cell",
    "the lancet": "Lancet",
    "lancet": "Lancet",
    "new england journal of medicine": "NEJM",
    "n engl j med": "NEJM",
    "jama": "JAMA",
    "bmj": "BMJ",
    "annals of internal medicine": "AnnInternMed",
    "plos medicine": "PLoSMed",
    "plos one": "PLoSOne",
    "plos biology": "PLoSBiol",
    "plos computational biology": "PLoSComputBiol",
    "bioinformatics": "Bioinformatics",
    "nucleic acids research": "NucleicAcidsRes",
    "genome biology": "GenomeBiol",
    "genome research": "GenomeRes",
    "elife": "eLife",
    "journal of clinical oncology": "JClinOncol",
    "clinical cancer research": "ClinCancerRes",
    "cancer research": "CancerRes",
    "journal of the american medical informatics association": "JAMIA",
    "npj digital medicine": "NPJDigitMed",
    "artificial intelligence in medicine": "ArtifIntellMed",
    "medical image analysis": "MedImageAnal",
    "european radiology": "EurRadiol",
    "radiology": "Radiology",
    "circulation": "Circulation",
    "european heart journal": "EurHeartJ",
    "journal of the american college of cardiology": "JACC",
    "diabetes care": "DiabetesCare",
    "statistics in medicine": "StatMed",
    "biometrics": "Biometrics",
    "american journal of epidemiology": "AmJEpidemiol",
    "international journal of epidemiology": "IntJEpidemiol",
    "epidemiology": "Epidemiology",
    "brain": "Brain",
    "journal of neurology": "JNeurol",
    "neurology": "Neurology",
    "gut": "Gut",
}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def abbreviate_journal(full_name: str) -> str:
    """Return a known abbreviation, or derive a CamelCase one from the title.

    Lookup against ``JOURNAL_ABBREVIATIONS`` is case-insensitive and goes from
    most to least specific, independent of the table's ordering:

    1. an exact match of the whole (stripped) name;
    2. the longest known title occurring in the name as whole words, so
       "Molecular Cell" resolves to "MolCell" rather than "Cell", and a title
       such as "Cellular Signalling" does not match "cell" at all.

    If no known title matches, punctuation is removed, stop words are dropped
    and the remaining words are capitalized and concatenated. An input with no
    usable words yields "UnknownJournal".
    """
    lower = full_name.lower().strip()

    exact = JOURNAL_ABBREVIATIONS.get(lower)
    if exact is not None:
        return exact

    # Longest key first so a specific title beats a generic one it contains.
    # sorted() is stable, so keys of equal length keep their table order.
    for key in sorted(JOURNAL_ABBREVIATIONS, key=len, reverse=True):
        # Lookarounds instead of \b: they also work for keys that would start
        # or end with a non-word character.
        if re.search(rf"(?<!\w){re.escape(key)}(?!\w)", lower):
            return JOURNAL_ABBREVIATIONS[key]

    # Punctuation (including "&") is stripped before splitting, so the stop
    # list only needs real words.
    stop = {"a", "an", "the", "of", "in", "on", "and", "for", "to", "with"}
    words = re.sub(r"[^\w\s]", "", full_name).split()
    return "".join(w.capitalize() for w in words if w.lower() not in stop) or "UnknownJournal"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def extract_last_name(name: str) -> str:
    """Return the final whitespace-separated word of an author name.

    Surrounding whitespace is removed first, then any trailing run of
    affiliation/footnote markers (commas, asterisks, digits, daggers, spaces).
    If nothing is left after that, the (empty) cleaned string is returned.
    """
    cleaned = name.strip().rstrip(",*0123456789† ")
    tokens = cleaned.split()
    if not tokens:
        return cleaned
    return tokens[-1]
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def parse_authors_and_year(text: str):
|
|
88
|
+
"""
|
|
89
|
+
Extract (first_author_last, second_author_last, year) from first-page text.
|
|
90
|
+
Returns (first, second, year) — any may be None if not found.
|
|
91
|
+
"""
|
|
92
|
+
year = None
|
|
93
|
+
year_match = re.search(r"\b(20\d{2}|19\d{2})\b", text)
|
|
94
|
+
if year_match:
|
|
95
|
+
year = year_match.group(1)
|
|
96
|
+
|
|
97
|
+
first, second = None, None
|
|
98
|
+
|
|
99
|
+
# Pattern 1: "Surname et al." header
|
|
100
|
+
et_al_match = re.search(r"([A-Z][a-z]+(?:\s+[a-z]+)*)\s+et al\.", text)
|
|
101
|
+
if et_al_match:
|
|
102
|
+
first = et_al_match.group(1).strip()
|
|
103
|
+
|
|
104
|
+
# Pattern 2: full author line — comma-separated "Firstname Lastname, ..."
|
|
105
|
+
author_line_pattern = re.compile(
|
|
106
|
+
r"([A-Z][a-z]+(?:\s+[a-z]+)*\s+[A-Z][a-zA-Z\-]+[0-9†*,]*"
|
|
107
|
+
r"(?:\s*,\s*[A-Z][a-z]+(?:\s+[a-z]+)*\s+[A-Z][a-zA-Z\-]+[0-9†*,]*){1,})"
|
|
108
|
+
)
|
|
109
|
+
for match in author_line_pattern.finditer(text):
|
|
110
|
+
candidate = match.group(0)
|
|
111
|
+
raw_authors = [a.strip() for a in re.split(r",\s*", candidate) if a.strip()]
|
|
112
|
+
if len(raw_authors) >= 2:
|
|
113
|
+
last_names = [extract_last_name(a) for a in raw_authors]
|
|
114
|
+
if not first:
|
|
115
|
+
first = last_names[0]
|
|
116
|
+
second = last_names[1]
|
|
117
|
+
break
|
|
118
|
+
|
|
119
|
+
return first, second, year
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def extract_journal(text: str) -> str | None:
    """Try to find the journal name in the first-page text.

    First looks for any title from ``JOURNAL_ABBREVIATIONS`` as a
    case-insensitive, whole-word match, trying the longest titles first so a
    specific title ("Molecular Cell") wins over a generic one it contains
    ("Cell"), and so short keys do not fire inside ordinary words ("cell" in
    "cellular", "gut" in "gutter"). The matched text is returned exactly as it
    appears in ``text``.

    If no known title is present, two layout patterns are tried: a capitalized
    name following "published in" / "journal", and a capitalized line
    following a doi.org URL. Returns None when nothing matches.
    """
    # Search the original text with IGNORECASE rather than indexing into
    # text.lower(): lower() can change the string length for some characters,
    # which would misalign the slice.
    for key in sorted(JOURNAL_ABBREVIATIONS, key=len, reverse=True):
        hit = re.search(rf"(?<!\w){re.escape(key)}(?!\w)", text, re.IGNORECASE)
        if hit:
            return hit.group(0)

    # Case-insensitivity is scoped to the leading keyword only. A global
    # IGNORECASE flag would let [A-Z] match lowercase too, turning
    # "Journal of X" into the bogus journal name "of X".
    patterns = [
        r"(?i:published in|journal)[:\s]+([A-Z][^\n]+)",
        r"(?i:https?://doi\.org/)[^\s]+\s+([A-Z][A-Za-z &]+)\n",
    ]
    for pat in patterns:
        m = re.search(pat, text)
        if m:
            return m.group(1).strip()
    return None
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def sanitize(s: str) -> str:
    """Drop every character that is not a word character (letter, digit, underscore)."""
    unsafe = re.compile(r"[^\w]")
    return unsafe.sub("", s)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# --- Claude API fallback ---
|
|
146
|
+
|
|
147
|
+
class PaperMetadata(BaseModel):
    # Structured result requested from the Claude API: extract_via_llm() passes
    # this class as `output_format`, and rename_pdf() reads every field.
    # NOTE(review): documented with comments rather than a class docstring,
    # because a docstring may end up in the schema sent to the API — confirm
    # before converting.

    # Last names only; the system prompt asks for no initials or titles.
    first_author_last_name: str
    second_author_last_name: str

    # Full journal name; shown to the user next to the abbreviation.
    journal_full_name: str

    # Abbreviated journal name; used (after sanitizing) in the new filename.
    journal_abbreviation: str

    # Publication year, kept as a string.
    year: str
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def extract_via_llm(text: str, metadata: dict | None = None) -> PaperMetadata:
    """Use Claude to extract paper metadata from first-page text.

    Args:
        text: Text extracted from the PDF; only the first 4000 characters are
            sent to the model.
        metadata: Optional PDF document metadata. When non-empty, its repr is
            included in the prompt ahead of the page text.

    Returns:
        The ``PaperMetadata`` instance parsed from the model's reply.

    Raises:
        RuntimeError: If the response carries no parsed output. Without this
            check a None would be returned and the caller would fail later
            with an unrelated AttributeError.

    Prints a one-line notice to stdout before calling the API. The client is
    constructed with no arguments, so credentials come from the SDK's defaults
    (presumably the ANTHROPIC_API_KEY environment variable — confirm).
    """
    print(" (direct parsing incomplete — calling Claude API...)")

    prompt = "Extract the metadata from this scientific paper.\n\n"
    if metadata:
        prompt += f"PDF metadata: {metadata}\n\n"
    prompt += f"First page text:\n{text[:4000]}"

    client = anthropic.Anthropic()
    response = client.messages.parse(
        model="claude-opus-4-6",
        max_tokens=512,
        system=(
            "You are a scientific literature assistant. "
            "Extract bibliographic metadata from the first page of a scientific paper. "
            "For journal_abbreviation, use the standard ISO/NLM abbreviation (e.g. 'Circulation', "
            "'N Engl J Med', 'JAMA', 'Nat Med'). "
            "Return only last names for authors (no initials, no titles, no credentials)."
        ),
        messages=[{"role": "user", "content": prompt}],
        output_format=PaperMetadata,
    )

    parsed = response.parsed_output
    if parsed is None:
        raise RuntimeError("Claude API response did not contain parseable paper metadata")
    return parsed
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def rename_pdf(pdf_path: str):
    """Interactively rename a paper PDF to First_Second_Journal_Year.pdf.

    Steps:

    1. Validate that ``pdf_path`` exists and has a ``.pdf`` suffix.
    2. Read the PDF metadata and the first page's text (plus the second
       page's text when the first yields fewer than 200 characters).
    3. Try regex-based extraction of authors, year and journal; fall back to
       the PDF metadata for the year.
    4. If any field is still missing, ask the Claude API and use its answer
       for all fields.
    5. Print the extracted fields and the proposed name, and rename only when
       the user answers "y".

    Empty components are left out of the filename instead of producing
    doubled underscores. The rename is refused if a different file already
    exists at the target path, because ``Path.rename`` would silently replace
    it on POSIX systems.

    Every failure is reported on stdout and ends the process with
    ``sys.exit(1)``. Returns None.
    """
    path = Path(pdf_path)
    if not path.exists():
        print(f"Error: file not found: {pdf_path}")
        sys.exit(1)
    if path.suffix.lower() != ".pdf":
        print(f"Error: not a PDF file: {pdf_path}")
        sys.exit(1)

    with pdfplumber.open(path) as pdf:
        metadata = pdf.metadata or {}
        # A PDF without pages would otherwise fail with a bare IndexError.
        if not pdf.pages:
            print(f"Error: PDF has no pages: {pdf_path}")
            sys.exit(1)
        first_page_text = pdf.pages[0].extract_text() or ""
        # A nearly empty first page: also read the second page.
        if len(first_page_text) < 200 and len(pdf.pages) > 1:
            first_page_text += "\n" + (pdf.pages[1].extract_text() or "")

    first, second, year = parse_authors_and_year(first_page_text)
    journal_full = extract_journal(first_page_text)
    journal_abbr = abbreviate_journal(journal_full) if journal_full else None

    # Try to extract year from PDF metadata if not found in text
    if not year:
        meta_str = " ".join(str(v) for v in metadata.values())
        m = re.search(r"\b(20\d{2}|19\d{2})\b", meta_str)
        if m:
            year = m.group(1)

    missing = [label for label, val in [
        ("first author", first), ("second author", second),
        ("year", year), ("journal", journal_full),
    ] if not val]

    if missing:
        # When LLM is needed, trust it for all fields — regex results may also be wrong
        meta = extract_via_llm(first_page_text, metadata)
        first = meta.first_author_last_name
        second = meta.second_author_last_name
        year = meta.year
        journal_full = meta.journal_full_name
        journal_abbr = meta.journal_abbreviation

    # NOTE(review): the label padding in these messages looks collapsed in the
    # copy of the file under review — confirm against the published wheel.
    print(f" First author last name : {first}")
    print(f" Second author last name: {second}")
    print(f" Journal : {journal_full} -> {journal_abbr}")
    print(f" Year : {year}")

    # Skip components that sanitize to nothing (e.g. a single-author paper
    # for which the model returned an empty second author).
    components = [sanitize(part) for part in (first, second, journal_abbr, year)]
    components = [part for part in components if part]
    if not components:
        print("Error: could not extract any metadata to build a filename")
        sys.exit(1)
    new_name = "_".join(components) + ".pdf"
    new_path = path.parent / new_name

    # Refuse to clobber another file. samefile() still allows the no-op case
    # and case-only renames on case-insensitive filesystems.
    if new_path.exists() and not new_path.samefile(path):
        print(f"Error: target already exists: {new_path}")
        sys.exit(1)

    print(f"\n {path.name} -> {new_name}")
    confirm = input("Rename? [y/N] ").strip().lower()
    if confirm == "y":
        path.rename(new_path)
        print(f"Renamed to: {new_path}")
    else:
        print("Aborted.")
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def main():
    """Command-line entry point: expect exactly one argument, the PDF path.

    With any other argument count, print a usage line and exit with status 1.
    The program name in the usage line is taken from ``sys.argv[0]`` so it is
    right both for ``./refren.py`` and for an installed console script; the
    previous hard-coded ``./renamer.py`` named a file that does not exist.
    """
    if len(sys.argv) != 2:
        # Fall back to "refren" when argv[0] is empty or argv itself is empty.
        prog = (Path(sys.argv[0]).name if sys.argv else "") or "refren"
        print(f"Usage: {prog} <pdf_file>")
        sys.exit(1)
    rename_pdf(sys.argv[1])


if __name__ == "__main__":
    main()
|