refren 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,26 @@
1
+ Metadata-Version: 2.4
2
+ Name: refren
3
+ Version: 0.1.0
4
+ Summary: Scientific manuscript PDF file renamer
5
+ License: MIT
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.12
8
+ Requires-Dist: anthropic>=0.86.0
9
+ Requires-Dist: pdfplumber>=0.11
10
+ Description-Content-Type: text/markdown
11
+
12
+ # refren: scientific manuscript PDF file renamer
13
+
14
+ ```bash
15
+ $ ./refren.py 1758-2946-6-10.pdf
16
+ (direct parsing incomplete — calling Claude API...)
17
+ First author last name : Krstajic
18
+ Second author last name: Buturovic
19
+ Journal : Journal of Cheminformatics -> J Cheminform
20
+ Year : 2014
21
+
22
+ 1758-2946-6-10.pdf -> Krstajic_Buturovic_JCheminform_2014.pdf
23
+ Rename? [y/N] y
24
+ Renamed to: Krstajic_Buturovic_JCheminform_2014.pdf
25
+ ```
26
+
@@ -0,0 +1,6 @@
1
+ refren.py,sha256=jqJe8l2Qdjz0fcKWQwyxeHpUyda_cvZggJdgot6ta8A,8461
2
+ refren-0.1.0.dist-info/METADATA,sha256=qU5yYrlaxr1Es_9To5YFu7Nd5I-v2cHAh5YQZBfg8OY,726
3
+ refren-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
4
+ refren-0.1.0.dist-info/entry_points.txt,sha256=MkoRFSweBva1PR16nAHJV12blb97lxLbF1KZ4UTqIfg,39
5
+ refren-0.1.0.dist-info/licenses/LICENSE,sha256=KFDwGS4OaI3VKJPFDHSmj8CakM5sD6WPDJN0_FN4MEQ,1068
6
+ refren-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ refren = refren:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 ljbuturovic
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
refren.py ADDED
@@ -0,0 +1,247 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Rename a scientific PDF as: FirstAuthorLastName_SecondAuthorLastName_JournalAbbrev_Year.pdf
4
+ Usage: ./refren.py <pdf_file>
5
+ """
6
+
7
+ import re
8
+ import sys
9
+ from pathlib import Path
10
+
11
+ import anthropic
12
+ import pdfplumber
13
+ from pydantic import BaseModel
14
+
15
+
16
# --- Journal abbreviation lookup ---
# Keys are lowercase journal titles; values are the filename-safe abbreviations.
JOURNAL_ABBREVIATIONS = {
    "diagnostic and prognostic research": "DiagProgRes",
    "nature medicine": "NatMed",
    "nature communications": "NatCommun",
    "nature": "Nature",
    "science": "Science",
    "cell": "Cell",
    "the lancet": "Lancet",
    "lancet": "Lancet",
    "new england journal of medicine": "NEJM",
    "n engl j med": "NEJM",
    "jama": "JAMA",
    "bmj": "BMJ",
    "annals of internal medicine": "AnnInternMed",
    "plos medicine": "PLoSMed",
    "plos one": "PLoSOne",
    "plos biology": "PLoSBiol",
    "plos computational biology": "PLoSComputBiol",
    "bioinformatics": "Bioinformatics",
    "nucleic acids research": "NucleicAcidsRes",
    "genome biology": "GenomeBiol",
    "genome research": "GenomeRes",
    "molecular cell": "MolCell",
    "cell reports": "CellRep",
    "cell systems": "CellSyst",
    "elife": "eLife",
    "journal of clinical oncology": "JClinOncol",
    "cancer research": "CancerRes",
    "cancer cell": "CancerCell",
    "clinical cancer research": "ClinCancerRes",
    "journal of the american medical informatics association": "JAMIA",
    "npj digital medicine": "NPJDigitMed",
    "artificial intelligence in medicine": "ArtifIntellMed",
    "medical image analysis": "MedImageAnal",
    "radiology": "Radiology",
    "european radiology": "EurRadiol",
    "circulation": "Circulation",
    "european heart journal": "EurHeartJ",
    "journal of the american college of cardiology": "JACC",
    "diabetes care": "DiabetesCare",
    "statistics in medicine": "StatMed",
    "biometrics": "Biometrics",
    "american journal of epidemiology": "AmJEpidemiol",
    "epidemiology": "Epidemiology",
    "international journal of epidemiology": "IntJEpidemiol",
    "brain": "Brain",
    "journal of neurology": "JNeurol",
    "neurology": "Neurology",
    "gut": "Gut",
}

# Whole-word matchers for every known title, longest title first, so that a
# specific title ("molecular cell") always wins over a shorter title it
# contains ("cell"), independent of the order of the table above.
_JOURNAL_MATCHERS = [
    (re.compile(rf"(?<!\w){re.escape(title)}(?!\w)"), abbr)
    for title, abbr in sorted(
        JOURNAL_ABBREVIATIONS.items(),
        key=lambda item: len(item[0]),
        reverse=True,
    )
]


def abbreviate_journal(full_name: str) -> str:
    """Return a known abbreviation or derive a CamelCase abbreviation from the title.

    The title is lowercased and whitespace-normalized, then compared against
    JOURNAL_ABBREVIATIONS. A table title only matches as a run of whole words
    (so "cell" no longer matches inside "Excellence"), and the longest
    matching title wins (so "Molecular Cell" yields "MolCell", not "Cell").

    When no table title matches, punctuation is removed, common stop words
    are dropped, and the remaining words are capitalized and concatenated.
    An input with no usable words yields "UnknownJournal".

    Note: a known title embedded in a longer, unknown title still matches
    (e.g. "The Lancet Oncology" -> "Lancet"); only exact table coverage can
    resolve such cases.
    """
    # Collapse newlines and repeated spaces so titles wrapped across lines in
    # extracted PDF text still match the single-space table keys.
    normalized = " ".join(full_name.lower().split())
    for matcher, abbr in _JOURNAL_MATCHERS:
        if matcher.search(normalized):
            return abbr

    stop = {"a", "an", "the", "of", "in", "on", "and", "for", "to", "with", "&"}
    words = re.sub(r"[^\w\s]", "", full_name).split()
    return "".join(w.capitalize() for w in words if w.lower() not in stop) or "UnknownJournal"
78
+
79
+
80
def extract_last_name(name: str) -> str:
    """Return the final whitespace-separated word of an author name.

    Surrounding whitespace is removed first, then any trailing commas,
    asterisks, digits, daggers and spaces (typical affiliation markers).
    If nothing remains, the empty cleaned string is returned.
    """
    cleaned = name.strip().rstrip(",*0123456789† ")
    tokens = cleaned.split()
    if not tokens:
        return cleaned
    return tokens[-1]
85
+
86
+
87
def parse_authors_and_year(text: str):
    """
    Pull the first two author surnames and the publication year out of page text.

    Returns a (first, second, year) tuple; each element is None when the
    corresponding value could not be found. The year is the first standalone
    19xx/20xx number in the text.
    """
    year_hit = re.search(r"\b(20\d{2}|19\d{2})\b", text)
    year = year_hit.group(1) if year_hit else None

    # Pattern 1: running header of the form "Surname et al."
    header_hit = re.search(r"([A-Z][a-z]+(?:\s+[a-z]+)*)\s+et al\.", text)
    first = header_hit.group(1).strip() if header_hit else None
    second = None

    # Pattern 2: an author line, i.e. two or more comma-separated
    # "Firstname Lastname" entries, each optionally followed by affiliation
    # markers (digits, dagger, asterisk, comma).
    author_line_pattern = re.compile(
        r"([A-Z][a-z]+(?:\s+[a-z]+)*\s+[A-Z][a-zA-Z\-]+[0-9†*,]*"
        r"(?:\s*,\s*[A-Z][a-z]+(?:\s+[a-z]+)*\s+[A-Z][a-zA-Z\-]+[0-9†*,]*){1,})"
    )
    for hit in author_line_pattern.finditer(text):
        pieces = (piece.strip() for piece in re.split(r",\s*", hit.group(0)))
        names = [piece for piece in pieces if piece]
        if len(names) < 2:
            continue
        surnames = [extract_last_name(entry) for entry in names]
        # A surname taken from the "et al." header takes precedence for the
        # first author; the second author always comes from the author line.
        first = first or surnames[0]
        second = surnames[1]
        break

    return first, second, year
120
+
121
+
122
def extract_journal(text: str) -> str | None:
    """Try to find the journal name from the first-page text.

    Known titles from JOURNAL_ABBREVIATIONS are searched first,
    case-insensitively, as whole words and longest title first. A generic
    word such as "cell" or "science" therefore no longer matches inside a
    longer word or pre-empts a more specific title like "molecular cell".
    Words of a title may be separated by any whitespace (including a line
    break); the returned name has that whitespace collapsed to single spaces.

    If no known title is present, two generic patterns are tried: text after
    "published in"/"journal", and a capitalized line following a doi.org URL.

    Returns the journal name as it appears in the text, or None.
    """
    for key in sorted(JOURNAL_ABBREVIATIONS, key=len, reverse=True):
        spaced = r"\s+".join(re.escape(word) for word in key.split())
        # Search the original text (not a lowercased copy) so the match
        # offsets are always valid for the string being sliced.
        hit = re.search(rf"(?<!\w){spaced}(?!\w)", text, re.IGNORECASE)
        if hit:
            return " ".join(hit.group(0).split())

    patterns = [
        r"(?:published in|journal)[:\s]+([A-Z][^\n]+)",
        r"https?://doi\.org/[^\s]+\s+([A-Z][A-Za-z &]+)\n",
    ]
    for pat in patterns:
        m = re.search(pat, text, re.IGNORECASE)
        if m:
            return m.group(1).strip()
    return None
138
+
139
+
140
def sanitize(s: str) -> str:
    """Drop every character that is not a word character (letter, digit, underscore)."""
    unsafe = re.compile(r"[^\w]")
    return unsafe.sub("", s)
143
+
144
+
145
+ # --- Claude API fallback ---
146
+
147
class PaperMetadata(BaseModel):
    # Structured result requested from the Claude call in extract_via_llm
    # (passed there as output_format). All fields are required strings.
    # Documented with comments rather than a docstring so the model's
    # generated schema is unchanged.
    first_author_last_name: str
    second_author_last_name: str
    journal_full_name: str
    journal_abbreviation: str
    year: str
153
+
154
+
155
def extract_via_llm(text: str, metadata: dict | None = None) -> PaperMetadata:
    """Ask Claude for the paper's bibliographic metadata.

    Sends at most the first 4000 characters of ``text`` and, when
    ``metadata`` is non-empty, its string representation, and returns the
    response's ``parsed_output`` (requested in PaperMetadata form).
    Prints a one-line notice before making the call.
    """
    print(" (direct parsing incomplete — calling Claude API...)")
    client = anthropic.Anthropic()

    system_prompt = (
        "You are a scientific literature assistant. "
        "Extract bibliographic metadata from the first page of a scientific paper. "
        "For journal_abbreviation, use the standard ISO/NLM abbreviation (e.g. 'Circulation', "
        "'N Engl J Med', 'JAMA', 'Nat Med'). "
        "Return only last names for authors (no initials, no titles, no credentials)."
    )
    metadata_note = f"PDF metadata: {metadata}\n\n" if metadata else ""
    user_prompt = (
        "Extract the metadata from this scientific paper.\n\n"
        + metadata_note
        + f"First page text:\n{text[:4000]}"
    )

    reply = client.messages.parse(
        model="claude-opus-4-6",
        max_tokens=512,
        system=system_prompt,
        messages=[{"role": "user", "content": user_prompt}],
        output_format=PaperMetadata,
    )
    return reply.parsed_output
180
+
181
+
182
def rename_pdf(pdf_path: str):
    """Interactively rename a PDF to FirstAuthor_SecondAuthor_Journal_Year.pdf.

    The first page's text (plus the second page when the first yields fewer
    than 200 characters) is parsed with regexes. If any of first author,
    second author, year or journal is missing, all fields are taken from
    the Claude API instead. The proposed name is printed and the file is
    renamed only after the user answers "y".

    Components that are empty after sanitizing (e.g. no second author) are
    left out of the filename rather than leaving a doubled underscore.

    Exits with status 1 (message on stderr) when the file does not exist,
    is not a ``.pdf``, has no pages, yields no usable name components, or
    when a different file already occupies the target name. ``Path.rename``
    would otherwise silently replace that file on POSIX systems.
    """
    path = Path(pdf_path)
    if not path.exists():
        print(f"Error: file not found: {pdf_path}", file=sys.stderr)
        sys.exit(1)
    if path.suffix.lower() != ".pdf":
        print(f"Error: not a PDF file: {pdf_path}", file=sys.stderr)
        sys.exit(1)

    with pdfplumber.open(path) as pdf:
        if not pdf.pages:
            print(f"Error: PDF has no pages: {pdf_path}", file=sys.stderr)
            sys.exit(1)
        metadata = pdf.metadata or {}
        first_page_text = pdf.pages[0].extract_text() or ""
        # A near-empty first page is usually a cover sheet; add the next page.
        if len(first_page_text) < 200 and len(pdf.pages) > 1:
            first_page_text += "\n" + (pdf.pages[1].extract_text() or "")

    first, second, year = parse_authors_and_year(first_page_text)
    journal_full = extract_journal(first_page_text)
    journal_abbr = abbreviate_journal(journal_full) if journal_full else None

    # Try to extract year from PDF metadata if not found in text
    if not year:
        meta_str = " ".join(str(v) for v in metadata.values())
        m = re.search(r"\b(20\d{2}|19\d{2})\b", meta_str)
        if m:
            year = m.group(1)

    missing = [label for label, val in [
        ("first author", first), ("second author", second),
        ("year", year), ("journal", journal_full),
    ] if not val]

    if missing:
        # When LLM is needed, trust it for all fields — regex results may also be wrong
        meta = extract_via_llm(first_page_text, metadata)
        first = meta.first_author_last_name
        second = meta.second_author_last_name
        year = meta.year
        journal_full = meta.journal_full_name
        journal_abbr = meta.journal_abbreviation

    print(f" First author last name : {first}")
    print(f" Second author last name: {second}")
    print(f" Journal : {journal_full} -> {journal_abbr}")
    print(f" Year : {year}")

    # Keep only components that survive sanitizing.
    sanitized = (sanitize(value or "") for value in (first, second, journal_abbr, year))
    components = [part for part in sanitized if part]
    if not components:
        print("Error: could not determine any metadata for the new name.", file=sys.stderr)
        sys.exit(1)
    new_name = "_".join(components) + ".pdf"
    new_path = path.parent / new_name

    print(f"\n {path.name} -> {new_name}")

    if new_path.exists():
        # samefile also covers case-insensitive filesystems, where the target
        # may "exist" only because it is the source file itself.
        if new_path.samefile(path):
            if new_path.name == path.name:
                print("File already has this name; nothing to do.")
                return
        else:
            print(f"Error: target already exists: {new_path}", file=sys.stderr)
            sys.exit(1)

    confirm = input("Rename? [y/N] ").strip().lower()
    if confirm == "y":
        path.rename(new_path)
        print(f"Renamed to: {new_path}")
    else:
        print("Aborted.")
237
+
238
+
239
def main():
    """Command-line entry point: rename the single PDF given as the only argument.

    Prints a usage message to stderr and exits with status 1 unless exactly
    one argument is supplied.
    """
    if len(sys.argv) != 2:
        # The installed console script is named "refren".
        print("Usage: refren <pdf_file>", file=sys.stderr)
        sys.exit(1)
    rename_pdf(sys.argv[1])
244
+
245
+
246
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()