refren 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- refren-0.1.0/.gitignore +207 -0
- refren-0.1.0/.python-version +1 -0
- refren-0.1.0/1758-2946-6-10.pdf +0 -0
- refren-0.1.0/Krstajic_Buturovic_JCheminform_2014.pdf +0 -0
- refren-0.1.0/LICENSE +21 -0
- refren-0.1.0/PKG-INFO +26 -0
- refren-0.1.0/README.md +15 -0
- refren-0.1.0/main.py +6 -0
- refren-0.1.0/pyproject.toml +25 -0
- refren-0.1.0/refren.py +247 -0
- refren-0.1.0/requirements.txt +137 -0
- refren-0.1.0/uv.lock +997 -0
refren-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
#Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
#uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
#poetry.lock
|
|
109
|
+
#poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
#pdm.lock
|
|
116
|
+
#pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
#pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# SageMath parsed files
|
|
135
|
+
*.sage.py
|
|
136
|
+
|
|
137
|
+
# Environments
|
|
138
|
+
.env
|
|
139
|
+
.envrc
|
|
140
|
+
.venv
|
|
141
|
+
env/
|
|
142
|
+
venv/
|
|
143
|
+
ENV/
|
|
144
|
+
env.bak/
|
|
145
|
+
venv.bak/
|
|
146
|
+
|
|
147
|
+
# Spyder project settings
|
|
148
|
+
.spyderproject
|
|
149
|
+
.spyproject
|
|
150
|
+
|
|
151
|
+
# Rope project settings
|
|
152
|
+
.ropeproject
|
|
153
|
+
|
|
154
|
+
# mkdocs documentation
|
|
155
|
+
/site
|
|
156
|
+
|
|
157
|
+
# mypy
|
|
158
|
+
.mypy_cache/
|
|
159
|
+
.dmypy.json
|
|
160
|
+
dmypy.json
|
|
161
|
+
|
|
162
|
+
# Pyre type checker
|
|
163
|
+
.pyre/
|
|
164
|
+
|
|
165
|
+
# pytype static type analyzer
|
|
166
|
+
.pytype/
|
|
167
|
+
|
|
168
|
+
# Cython debug symbols
|
|
169
|
+
cython_debug/
|
|
170
|
+
|
|
171
|
+
# PyCharm
|
|
172
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
173
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
174
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
175
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
176
|
+
#.idea/
|
|
177
|
+
|
|
178
|
+
# Abstra
|
|
179
|
+
# Abstra is an AI-powered process automation framework.
|
|
180
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
181
|
+
# Learn more at https://abstra.io/docs
|
|
182
|
+
.abstra/
|
|
183
|
+
|
|
184
|
+
# Visual Studio Code
|
|
185
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
186
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
187
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
188
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
189
|
+
# .vscode/
|
|
190
|
+
|
|
191
|
+
# Ruff stuff:
|
|
192
|
+
.ruff_cache/
|
|
193
|
+
|
|
194
|
+
# PyPI configuration file
|
|
195
|
+
.pypirc
|
|
196
|
+
|
|
197
|
+
# Cursor
|
|
198
|
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
|
199
|
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
|
200
|
+
# refer to https://docs.cursor.com/context/ignore-files
|
|
201
|
+
.cursorignore
|
|
202
|
+
.cursorindexingignore
|
|
203
|
+
|
|
204
|
+
# Marimo
|
|
205
|
+
marimo/_static/
|
|
206
|
+
marimo/_lsp/
|
|
207
|
+
__marimo__/
|
|
refren-0.1.0/.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
Binary file
|
|
Binary file
|
refren-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 ljbuturovic
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
refren-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: refren
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Scientific manuscript PDF file renamer
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Python: >=3.12
|
|
8
|
+
Requires-Dist: anthropic>=0.86.0
|
|
9
|
+
Requires-Dist: pdfplumber>=0.11
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
# refren: scientific manuscript PDF file renamer
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
$ ./refren.py 1758-2946-6-10.pdf
|
|
16
|
+
(direct parsing incomplete — calling Claude API...)
|
|
17
|
+
First author last name : Krstajic
|
|
18
|
+
Second author last name: Buturovic
|
|
19
|
+
Journal : Journal of Cheminformatics -> J Cheminform
|
|
20
|
+
Year : 2014
|
|
21
|
+
|
|
22
|
+
1758-2946-6-10.pdf -> Krstajic_Buturovic_JCheminform_2014.pdf
|
|
23
|
+
Rename? [y/N] y
|
|
24
|
+
Renamed to: Krstajic_Buturovic_JCheminform_2014.pdf
|
|
25
|
+
```
|
|
26
|
+
|
refren-0.1.0/README.md
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# refren: scientific manuscript PDF file renamer
|
|
2
|
+
|
|
3
|
+
```bash
|
|
4
|
+
$ ./refren.py 1758-2946-6-10.pdf
|
|
5
|
+
(direct parsing incomplete — calling Claude API...)
|
|
6
|
+
First author last name : Krstajic
|
|
7
|
+
Second author last name: Buturovic
|
|
8
|
+
Journal : Journal of Cheminformatics -> J Cheminform
|
|
9
|
+
Year : 2014
|
|
10
|
+
|
|
11
|
+
1758-2946-6-10.pdf -> Krstajic_Buturovic_JCheminform_2014.pdf
|
|
12
|
+
Rename? [y/N] y
|
|
13
|
+
Renamed to: Krstajic_Buturovic_JCheminform_2014.pdf
|
|
14
|
+
```
|
|
15
|
+
|
refren-0.1.0/main.py
ADDED
|
(6 added lines; content not shown in this rendering)
|
refren-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "refren"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Scientific manuscript PDF file renamer"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.12"
|
|
7
|
+
license = { text = "MIT" }
|
|
8
|
+
dependencies = [
|
|
9
|
+
"anthropic>=0.86.0",
|
|
10
|
+
"pdfplumber>=0.11",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
[project.scripts]
|
|
14
|
+
refren = "refren:main"
|
|
15
|
+
|
|
16
|
+
[build-system]
|
|
17
|
+
requires = ["hatchling"]
|
|
18
|
+
build-backend = "hatchling.build"
|
|
19
|
+
|
|
20
|
+
[dependency-groups]
|
|
21
|
+
dev = [
|
|
22
|
+
"build>=1.4.2",
|
|
23
|
+
"hatchling>=1.29.0",
|
|
24
|
+
"twine>=6.2.0",
|
|
25
|
+
]
|
refren-0.1.0/refren.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Rename a scientific PDF as: FirstAuthorLastName_SecondAuthorLastName_JournalAbbrev_Year.pdf
|
|
4
|
+
Usage: ./refren.py <pdf_file>
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
import sys
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
import anthropic
|
|
12
|
+
import pdfplumber
|
|
13
|
+
from pydantic import BaseModel
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# --- Journal abbreviation lookup ---
|
|
17
|
+
# Lower-cased journal name -> filename-friendly abbreviation.
# Lookups must not depend on insertion order: several short names ("cell",
# "nature", "radiology") are also the tail of longer, more specific entries.
JOURNAL_ABBREVIATIONS = {
    "diagnostic and prognostic research": "DiagProgRes",
    "nature medicine": "NatMed",
    "nature communications": "NatCommun",
    "nature": "Nature",
    "science": "Science",
    "cell": "Cell",
    "the lancet": "Lancet",
    "lancet": "Lancet",
    "new england journal of medicine": "NEJM",
    "n engl j med": "NEJM",
    "jama": "JAMA",
    "bmj": "BMJ",
    "annals of internal medicine": "AnnInternMed",
    "plos medicine": "PLoSMed",
    "plos one": "PLoSOne",
    "plos biology": "PLoSBiol",
    "plos computational biology": "PLoSComputBiol",
    "bioinformatics": "Bioinformatics",
    "nucleic acids research": "NucleicAcidsRes",
    "genome biology": "GenomeBiol",
    "genome research": "GenomeRes",
    "molecular cell": "MolCell",
    "cell reports": "CellRep",
    "cell systems": "CellSyst",
    "elife": "eLife",
    "journal of clinical oncology": "JClinOncol",
    "cancer research": "CancerRes",
    "cancer cell": "CancerCell",
    "clinical cancer research": "ClinCancerRes",
    "journal of the american medical informatics association": "JAMIA",
    "npj digital medicine": "NPJDigitMed",
    "artificial intelligence in medicine": "ArtifIntellMed",
    "medical image analysis": "MedImageAnal",
    "radiology": "Radiology",
    "european radiology": "EurRadiol",
    "circulation": "Circulation",
    "european heart journal": "EurHeartJ",
    "journal of the american college of cardiology": "JACC",
    "diabetes care": "DiabetesCare",
    "statistics in medicine": "StatMed",
    "biometrics": "Biometrics",
    "american journal of epidemiology": "AmJEpidemiol",
    "epidemiology": "Epidemiology",
    "international journal of epidemiology": "IntJEpidemiol",
    "brain": "Brain",
    "journal of neurology": "JNeurol",
    "neurology": "Neurology",
    "gut": "Gut",
}


def abbreviate_journal(full_name: str) -> str:
    """Return a known abbreviation or derive a CamelCase one from the title.

    Resolution order:

    1. Exact, case-insensitive match against ``JOURNAL_ABBREVIATIONS``.
    2. The longest known journal name that occurs in *full_name* as whole
       words. Longest-first keeps "Molecular Cell" from resolving to "Cell",
       and the word-boundary check keeps "Signature" from matching "nature".
    3. Fallback: strip punctuation, drop stop words, capitalize each
       remaining word and join them. ``"UnknownJournal"`` is returned when
       nothing is left.
    """
    lower = full_name.lower().strip()

    exact = JOURNAL_ABBREVIATIONS.get(lower)
    if exact is not None:
        return exact

    # Most specific (longest) name wins; sorted() is stable, so equal-length
    # keys keep their table order.
    for key in sorted(JOURNAL_ABBREVIATIONS, key=len, reverse=True):
        if re.search(rf"\b{re.escape(key)}\b", lower):
            return JOURNAL_ABBREVIATIONS[key]

    stop = {"a", "an", "the", "of", "in", "on", "and", "for", "to", "with", "&"}
    words = re.sub(r"[^\w\s]", "", full_name).split()
    return "".join(w.capitalize() for w in words if w.lower() not in stop) or "UnknownJournal"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def extract_last_name(name: str) -> str:
    """Return the final whitespace-separated word of an author name.

    Surrounding whitespace and trailing footnote markers (commas, asterisks,
    digits, daggers) are removed first. If nothing remains, the stripped
    (empty) string is returned.
    """
    trimmed = name.strip().rstrip(",*0123456789† ")
    tokens = trimmed.split()
    if not tokens:
        return trimmed
    return tokens[-1]
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def parse_authors_and_year(text: str) -> tuple[str | None, str | None, str | None]:
    """Extract (first_author_last, second_author_last, year) from first-page text.

    Args:
        text: Text extracted from the first page(s) of the PDF.

    Returns:
        A ``(first, second, year)`` tuple; any element is ``None`` when it
        could not be found. ``year`` is the first standalone 19xx/20xx
        number in *text*, returned as a string.
    """
    year = None
    year_match = re.search(r"\b(20\d{2}|19\d{2})\b", text)
    if year_match:
        year = year_match.group(1)

    first, second = None, None

    # Pattern 1: "Surname et al." header
    et_al_match = re.search(r"([A-Z][a-z]+(?:\s+[a-z]+)*)\s+et al\.", text)
    if et_al_match:
        first = et_al_match.group(1).strip()

    # Pattern 2: full author line — comma-separated "Firstname Lastname, ..."
    author_line_pattern = re.compile(
        r"([A-Z][a-z]+(?:\s+[a-z]+)*\s+[A-Z][a-zA-Z\-]+[0-9†*,]*"
        r"(?:\s*,\s*[A-Z][a-z]+(?:\s+[a-z]+)*\s+[A-Z][a-zA-Z\-]+[0-9†*,]*){1,})"
    )
    for match in author_line_pattern.finditer(text):
        fragments = re.split(r",\s*", match.group(0))
        # Affiliation markers such as "Smith1,2" are split on their inner
        # comma, producing fragments like "2" that reduce to an empty last
        # name. Drop them so they are not mistaken for the second author.
        last_names = [
            last
            for last in (extract_last_name(fragment) for fragment in fragments)
            if last
        ]
        if len(last_names) >= 2:
            if not first:
                first = last_names[0]
            second = last_names[1]
            break

    return first, second, year
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def extract_journal(text: str) -> str | None:
    """Try to find the journal name in the first-page text.

    Known journal names from ``JOURNAL_ABBREVIATIONS`` are tried first,
    longest name first and only as whole words. This way a specific title
    ("Molecular Cell") is not shadowed by a shorter one ("Cell"), and
    ordinary words that merely contain a journal name ("signature", "cells")
    do not match. The matched text is returned with its original casing.

    If no known name is present, two generic patterns are tried: text
    following "published in" / "journal", and a capitalized line following a
    doi.org URL.

    Returns:
        The journal name as it appears in *text*, or ``None`` if nothing
        matched.
    """
    for key in sorted(JOURNAL_ABBREVIATIONS, key=len, reverse=True):
        # Search the original text case-insensitively rather than slicing it
        # with indices taken from text.lower(), which can differ in length.
        known = re.search(rf"\b{re.escape(key)}\b", text, re.IGNORECASE)
        if known:
            return known.group(0)

    patterns = [
        r"(?:published in|journal)[:\s]+([A-Z][^\n]+)",
        r"https?://doi\.org/[^\s]+\s+([A-Z][A-Za-z &]+)\n",
    ]
    for pat in patterns:
        m = re.search(pat, text, re.IGNORECASE)
        if m:
            return m.group(1).strip()
    return None
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def sanitize(s: str) -> str:
    """Strip every non-word character so the result is safe in a filename.

    Only characters matched by the regex class ``\\w`` (letters, digits and
    underscore) are kept; spaces, punctuation and hyphens are removed.
    """
    unsafe_chars = re.compile(r"[^\w]")
    cleaned = unsafe_chars.sub("", s)
    return cleaned
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# --- Claude API fallback ---
|
|
146
|
+
|
|
147
|
+
# Structured-output schema: extract_via_llm() passes this class to
# client.messages.parse() as output_format and returns the parsed instance.
# Documented with comments rather than a docstring so the model definition
# itself stays unchanged.
class PaperMetadata(BaseModel):
    # Last name only; the system prompt asks for no initials, titles or credentials.
    first_author_last_name: str
    # Last name only. The field is required, so a value is expected even for
    # single-author papers — NOTE(review): confirm how those should be handled.
    second_author_last_name: str
    # Full journal title; rename_pdf() uses it only for display.
    journal_full_name: str
    # The system prompt asks for the standard ISO/NLM abbreviation (e.g. "Nat Med");
    # rename_pdf() passes it through sanitize() before building the filename.
    journal_abbreviation: str
    # Publication year kept as a string; it is passed to sanitize() unchanged.
    year: str
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def extract_via_llm(text: str, metadata: dict | None = None) -> PaperMetadata:
    """Use Claude to extract paper metadata from first-page text.

    Args:
        text: Text extracted from the first page(s); only the first 4000
            characters are sent.
        metadata: Optional PDF document metadata. When truthy, its string
            representation is included in the prompt.

    Returns:
        The ``PaperMetadata`` instance parsed from the model's response.

    Raises:
        RuntimeError: If the response contains no parsed output, so callers
            never receive ``None`` in place of a ``PaperMetadata``.
    """
    print(" (direct parsing incomplete — calling Claude API...)")
    # No explicit credentials are passed; presumably the client reads them
    # from the environment — confirm against the SDK documentation.
    client = anthropic.Anthropic()
    response = client.messages.parse(
        model="claude-opus-4-6",
        max_tokens=512,
        system=(
            "You are a scientific literature assistant. "
            "Extract bibliographic metadata from the first page of a scientific paper. "
            "For journal_abbreviation, use the standard ISO/NLM abbreviation (e.g. 'Circulation', "
            "'N Engl J Med', 'JAMA', 'Nat Med'). "
            "Return only last names for authors (no initials, no titles, no credentials)."
        ),
        messages=[{
            "role": "user",
            "content": (
                "Extract the metadata from this scientific paper.\n\n"
                + (f"PDF metadata: {metadata}\n\n" if metadata else "")
                + f"First page text:\n{text[:4000]}"
            ),
        }],
        output_format=PaperMetadata,
    )
    parsed = response.parsed_output
    if parsed is None:
        # Fail here with a clear message instead of letting the caller hit an
        # AttributeError on None.
        raise RuntimeError("Claude API response did not contain parseable paper metadata")
    return parsed
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def rename_pdf(pdf_path: str):
    """Interactively rename a PDF to First_Second_JournalAbbrev_Year.pdf.

    Steps:
        1. Validate that *pdf_path* exists and has a ``.pdf`` suffix.
        2. Read the document metadata and the first page's text. The second
           page is appended when the first yields fewer than 200 characters.
        3. Parse authors, year and journal with the regex helpers. If the
           year is still missing, look for one in the document metadata.
        4. If any field is missing, call the Claude API and take all fields
           from its answer.
        5. Show the proposed name and rename only after the user answers "y".

    The process exits with status 1 when the input file is missing, is not a
    PDF, has no pages, or when a different file already has the target name.

    Args:
        pdf_path: Path of the PDF file to rename. The file is renamed within
            its own directory.
    """
    path = Path(pdf_path)
    if not path.exists():
        print(f"Error: file not found: {pdf_path}")
        sys.exit(1)
    if path.suffix.lower() != ".pdf":
        print(f"Error: not a PDF file: {pdf_path}")
        sys.exit(1)

    with pdfplumber.open(path) as pdf:
        metadata = pdf.metadata or {}
        pages = pdf.pages
        if not pages:
            # Indexing pages[0] on an empty document would raise IndexError.
            print(f"Error: PDF has no pages: {pdf_path}")
            sys.exit(1)
        first_page_text = pages[0].extract_text() or ""
        if len(first_page_text) < 200 and len(pages) > 1:
            first_page_text += "\n" + (pages[1].extract_text() or "")

    first, second, year = parse_authors_and_year(first_page_text)
    journal_full = extract_journal(first_page_text)
    journal_abbr = abbreviate_journal(journal_full) if journal_full else None

    # Try to extract year from PDF metadata if not found in text
    if not year:
        meta_str = " ".join(str(v) for v in metadata.values())
        m = re.search(r"\b(20\d{2}|19\d{2})\b", meta_str)
        if m:
            year = m.group(1)

    missing = [label for label, val in [
        ("first author", first), ("second author", second),
        ("year", year), ("journal", journal_full),
    ] if not val]

    if missing:
        # When LLM is needed, trust it for all fields — regex results may also be wrong
        meta = extract_via_llm(first_page_text, metadata)
        first = meta.first_author_last_name
        second = meta.second_author_last_name
        year = meta.year
        journal_full = meta.journal_full_name
        journal_abbr = meta.journal_abbreviation

    print(f" First author last name : {first}")
    print(f" Second author last name: {second}")
    print(f" Journal : {journal_full} -> {journal_abbr}")
    print(f" Year : {year}")

    new_name = f"{sanitize(first)}_{sanitize(second)}_{sanitize(journal_abbr)}_{sanitize(year)}.pdf"
    new_path = path.parent / new_name

    # Path.rename() can silently replace an existing file; refuse to clobber
    # another document. Renaming a file onto itself is still allowed.
    if new_path.exists() and not new_path.samefile(path):
        print(f"Error: target already exists, refusing to overwrite: {new_path}")
        sys.exit(1)

    print(f"\n {path.name} -> {new_name}")
    confirm = input("Rename? [y/N] ").strip().lower()
    if confirm == "y":
        path.rename(new_path)
        print(f"Renamed to: {new_path}")
    else:
        print("Aborted.")
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def main():
    """Command-line entry point: rename the PDF given as the only argument.

    Prints a usage line and exits with status 1 unless exactly one argument
    follows the program name.
    """
    if len(sys.argv) != 2:
        # The script is refren.py; the previous message named a non-existent renamer.py.
        print("Usage: ./refren.py <pdf_file>")
        sys.exit(1)
    rename_pdf(sys.argv[1])


if __name__ == "__main__":
    main()
|
|
refren-0.1.0/requirements.txt
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# This file was autogenerated by uv via the following command:
|
|
2
|
+
# uv export --format requirements-txt --no-hashes -o requirements.txt
|
|
3
|
+
-e .
|
|
4
|
+
annotated-types==0.7.0
|
|
5
|
+
# via pydantic
|
|
6
|
+
anthropic==0.86.0
|
|
7
|
+
# via refren
|
|
8
|
+
anyio==4.13.0
|
|
9
|
+
# via
|
|
10
|
+
# anthropic
|
|
11
|
+
# httpx
|
|
12
|
+
build==1.4.2
|
|
13
|
+
certifi==2026.2.25
|
|
14
|
+
# via
|
|
15
|
+
# httpcore
|
|
16
|
+
# httpx
|
|
17
|
+
# requests
|
|
18
|
+
cffi==2.0.0 ; platform_python_implementation != 'PyPy'
|
|
19
|
+
# via cryptography
|
|
20
|
+
charset-normalizer==3.4.6
|
|
21
|
+
# via
|
|
22
|
+
# pdfminer-six
|
|
23
|
+
# requests
|
|
24
|
+
colorama==0.4.6 ; os_name == 'nt'
|
|
25
|
+
# via build
|
|
26
|
+
cryptography==46.0.6
|
|
27
|
+
# via
|
|
28
|
+
# pdfminer-six
|
|
29
|
+
# secretstorage
|
|
30
|
+
distro==1.9.0
|
|
31
|
+
# via anthropic
|
|
32
|
+
docstring-parser==0.17.0
|
|
33
|
+
# via anthropic
|
|
34
|
+
docutils==0.22.4
|
|
35
|
+
# via readme-renderer
|
|
36
|
+
h11==0.16.0
|
|
37
|
+
# via httpcore
|
|
38
|
+
hatchling==1.29.0
|
|
39
|
+
httpcore==1.0.9
|
|
40
|
+
# via httpx
|
|
41
|
+
httpx==0.28.1
|
|
42
|
+
# via anthropic
|
|
43
|
+
id==1.6.1
|
|
44
|
+
# via twine
|
|
45
|
+
idna==3.11
|
|
46
|
+
# via
|
|
47
|
+
# anyio
|
|
48
|
+
# httpx
|
|
49
|
+
# requests
|
|
50
|
+
jaraco-classes==3.4.0 ; platform_machine != 'ppc64le' and platform_machine != 's390x'
|
|
51
|
+
# via keyring
|
|
52
|
+
jaraco-context==6.1.2 ; platform_machine != 'ppc64le' and platform_machine != 's390x'
|
|
53
|
+
# via keyring
|
|
54
|
+
jaraco-functools==4.4.0 ; platform_machine != 'ppc64le' and platform_machine != 's390x'
|
|
55
|
+
# via keyring
|
|
56
|
+
jeepney==0.9.0 ; platform_machine != 'ppc64le' and platform_machine != 's390x' and sys_platform == 'linux'
|
|
57
|
+
# via
|
|
58
|
+
# keyring
|
|
59
|
+
# secretstorage
|
|
60
|
+
jiter==0.13.0
|
|
61
|
+
# via anthropic
|
|
62
|
+
keyring==25.7.0 ; platform_machine != 'ppc64le' and platform_machine != 's390x'
|
|
63
|
+
# via twine
|
|
64
|
+
markdown-it-py==4.0.0
|
|
65
|
+
# via rich
|
|
66
|
+
mdurl==0.1.2
|
|
67
|
+
# via markdown-it-py
|
|
68
|
+
more-itertools==10.8.0 ; platform_machine != 'ppc64le' and platform_machine != 's390x'
|
|
69
|
+
# via
|
|
70
|
+
# jaraco-classes
|
|
71
|
+
# jaraco-functools
|
|
72
|
+
nh3==0.3.4
|
|
73
|
+
# via readme-renderer
|
|
74
|
+
packaging==26.0
|
|
75
|
+
# via
|
|
76
|
+
# build
|
|
77
|
+
# hatchling
|
|
78
|
+
# twine
|
|
79
|
+
pathspec==1.0.4
|
|
80
|
+
# via hatchling
|
|
81
|
+
pdfminer-six==20251230
|
|
82
|
+
# via pdfplumber
|
|
83
|
+
pdfplumber==0.11.9
|
|
84
|
+
# via refren
|
|
85
|
+
pillow==12.1.1
|
|
86
|
+
# via pdfplumber
|
|
87
|
+
pluggy==1.6.0
|
|
88
|
+
# via hatchling
|
|
89
|
+
pycparser==3.0 ; implementation_name != 'PyPy' and platform_python_implementation != 'PyPy'
|
|
90
|
+
# via cffi
|
|
91
|
+
pydantic==2.12.5
|
|
92
|
+
# via anthropic
|
|
93
|
+
pydantic-core==2.41.5
|
|
94
|
+
# via pydantic
|
|
95
|
+
pygments==2.19.2
|
|
96
|
+
# via
|
|
97
|
+
# readme-renderer
|
|
98
|
+
# rich
|
|
99
|
+
pypdfium2==5.6.0
|
|
100
|
+
# via pdfplumber
|
|
101
|
+
pyproject-hooks==1.2.0
|
|
102
|
+
# via build
|
|
103
|
+
pywin32-ctypes==0.2.3 ; platform_machine != 'ppc64le' and platform_machine != 's390x' and sys_platform == 'win32'
|
|
104
|
+
# via keyring
|
|
105
|
+
readme-renderer==44.0
|
|
106
|
+
# via twine
|
|
107
|
+
requests==2.33.0
|
|
108
|
+
# via
|
|
109
|
+
# requests-toolbelt
|
|
110
|
+
# twine
|
|
111
|
+
requests-toolbelt==1.0.0
|
|
112
|
+
# via twine
|
|
113
|
+
rfc3986==2.0.0
|
|
114
|
+
# via twine
|
|
115
|
+
rich==14.3.3
|
|
116
|
+
# via twine
|
|
117
|
+
secretstorage==3.5.0 ; platform_machine != 'ppc64le' and platform_machine != 's390x' and sys_platform == 'linux'
|
|
118
|
+
# via keyring
|
|
119
|
+
sniffio==1.3.1
|
|
120
|
+
# via anthropic
|
|
121
|
+
trove-classifiers==2026.1.14.14
|
|
122
|
+
# via hatchling
|
|
123
|
+
twine==6.2.0
|
|
124
|
+
typing-extensions==4.15.0
|
|
125
|
+
# via
|
|
126
|
+
# anthropic
|
|
127
|
+
# anyio
|
|
128
|
+
# pydantic
|
|
129
|
+
# pydantic-core
|
|
130
|
+
# typing-inspection
|
|
131
|
+
typing-inspection==0.4.2
|
|
132
|
+
# via pydantic
|
|
133
|
+
urllib3==2.6.3
|
|
134
|
+
# via
|
|
135
|
+
# id
|
|
136
|
+
# requests
|
|
137
|
+
# twine
|