commonmeta-py 0.101__py3-none-any.whl → 0.104__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- commonmeta/__init__.py +51 -50
- commonmeta/base_utils.py +1 -0
- commonmeta/cli.py +6 -5
- commonmeta/constants.py +35 -1
- commonmeta/crossref_utils.py +11 -8
- commonmeta/date_utils.py +1 -0
- commonmeta/doi_utils.py +42 -14
- commonmeta/metadata.py +209 -100
- commonmeta/readers/cff_reader.py +1 -0
- commonmeta/readers/codemeta_reader.py +1 -0
- commonmeta/readers/commonmeta_reader.py +1 -0
- commonmeta/readers/crossref_reader.py +19 -18
- commonmeta/readers/csl_reader.py +4 -1
- commonmeta/readers/inveniordm_reader.py +14 -9
- commonmeta/readers/json_feed_reader.py +9 -3
- commonmeta/readers/kbase_reader.py +1 -0
- commonmeta/readers/openalex_reader.py +380 -0
- commonmeta/readers/ris_reader.py +1 -0
- commonmeta/resources/commonmeta_v0.16.json +21 -5
- commonmeta/schema_utils.py +1 -0
- commonmeta/utils.py +121 -16
- commonmeta/writers/bibtex_writer.py +1 -0
- commonmeta/writers/citation_writer.py +1 -0
- commonmeta/writers/crossref_xml_writer.py +1 -0
- commonmeta/writers/csl_writer.py +1 -0
- commonmeta/writers/datacite_writer.py +1 -0
- commonmeta/writers/ris_writer.py +1 -0
- commonmeta/writers/schema_org_writer.py +1 -0
- {commonmeta_py-0.101.dist-info → commonmeta_py-0.104.dist-info}/METADATA +5 -8
- {commonmeta_py-0.101.dist-info → commonmeta_py-0.104.dist-info}/RECORD +33 -32
- {commonmeta_py-0.101.dist-info → commonmeta_py-0.104.dist-info}/licenses/LICENSE +1 -1
- {commonmeta_py-0.101.dist-info → commonmeta_py-0.104.dist-info}/WHEEL +0 -0
- {commonmeta_py-0.101.dist-info → commonmeta_py-0.104.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,380 @@
|
|
1
|
+
"""OpenAlex reader for commonmeta-py"""
|
2
|
+
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
import httpx
|
6
|
+
from pydash import py_
|
7
|
+
|
8
|
+
from ..author_utils import get_authors
|
9
|
+
from ..base_utils import compact, presence, sanitize, wrap
|
10
|
+
from ..constants import (
|
11
|
+
CR_TO_CM_TRANSLATIONS,
|
12
|
+
OA_TO_CM_CONTAINER_TRANLATIONS,
|
13
|
+
OA_TO_CM_TRANSLATIONS,
|
14
|
+
Commonmeta,
|
15
|
+
)
|
16
|
+
from ..doi_utils import (
|
17
|
+
normalize_doi,
|
18
|
+
openalex_api_sample_url,
|
19
|
+
openalex_api_url,
|
20
|
+
)
|
21
|
+
from ..utils import (
|
22
|
+
dict_to_spdx,
|
23
|
+
normalize_url,
|
24
|
+
validate_openalex,
|
25
|
+
)
|
26
|
+
|
27
|
+
# Map OpenAlex license strings to SPDX licenseId. May not be the correct license version.
OA_LICENSES = {"cc-by": "CC-BY-4.0", "cc0": "CC0-1.0"}

# Map keys of the OpenAlex `ids` object to commonmeta identifierType values.
OA_IDENTIFIER_TYPES = {
    "openalex": "OpenAlex",
    "doi": "DOI",
    "mag": "MAG",
    "pmid": "PMID",
    "pmcid": "PMCID",
}
|
36
|
+
|
37
|
+
|
38
|
+
def get_openalex(pid: str, **kwargs) -> dict:
    """Fetch a work record from the OpenAlex API for the given DOI.

    Returns the decoded record tagged with ``via: openalex``, or a
    not-found state dict when the DOI is invalid or the request fails.
    """
    doi = normalize_doi(pid)
    if doi is None:
        return {"state": "not_found"}
    response = httpx.get(openalex_api_url(doi), timeout=10, **kwargs)
    if response.status_code != 200:
        return {"state": "not_found"}
    return {**response.json(), "via": "openalex"}
|
48
|
+
|
49
|
+
|
50
|
+
def read_openalex(data: Optional[dict], **kwargs) -> Commonmeta:
    """Convert an OpenAlex work record into commonmeta metadata.

    Args:
        data: A decoded OpenAlex work record, or None.
        **kwargs: Read options merged over the resulting dict.

    Returns:
        A commonmeta dict, or {"state": "not_found"} when data is None.
    """
    if data is None:
        return {"state": "not_found"}
    meta = data
    read_options = kwargs or {}

    doi = meta.get("doi", None)
    _id = normalize_doi(doi)
    # The commonmeta type comes from OpenAlex's Crossref-style type; the
    # native OpenAlex type becomes additionalType unless it is redundant.
    _type = CR_TO_CM_TRANSLATIONS.get(meta.get("type_crossref", None)) or "Other"
    additional_type = OA_TO_CM_TRANSLATIONS.get(meta.get("type", None))
    if additional_type == _type:
        additional_type = None

    archive_locations = []
    contributors = get_contributors(wrap(meta.get("authorships")))
    contributors = get_authors(contributors)

    # Prefer the landing page; fall back to the OpenAlex record URL.
    url = normalize_url(
        py_.get(meta, "primary_location.landing_page_url") or py_.get(meta, "id")
    )
    title = meta.get("title", None)
    if title is not None:
        titles = [{"title": sanitize(title)}]
    else:
        titles = None
    publisher = compact(
        {"name": py_.get(meta, "primary_location.source.host_organization_name")}
    )
    date = compact(
        {
            "published": py_.get(meta, "publication_date")
            or py_.get(meta, "created_date")
        }
    )
    # NOTE(review): raises KeyError if OpenAlex introduces an id type that
    # is missing from OA_IDENTIFIER_TYPES — confirm this is acceptable.
    identifiers = [
        {
            "identifier": uid,
            "identifierType": OA_IDENTIFIER_TYPES[uidType],
        }
        for uidType, uid in (meta.get("ids", {})).items()
    ]

    # Map the OpenAlex license shorthand to an SPDX id, then to a dict.
    license_ = py_.get(meta, "best_oa_location.license")
    if license_ is not None:
        license_ = OA_LICENSES.get(license_, license_)
        license_ = dict_to_spdx({"id": license_})
    container = get_container(meta)
    relations = []
    # referenced_works holds OpenAlex ids; resolve them to work records
    # via the API, then reduce each to commonmeta reference metadata.
    references = [
        get_related(i) for i in get_references(meta.get("referenced_works", []))
    ]
    funding_references = from_openalex_funding(wrap(meta.get("grants", None)))

    description = get_abstract(meta)
    if description is not None:
        descriptions = [{"description": sanitize(description), "type": "Abstract"}]
    else:
        descriptions = None

    subjects = py_.uniq(
        [
            {"subject": py_.get(i, "subfield.display_name")}
            for i in wrap(meta.get("topics", None))
        ]
    )
    files = get_files(meta)

    return {
        # required properties
        "id": _id,
        "type": _type,
        # recommended and optional properties
        "additionalType": additional_type,
        "archiveLocations": presence(archive_locations),
        "container": presence(container),
        "contributors": presence(contributors),
        "date": presence(date),
        "descriptions": presence(descriptions),
        "files": presence(files),
        "fundingReferences": presence(funding_references),
        "geoLocations": None,
        "identifiers": identifiers,
        "language": meta.get("language", None),
        "license": license_,
        "provider": "OpenAlex",
        "publisher": presence(publisher),
        "references": presence(references),
        "relations": presence(relations),
        "subjects": presence(subjects),
        "titles": presence(titles),
        "url": url,
        "version": meta.get("version", None),
    } | read_options
|
144
|
+
|
145
|
+
|
146
|
+
def get_abstract(meta):
    """Reconstruct the plain-text abstract from OpenAlex's inverted index.

    OpenAlex stores abstracts as ``{word: [positions]}``. Rebuild the word
    sequence by placing each word at its positions and joining with spaces.

    Args:
        meta: An OpenAlex work record (or None).

    Returns:
        The abstract string, or None when no inverted index is present.
    """
    # Flat key lookup — no need for pydash path traversal here.
    abstract_inverted_index = (meta or {}).get("abstract_inverted_index")
    if not abstract_inverted_index:
        return None

    # Allocate one slot per position; positions may arrive out of order.
    max_pos = max(
        p for positions in abstract_inverted_index.values() for p in positions
    )
    abstract_words = [""] * (max_pos + 1)
    for word, positions in abstract_inverted_index.items():
        for p in positions:
            abstract_words[p] = word
    return " ".join(abstract_words)
|
165
|
+
|
166
|
+
|
167
|
+
def get_contributors(contributors: list) -> list:
    """Map OpenAlex authorship records to contributor dicts."""

    def parse_contributor(contributor):
        # Collect institutional affiliations, keeping ROR id and name.
        affiliations = [
            compact(
                {
                    "id": institution.get("ror", None),
                    "name": institution.get("display_name", None),
                }
            )
            for institution in contributor.get("institutions", [])
        ]
        return compact(
            {
                "id": py_.get(contributor, "author.orcid"),
                "name": py_.get(contributor, "author.display_name"),
                "affiliations": affiliations,
            }
        )

    return [parse_contributor(c) for c in contributors]
|
191
|
+
|
192
|
+
|
193
|
+
def get_references(pids: list, **kwargs) -> list:
    """Resolve referenced works from OpenAlex by their pids.

    Used for retrieving metadata for citations and references which are
    not included in the OpenAlex record itself.
    """
    return get_openalex_works(pids)
|
199
|
+
|
200
|
+
|
201
|
+
def get_citations(citation_url: str, **kwargs) -> list:
    """Fetch the list of citing works from an OpenAlex citation URL.

    Returns the ``results`` list from the API response, or a not-found
    state dict when the request fails.
    """
    response = httpx.get(citation_url, timeout=10, **kwargs)
    if response.status_code != 200:
        return {"state": "not_found"}
    # Bug fix: the original decoded the body and then called .json() again
    # on the resulting dict, which raised AttributeError. Decode once.
    return response.json().get("results", [])
|
207
|
+
|
208
|
+
|
209
|
+
def get_related(related: Optional[dict]) -> Optional[dict]:
    """Reduce an OpenAlex work record to commonmeta reference metadata.

    Returns None when the input is not a dict; otherwise a compacted
    dict with id, contributor, title and bibliographic fields.
    """
    if related is None or not isinstance(related, dict):
        return None
    doi = related.get("doi", None)
    metadata = {
        "id": normalize_doi(doi) if doi else None,
        "contributor": related.get("author", None),
        "title": related.get("display_name", None),
        # Bug fix: dotted paths need py_.get for traversal — dict.get treated
        # them as literal keys, so publisher/containerTitle were always None.
        "publisher": py_.get(related, "primary_location.source.host_organization_name"),
        "publicationYear": related.get("publication_year", None),
        "volume": py_.get(related, "biblio.volume"),
        "issue": py_.get(related, "biblio.issue"),
        "firstPage": py_.get(related, "biblio.first_page"),
        "lastPage": py_.get(related, "biblio.last_page"),
        "containerTitle": py_.get(related, "primary_location.source.display_name"),
    }
    return compact(metadata)
|
229
|
+
|
230
|
+
|
231
|
+
def get_openalex_works(pids: list, **kwargs) -> list:
    """Get OpenAlex works, use batches of 49 to honor API limit."""
    # Split the ids into batches of 49 and OR-join them in one filter query.
    pid_batches = [pids[i : i + 49] for i in range(0, len(pids), 49)]
    works = []
    for pid_batch in pid_batches:
        ids = "|".join(pid_batch)
        url = f"https://api.openalex.org/works?filter=ids.openalex:{ids}"
        response = httpx.get(url, timeout=10, **kwargs)
        if response.status_code != 200:
            # NOTE(review): returns a state dict despite the list annotation
            # and discards works gathered from earlier batches — confirm
            # callers (e.g. get_references) handle this shape.
            return {"state": "not_found"}
        response = response.json()
        if py_.get(response, "count") == 0:
            return {"state": "not_found"}

        works.extend(response.get("results"))

    return works
|
248
|
+
|
249
|
+
|
250
|
+
def get_openalex_funders(pids: list, **kwargs) -> list:
    """Get ROR id and name from OpenAlex funders.
    use batches of 49 to honor API limit."""
    # Split the ids into batches of 49 and OR-join them in one filter query.
    pid_batches = [pids[i : i + 49] for i in range(0, len(pids), 49)]
    funders = []
    for pid_batch in pid_batches:
        ids = "|".join(pid_batch)
        url = f"https://api.openalex.org/funders?filter=ids.openalex:{ids}"
        response = httpx.get(url, timeout=10, **kwargs)
        if response.status_code != 200:
            # NOTE(review): returns a state dict despite the list annotation
            # and drops funders gathered from earlier batches — confirm
            # callers (e.g. from_openalex_funding) handle this shape.
            return {"state": "not_found"}
        response = response.json()
        if py_.get(response, "count") == 0:
            return {"state": "not_found"}

        # Reduce each funder record to its id, ROR id and display name.
        def format_funder(funder):
            return compact(
                {
                    "id": py_.get(funder, "id"),
                    "ror": py_.get(funder, "ids.ror"),
                    "name": py_.get(funder, "display_name"),
                }
            )

        f = [format_funder(i) for i in response.get("results")]
        funders.extend(f)

    return funders
|
278
|
+
|
279
|
+
|
280
|
+
def get_openalex_source(str: Optional[str], **kwargs) -> Optional[dict]:
    """Get issn, name, homepage_url and type from OpenAlex source.

    Args:
        str: An OpenAlex source id or URL. (NOTE: the parameter name
            shadows the builtin ``str``; kept for backward compatibility.)

    Returns:
        None for an invalid id, a not-found state dict on HTTP errors,
        otherwise a compacted dict of source metadata.
    """
    # Renamed local: `id` shadowed the builtin of the same name.
    source_id = validate_openalex(str)
    if not source_id:
        return None

    url = f"https://api.openalex.org/sources/{source_id}"
    response = httpx.get(url, timeout=10, **kwargs)
    if response.status_code != 200:
        return {"state": "not_found"}
    response = response.json()
    # NOTE(review): the single-source endpoint returns one object; the
    # `count` check looks copied from the list endpoints — confirm intent.
    if py_.get(response, "count") == 0:
        return {"state": "not_found"}

    return compact(
        {
            "id": py_.get(response, "id"),
            "url": py_.get(response, "homepage_url"),
            "issn": py_.get(response, "issn_l"),
            "title": py_.get(response, "display_name"),
            "type": py_.get(response, "type"),
        }
    )
|
303
|
+
|
304
|
+
|
305
|
+
def get_files(meta) -> Optional[list]:
    """Return downloadable file links for the work, or None when absent."""
    pdf_url = py_.get(meta, "best_oa_location.pdf_url")
    if pdf_url is not None:
        return [{"mimeType": "application/pdf", "url": pdf_url}]
    return None
|
313
|
+
|
314
|
+
|
315
|
+
def get_container(meta: dict) -> dict:
    """Build the container (journal/repository) metadata for a work.

    Looks up the primary-location source via the OpenAlex sources API and
    maps its type and identifier into the commonmeta container shape.
    """
    source = get_openalex_source(py_.get(meta, "primary_location.source.id"))
    # Fix: removed stray debug print(source) left over from development.
    container_type = py_.get(source, "type")
    if container_type:
        container_type = OA_TO_CM_CONTAINER_TRANLATIONS.get(
            container_type, container_type
        )
    issn = py_.get(source, "issn")
    container_title = py_.get(source, "title")
    url_ = py_.get(source, "url")

    return compact(
        {
            "type": container_type,
            # Prefer the ISSN as identifier, fall back to the homepage URL.
            "identifier": issn or url_,
            "identifierType": "ISSN" if issn else "URL" if url_ else None,
            "title": container_title,
            "volume": py_.get(meta, "biblio.volume"),
            "issue": py_.get(meta, "biblio.issue"),
            "firstPage": py_.get(meta, "biblio.first_page"),
            "lastPage": py_.get(meta, "biblio.last_page"),
        }
    )
|
340
|
+
|
341
|
+
|
342
|
+
def from_openalex_funding(funding_references: list) -> list:
    """Map OpenAlex grants to commonmeta fundingReferences.

    Resolves funder ids via the OpenAlex funders API to attach ROR ids
    and names. Grants whose funder cannot be resolved are skipped.
    """
    funder_ids = [
        validate_openalex(funding.get("funder"))
        for funding in funding_references
        if "funder" in funding
    ]
    funders = get_openalex_funders(funder_ids)
    if not isinstance(funders, list):
        # get_openalex_funders returns a state dict on lookup failure;
        # treat that as "no funder metadata available".
        funders = []
    formatted_funding_references = []
    for funding in funding_references:
        # Bug fix: next() without a default raised StopIteration when no
        # matching funder was found; skip such grants instead.
        funder = next(
            (item for item in funders if item["id"] == funding.get("funder", None)),
            None,
        )
        if funder is None:
            continue
        f = compact(
            {
                "funderName": funder.get("name", None),
                "funderIdentifier": funder.get("ror", None),
                "funderIdentifierType": "ROR" if funder.get("ror", None) else None,
                "awardNumber": funding.get("award_id", None),
            }
        )
        formatted_funding_references.append(f)
    return py_.uniq(formatted_funding_references)
|
365
|
+
|
366
|
+
|
367
|
+
def get_random_id_from_openalex(number: int = 1, **kwargs) -> list:
    """Return up to 20 random OpenAlex work ids (empty list on failure)."""
    number = min(number, 20)  # cap the requested sample size
    url = openalex_api_sample_url(number, **kwargs)
    try:
        response = httpx.get(url, timeout=10)
        if response.status_code != 200:
            return []

        # Fix: removed stray debug print(items); wrap() guards against a
        # missing/None "results" key, which previously raised TypeError.
        items = py_.get(response.json(), "results")
        return [i.get("id") for i in wrap(items)]
    except (httpx.ReadTimeout, httpx.ConnectError):
        return []
|
commonmeta/readers/ris_reader.py
CHANGED
@@ -5,8 +5,7 @@
|
|
5
5
|
"description": "JSON representation of the Commonmeta schema.",
|
6
6
|
"commonmeta": {
|
7
7
|
"anyOf": [
|
8
|
-
{ "$ref": "#/definitions/commonmeta"
|
9
|
-
},
|
8
|
+
{ "$ref": "#/definitions/commonmeta" },
|
10
9
|
{
|
11
10
|
"type": "array",
|
12
11
|
"description": "An array of commonmeta objects.",
|
@@ -196,7 +195,13 @@
|
|
196
195
|
"type": {
|
197
196
|
"description": "The type of the description.",
|
198
197
|
"type": "string",
|
199
|
-
"enum": [
|
198
|
+
"enum": [
|
199
|
+
"Abstract",
|
200
|
+
"Summary",
|
201
|
+
"Methods",
|
202
|
+
"TechnicalInfo",
|
203
|
+
"Other"
|
204
|
+
]
|
200
205
|
},
|
201
206
|
"language": {
|
202
207
|
"description": "The language of the title. Use one of the language codes from the IETF BCP 47 standard.",
|
@@ -267,7 +272,9 @@
|
|
267
272
|
"items": { "$ref": "#/definitions/geoLocationPoint" },
|
268
273
|
"minItems": 4
|
269
274
|
},
|
270
|
-
"inPolygonPoint": {
|
275
|
+
"inPolygonPoint": {
|
276
|
+
"$ref": "#/definitions/geoLocationPoint"
|
277
|
+
}
|
271
278
|
},
|
272
279
|
"required": ["polygonPoints"]
|
273
280
|
},
|
@@ -294,6 +301,7 @@
|
|
294
301
|
"Handle",
|
295
302
|
"ISBN",
|
296
303
|
"ISSN",
|
304
|
+
"OpenAlex",
|
297
305
|
"PMID",
|
298
306
|
"PMCID",
|
299
307
|
"PURL",
|
@@ -323,7 +331,15 @@
|
|
323
331
|
"provider": {
|
324
332
|
"description": "The provider of the resource. This can be a DOI registration agency or a repository.",
|
325
333
|
"type": "string",
|
326
|
-
"enum": [
|
334
|
+
"enum": [
|
335
|
+
"Crossref",
|
336
|
+
"DataCite",
|
337
|
+
"GitHub",
|
338
|
+
"JaLC",
|
339
|
+
"KISTI",
|
340
|
+
"mEDRA",
|
341
|
+
"OP"
|
342
|
+
]
|
327
343
|
},
|
328
344
|
"publisher": {
|
329
345
|
"description": "The publisher of the resource.",
|
commonmeta/schema_utils.py
CHANGED
commonmeta/utils.py
CHANGED
@@ -1,22 +1,22 @@
|
|
1
1
|
"""Utils module for commonmeta-py"""
|
2
2
|
|
3
3
|
import os
|
4
|
-
import orjson as json
|
5
4
|
import re
|
6
5
|
import time
|
7
6
|
from typing import Optional
|
8
7
|
from urllib.parse import urlparse
|
9
|
-
|
10
|
-
from furl import furl
|
8
|
+
|
11
9
|
import bibtexparser
|
10
|
+
import orjson as json
|
11
|
+
import pycountry
|
12
|
+
import yaml
|
12
13
|
from bs4 import BeautifulSoup
|
14
|
+
from furl import furl
|
13
15
|
from pydash import py_
|
14
|
-
import pycountry
|
15
16
|
|
16
|
-
from .base_utils import
|
17
|
-
from .doi_utils import normalize_doi, doi_from_url, get_doi_ra, validate_doi, doi_as_url
|
17
|
+
from .base_utils import compact, parse_attributes, wrap
|
18
18
|
from .constants import DATACITE_CONTRIBUTOR_TYPES
|
19
|
-
|
19
|
+
from .doi_utils import doi_as_url, doi_from_url, get_doi_ra, normalize_doi, validate_doi
|
20
20
|
|
21
21
|
NORMALIZED_LICENSES = {
|
22
22
|
"https://creativecommons.org/licenses/by/1.0": "https://creativecommons.org/licenses/by/1.0/legalcode",
|
@@ -144,17 +144,13 @@ def normalize_id(pid: Optional[str], **kwargs) -> Optional[str]:
|
|
144
144
|
return doi
|
145
145
|
|
146
146
|
# check for valid HTTP uri and ensure https
|
147
|
-
|
148
|
-
if not
|
147
|
+
f = furl(pid)
|
148
|
+
if not f.host or f.scheme not in ["http", "https"]:
|
149
149
|
return None
|
150
|
-
if
|
151
|
-
|
150
|
+
if f.scheme == "http":
|
151
|
+
f.scheme = "https"
|
152
152
|
|
153
|
-
|
154
|
-
if pid.endswith("/"):
|
155
|
-
pid = pid.strip("/")
|
156
|
-
|
157
|
-
return pid
|
153
|
+
return f.url
|
158
154
|
|
159
155
|
|
160
156
|
def normalize_ids(ids: list, relation_type=None) -> list:
|
@@ -289,6 +285,115 @@ def validate_isni(isni: Optional[str]) -> Optional[str]:
|
|
289
285
|
return isni
|
290
286
|
|
291
287
|
|
288
|
+
def validate_mag(mag: Optional[str]) -> Optional[str]:
    """Validate Microsoft Academic Graph ID (mag)"""
    if not isinstance(mag, str):
        return None
    match = re.search(r"\A(\d{4,10})\Z", mag)
    return match.group(1) if match else None
|
299
|
+
|
300
|
+
|
301
|
+
def validate_openalex(openalex: Optional[str]) -> Optional[str]:
    """Validate OpenAlex ID"""
    if not isinstance(openalex, str):
        return None
    pattern = r"\A(?:(?:http|https)://openalex\.org/)?([AFIPSW]\d{8,10})\Z"
    match = re.search(pattern, openalex)
    return match.group(1) if match else None
|
312
|
+
|
313
|
+
|
314
|
+
def validate_pmid(pmid: Optional[str]) -> Optional[str]:
    """Validate PubMed ID (pmid)"""
    if not isinstance(pmid, str):
        return None
    pattern = r"\A(?:(?:http|https)://pubmed\.ncbi\.nlm\.nih\.gov/)?(\d{4,8})\Z"
    match = re.search(pattern, pmid)
    return match.group(1) if match else None
|
325
|
+
|
326
|
+
|
327
|
+
def validate_pmcid(pmcid: Optional[str]) -> Optional[str]:
    """Validate PubMed Central ID (pmcid)"""
    if not isinstance(pmcid, str):
        return None
    pattern = r"\A(?:(?:http|https)://www\.ncbi\.nlm\.nih\.gov/pmc/articles/)?(\d{4,8})\Z"
    match = re.search(pattern, pmcid)
    return match.group(1) if match else None
|
338
|
+
|
339
|
+
|
340
|
+
def validate_id(id: Optional[str]) -> tuple[Optional[str], Optional[str]]:
    """
    Validate an identifier and return the validated identifier and its type.

    Args:
        id: The identifier string to validate

    Returns:
        A tuple containing (validated_id, id_type) or (None, None) if invalid
    """
    # NOTE: `id` shadows the builtin; the name is kept for API compatibility.
    if id is None:
        return None, None

    # Order matters: more specific identifier formats are tried before the
    # generic URL fallback.
    # Check if it's a DOI
    doi = validate_doi(id)
    if doi:
        return normalize_doi(id), "DOI"

    # Check if it's an ORCID
    orcid = validate_orcid(id)
    if orcid:
        return normalize_orcid(id), "ORCID"

    # Check if it's a ROR
    ror = validate_ror(id)
    if ror:
        return normalize_ror(id), "ROR"

    # Check if it's an ISNI
    isni = validate_isni(id)
    if isni:
        return normalize_isni(id), "ISNI"

    # Check if it's an OpenAlex ID
    openalex = validate_openalex(id)
    if openalex:
        return f"https://openalex.org/{openalex}", "OpenAlex"

    # Check if it's a PubMed ID
    pmid = validate_pmid(id)
    if pmid:
        return f"https://pubmed.ncbi.nlm.nih.gov/{pmid}", "PMID"

    # Check if it's a PubMed Central ID
    pmcid = validate_pmcid(id)
    if pmcid:
        return f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}", "PMCID"

    # Check if it's a URL
    url_type = validate_url(id)
    if url_type:
        return normalize_url(id), url_type

    # No known valid identifier type was found
    return None, None
|
395
|
+
|
396
|
+
|
292
397
|
def normalize_isni(isni: Optional[str]) -> Optional[str]:
|
293
398
|
"""Normalize ISNI"""
|
294
399
|
if isni is None or not isinstance(isni, str):
|
commonmeta/writers/csl_writer.py
CHANGED
commonmeta/writers/ris_writer.py
CHANGED