commonmeta-py 0.100-py3-none-any.whl → 0.103-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- commonmeta/__init__.py +51 -50
- commonmeta/author_utils.py +7 -1
- commonmeta/base_utils.py +1 -0
- commonmeta/constants.py +35 -1
- commonmeta/crossref_utils.py +11 -8
- commonmeta/date_utils.py +1 -0
- commonmeta/doi_utils.py +42 -14
- commonmeta/metadata.py +209 -100
- commonmeta/readers/cff_reader.py +1 -0
- commonmeta/readers/codemeta_reader.py +1 -0
- commonmeta/readers/commonmeta_reader.py +1 -0
- commonmeta/readers/crossref_reader.py +19 -18
- commonmeta/readers/csl_reader.py +4 -1
- commonmeta/readers/inveniordm_reader.py +14 -9
- commonmeta/readers/json_feed_reader.py +9 -3
- commonmeta/readers/kbase_reader.py +1 -0
- commonmeta/readers/openalex_reader.py +380 -0
- commonmeta/readers/ris_reader.py +1 -0
- commonmeta/readers/schema_org_reader.py +2 -3
- commonmeta/schema_utils.py +1 -0
- commonmeta/utils.py +126 -63
- commonmeta/writers/bibtex_writer.py +1 -0
- commonmeta/writers/citation_writer.py +1 -0
- commonmeta/writers/crossref_xml_writer.py +1 -0
- commonmeta/writers/csl_writer.py +1 -0
- commonmeta/writers/datacite_writer.py +1 -0
- commonmeta/writers/ris_writer.py +1 -0
- commonmeta/writers/schema_org_writer.py +1 -0
- {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/METADATA +5 -8
- {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/RECORD +33 -32
- {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/licenses/LICENSE +1 -1
- {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/WHEEL +0 -0
- {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/entry_points.txt +0 -0
commonmeta/readers/openalex_reader.py
ADDED
@@ -0,0 +1,380 @@
+"""OpenAlex reader for commonmeta-py"""
+
+from typing import Optional
+
+import httpx
+from pydash import py_
+
+from ..author_utils import get_authors
+from ..base_utils import compact, presence, sanitize, wrap
+from ..constants import (
+    CR_TO_CM_TRANSLATIONS,
+    OA_TO_CM_CONTAINER_TRANLATIONS,
+    OA_TO_CM_TRANSLATIONS,
+    Commonmeta,
+)
+from ..doi_utils import (
+    normalize_doi,
+    openalex_api_sample_url,
+    openalex_api_url,
+)
+from ..utils import (
+    dict_to_spdx,
+    normalize_url,
+    validate_openalex,
+)
+
+# Map OpenAlex license strings to SPDX licenseId. May not be the correct license version.
+OA_LICENSES = {"cc-by": "CC-BY-4.0", "cc0": "CC0-1.0"}
+OA_IDENTIFIER_TYPES = {
+    "openalex": "OpenAlex",
+    "doi": "DOI",
+    "mag": "MAG",
+    "pmid": "PMID",
+    "pmcid": "PMCID",
+}
+
+
+def get_openalex(pid: str, **kwargs) -> dict:
+    """get_openalex"""
+    doi = normalize_doi(pid)
+    if doi is None:
+        return {"state": "not_found"}
+    url = openalex_api_url(doi)
+    response = httpx.get(url, timeout=10, **kwargs)
+    if response.status_code != 200:
+        return {"state": "not_found"}
+    return response.json() | {"via": "openalex"}
+
+
+def read_openalex(data: Optional[dict], **kwargs) -> Commonmeta:
+    """read_openalex"""
+    if data is None:
+        return {"state": "not_found"}
+    meta = data
+    read_options = kwargs or {}
+
+    doi = meta.get("doi", None)
+    _id = normalize_doi(doi)
+    _type = CR_TO_CM_TRANSLATIONS.get(meta.get("type_crossref", None)) or "Other"
+    additional_type = OA_TO_CM_TRANSLATIONS.get(meta.get("type", None))
+    if additional_type == _type:
+        additional_type = None
+
+    archive_locations = []
+    contributors = get_contributors(wrap(meta.get("authorships")))
+    contributors = get_authors(contributors)
+
+    url = normalize_url(
+        py_.get(meta, "primary_location.landing_page_url") or py_.get(meta, "id")
+    )
+    title = meta.get("title", None)
+    if title is not None:
+        titles = [{"title": sanitize(title)}]
+    else:
+        titles = None
+    publisher = compact(
+        {"name": py_.get(meta, "primary_location.source.host_organization_name")}
+    )
+    date = compact(
+        {
+            "published": py_.get(meta, "publication_date")
+            or py_.get(meta, "created_date")
+        }
+    )
+    identifiers = [
+        {
+            "identifier": uid,
+            "identifierType": OA_IDENTIFIER_TYPES[uidType],
+        }
+        for uidType, uid in (meta.get("ids", {})).items()
+    ]
+
+    license_ = py_.get(meta, "best_oa_location.license")
+    if license_ is not None:
+        license_ = OA_LICENSES.get(license_, license_)
+        license_ = dict_to_spdx({"id": license_})
+    container = get_container(meta)
+    relations = []
+    references = [
+        get_related(i) for i in get_references(meta.get("referenced_works", []))
+    ]
+    funding_references = from_openalex_funding(wrap(meta.get("grants", None)))
+
+    description = get_abstract(meta)
+    if description is not None:
+        descriptions = [{"description": sanitize(description), "type": "Abstract"}]
+    else:
+        descriptions = None
+
+    subjects = py_.uniq(
+        [
+            {"subject": py_.get(i, "subfield.display_name")}
+            for i in wrap(meta.get("topics", None))
+        ]
+    )
+    files = get_files(meta)
+
+    return {
+        # required properties
+        "id": _id,
+        "type": _type,
+        # recommended and optional properties
+        "additionalType": additional_type,
+        "archiveLocations": presence(archive_locations),
+        "container": presence(container),
+        "contributors": presence(contributors),
+        "date": presence(date),
+        "descriptions": presence(descriptions),
+        "files": presence(files),
+        "fundingReferences": presence(funding_references),
+        "geoLocations": None,
+        "identifiers": identifiers,
+        "language": meta.get("language", None),
+        "license": license_,
+        "provider": "OpenAlex",
+        "publisher": presence(publisher),
+        "references": presence(references),
+        "relations": presence(relations),
+        "subjects": presence(subjects),
+        "titles": presence(titles),
+        "url": url,
+        "version": meta.get("version", None),
+    } | read_options
+
+
+def get_abstract(meta):
+    """Parse abstract from OpenAlex abstract_inverted_index"""
+    abstract_inverted_index = py_.get(meta, "abstract_inverted_index")
+
+    if abstract_inverted_index:
+        # Determine the length of the abstract
+        max_pos = max(
+            p for positions in abstract_inverted_index.values() for p in positions
+        )
+        abstract_words = [""] * (max_pos + 1)
+
+        for word, positions in abstract_inverted_index.items():
+            for p in positions:
+                abstract_words[p] = word
+
+        abstract = " ".join(abstract_words)
+    else:
+        abstract = None
+    return abstract
+
+
+def get_contributors(contributors: list) -> list:
+    """Parse contributor"""
+
+    def parse_contributor(c):
+        affiliations = []
+        for affiliation in c.get("institutions", []):
+            affiliations.append(
+                compact(
+                    {
+                        "id": affiliation.get("ror", None),
+                        "name": affiliation.get("display_name", None),
+                    }
+                )
+            )
+
+        return compact(
+            {
+                "id": py_.get(c, "author.orcid"),
+                "name": py_.get(c, "author.display_name"),
+                "affiliations": affiliations,
+            }
+        )
+
+    return [parse_contributor(i) for i in contributors]
+
+
+def get_references(pids: list, **kwargs) -> list:
+    """Get related articles from OpenAlex using their pid.
+    Used for retrieving metadata for citations and references which are not included in the OpenAlex record.
+    """
+    references = get_openalex_works(pids)
+    return references
+
+
+def get_citations(citation_url: str, **kwargs) -> list:
+    response = httpx.get(citation_url, timeout=10, **kwargs)
+    if response.status_code != 200:
+        return {"state": "not_found"}
+    response = response.json()
+    return response.get("results", [])
+
+
+def get_related(related: Optional[dict]) -> Optional[dict]:
+    """Get reference from OpenAlex reference"""
+    if related is None or not isinstance(related, dict):
+        return None
+    doi = related.get("doi", None)
+    metadata = {
+        "id": normalize_doi(doi) if doi else None,
+        "contributor": related.get("author", None),
+        "title": related.get("display_name", None),
+        "publisher": py_.get(
+            related, "primary_location.source.host_organization_name"
+        ),
+        "publicationYear": related.get("publication_year", None),
+        "volume": py_.get(related, "biblio.volume"),
+        "issue": py_.get(related, "biblio.issue"),
+        "firstPage": py_.get(related, "biblio.first_page"),
+        "lastPage": py_.get(related, "biblio.last_page"),
+        "containerTitle": py_.get(related, "primary_location.source.display_name"),
+    }
+    return compact(metadata)
+
+
+def get_openalex_works(pids: list, **kwargs) -> list:
+    """Get OpenAlex works, use batches of 49 to honor API limit."""
+    pid_batches = [pids[i : i + 49] for i in range(0, len(pids), 49)]
+    works = []
+    for pid_batch in pid_batches:
+        ids = "|".join(pid_batch)
+        url = f"https://api.openalex.org/works?filter=ids.openalex:{ids}"
+        response = httpx.get(url, timeout=10, **kwargs)
+        if response.status_code != 200:
+            return {"state": "not_found"}
+        response = response.json()
+        if py_.get(response, "count") == 0:
+            return {"state": "not_found"}
+
+        works.extend(response.get("results"))
+
+    return works
+
+
+def get_openalex_funders(pids: list, **kwargs) -> list:
+    """Get ROR id and name from OpenAlex funders.
+    use batches of 49 to honor API limit."""
+    pid_batches = [pids[i : i + 49] for i in range(0, len(pids), 49)]
+    funders = []
+    for pid_batch in pid_batches:
+        ids = "|".join(pid_batch)
+        url = f"https://api.openalex.org/funders?filter=ids.openalex:{ids}"
+        response = httpx.get(url, timeout=10, **kwargs)
+        if response.status_code != 200:
+            return {"state": "not_found"}
+        response = response.json()
+        if py_.get(response, "count") == 0:
+            return {"state": "not_found"}
+
+        def format_funder(funder):
+            return compact(
+                {
+                    "id": py_.get(funder, "id"),
+                    "ror": py_.get(funder, "ids.ror"),
+                    "name": py_.get(funder, "display_name"),
+                }
+            )
+
+        f = [format_funder(i) for i in response.get("results")]
+        funders.extend(f)
+
+    return funders
+
+
+def get_openalex_source(str: Optional[str], **kwargs) -> Optional[dict]:
+    """Get issn, name, homepage_url and type from OpenAlex source."""
+    id = validate_openalex(str)
+    if not id:
+        return None
+
+    url = f"https://api.openalex.org/sources/{id}"
+    response = httpx.get(url, timeout=10, **kwargs)
+    if response.status_code != 200:
+        return {"state": "not_found"}
+    response = response.json()
+    if py_.get(response, "count") == 0:
+        return {"state": "not_found"}
+
+    return compact(
+        {
+            "id": py_.get(response, "id"),
+            "url": py_.get(response, "homepage_url"),
+            "issn": py_.get(response, "issn_l"),
+            "title": py_.get(response, "display_name"),
+            "type": py_.get(response, "type"),
+        }
+    )
+
+
+def get_files(meta) -> Optional[list]:
+    """get file links"""
+    pdf_url = py_.get(meta, "best_oa_location.pdf_url")
+    if pdf_url is None:
+        return None
+    return [
+        {"mimeType": "application/pdf", "url": pdf_url},
+    ]
+
+
+def get_container(meta: dict) -> dict:
+    """Get container from OpenAlex"""
+    source = get_openalex_source(py_.get(meta, "primary_location.source.id"))
+    print(source)
+    container_type = py_.get(source, "type")
+    if container_type:
+        container_type = OA_TO_CM_CONTAINER_TRANLATIONS.get(
+            container_type, container_type
+        )
+    issn = py_.get(source, "issn")
+    container_title = py_.get(source, "title")
+    url_ = py_.get(source, "url")
+
+    return compact(
+        {
+            "type": container_type,
+            "identifier": issn or url_,
+            "identifierType": "ISSN" if issn else "URL" if url_ else None,
+            "title": container_title,
+            "volume": py_.get(meta, "biblio.volume"),
+            "issue": py_.get(meta, "biblio.issue"),
+            "firstPage": py_.get(meta, "biblio.first_page"),
+            "lastPage": py_.get(meta, "biblio.last_page"),
+        }
+    )
+
+
+def from_openalex_funding(funding_references: list) -> list:
+    """Get funding references from OpenAlex"""
+    funder_ids = [
+        validate_openalex(funding.get("funder"))
+        for funding in funding_references
+        if "funder" in funding
+    ]
+    funders = get_openalex_funders(funder_ids)
+    formatted_funding_references = []
+    for funding in funding_references:
+        funder = next(
+            item for item in funders if item["id"] == funding.get("funder", None)
+        )
+        f = compact(
+            {
+                "funderName": funder.get("name", None),
+                "funderIdentifier": funder.get("ror", None),
+                "funderIdentifierType": "ROR" if funder.get("ror", None) else None,
+                "awardNumber": funding.get("award_id", None),
+            }
+        )
+        formatted_funding_references.append(f)
+    return py_.uniq(formatted_funding_references)
+
+
+def get_random_id_from_openalex(number: int = 1, **kwargs) -> list:
+    """Get random ID from OpenAlex"""
+    number = min(number, 20)
+    url = openalex_api_sample_url(number, **kwargs)
+    try:
+        response = httpx.get(url, timeout=10)
+        if response.status_code != 200:
+            return []
+
+        items = py_.get(response.json(), "results")
+        print(items)
+        return [i.get("id") for i in items]
+    except (httpx.ReadTimeout, httpx.ConnectError):
+        return []
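The new reader follows the same fetch-then-parse split as the other commonmeta readers: get_openalex retrieves the raw record, read_openalex maps it to commonmeta metadata. A minimal usage sketch (the DOI is a placeholder, and get_openalex performs a live HTTP request against the OpenAlex API):

from commonmeta.readers.openalex_reader import get_openalex, read_openalex

# Fetch the OpenAlex record for a DOI, then map it to commonmeta metadata.
data = get_openalex("https://doi.org/10.7554/elife.01567")
if data.get("state") != "not_found":
    metadata = read_openalex(data)
    print(metadata["id"], metadata["type"])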
commonmeta/readers/ris_reader.py
CHANGED
commonmeta/readers/schema_org_reader.py
CHANGED
@@ -410,9 +410,8 @@ def get_html_meta(soup):
     pid = pid.get("content", None) or pid.get("href", None)
     data["@id"] = normalize_id(pid)

-    _type = (
-        soup.select_one("meta[name='dc.type']")
-        or soup.select_one("meta[name='DC.type']")
+    _type = soup.select_one("meta[name='dc.type']") or soup.select_one(
+        "meta[name='DC.type']"
     )
     data["@type"] = _type["content"].capitalize() if _type else None
    if _type is None:
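The change above is a pure reformat; both selectors remain because CSS attribute values are matched case-sensitively, so pages using either spelling of the Dublin Core meta tag are covered. A small sketch (the HTML snippet is illustrative):

from bs4 import BeautifulSoup

# Attribute values in CSS selectors are case-sensitive, so both
# "dc.type" and "DC.type" must be tried.
html = '<meta name="DC.type" content="article">'
soup = BeautifulSoup(html, "html.parser")
_type = soup.select_one("meta[name='dc.type']") or soup.select_one(
    "meta[name='DC.type']"
)
print(_type["content"])  # article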
commonmeta/schema_utils.py
CHANGED
commonmeta/utils.py
CHANGED
@@ -1,22 +1,22 @@
 """Utils module for commonmeta-py"""

 import os
-import orjson as json
 import re
 import time
 from typing import Optional
 from urllib.parse import urlparse
-
-from furl import furl
+
 import bibtexparser
+import orjson as json
+import pycountry
+import yaml
 from bs4 import BeautifulSoup
+from furl import furl
 from pydash import py_
-import pycountry

-from .base_utils import
-from .doi_utils import normalize_doi, doi_from_url, get_doi_ra, validate_doi, doi_as_url
+from .base_utils import compact, parse_attributes, wrap
 from .constants import DATACITE_CONTRIBUTOR_TYPES
-
+from .doi_utils import doi_as_url, doi_from_url, get_doi_ra, normalize_doi, validate_doi

 NORMALIZED_LICENSES = {
     "https://creativecommons.org/licenses/by/1.0": "https://creativecommons.org/licenses/by/1.0/legalcode",
@@ -144,17 +144,13 @@ def normalize_id(pid: Optional[str], **kwargs) -> Optional[str]:
         return doi

     # check for valid HTTP uri and ensure https
-    uri = urlparse(pid)
-    if not uri.netloc or uri.scheme not in ["http", "https"]:
+    f = furl(pid)
+    if not f.host or f.scheme not in ["http", "https"]:
         return None
-    if uri.scheme == "http":
-        pid = pid.replace("http://", "https://")
+    if f.scheme == "http":
+        f.scheme = "https"

-
-    if pid.endswith("/"):
-        pid = pid.strip("/")
-
-    return pid
+    return f.url


 def normalize_ids(ids: list, relation_type=None) -> list:
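The rewritten check delegates URL parsing to furl instead of urlparse, and no longer strips trailing slashes. A standalone sketch of the new logic (the function name is ours, not the library's):

from furl import furl

def ensure_https(pid):
    # Require an http(s) URL with a host; upgrade http to https.
    f = furl(pid)
    if not f.host or f.scheme not in ["http", "https"]:
        return None
    if f.scheme == "http":
        f.scheme = "https"
    return f.url

print(ensure_https("http://example.org/abc"))  # https://example.org/abc
print(ensure_https("ftp://example.org/abc"))   # None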
@@ -190,8 +186,6 @@ def normalize_url(
     if url is None or not isinstance(url, str):
         return None
     url = url.strip()
-    if url.endswith("/"):
-        url = url.strip("/")
     scheme = urlparse(url).scheme
     if not scheme or scheme not in ["http", "https"]:
         return None
@@ -202,55 +196,13 @@ def normalize_url(
     return url


-# def normalize_url(url: Optional[str], secure=False, fragments=False, lower=False) -> Optional[str]:
-#     """Normalize URL"""
-#     if url is None or not isinstance(url, str):
-#         return None
-#     try:
-#         f = furl(url.strip())
-#         f.path.normalize()
-
-#         # only allow http and https schemes
-#         if f.scheme not in ["http", "https"]:
-#             return None
-#         if secure and f.scheme == "http":
-#             f.set(scheme="https")
-
-#         # remove index.html
-#         if f.path.segments and f.path.segments[-1] in ["index.html"]:
-#             f.path.segments.pop(-1)
-
-#         # remove fragments
-#         if fragments:
-#             f.remove(fragment=True)
-
-#         # remove specific query parameters
-#         f.remove(
-#             [
-#                 "origin",
-#                 "ref",
-#                 "referrer",
-#                 "source",
-#                 "utm_content",
-#                 "utm_medium",
-#                 "utm_campaign",
-#                 "utm_source",
-#             ]
-#         )
-
-#         if lower:
-#             return f.url.lower().strip("/")
-#         return f.url.strip("/")
-#     except ValueError:
-#         print(f"Error normalizing url {url}")
-#         return None
-
-
 def normalize_cc_url(url: Optional[str]):
     """Normalize Creative Commons URL"""
     if url is None or not isinstance(url, str):
         return None
     url = normalize_url(url, secure=True)
+    if url and url.endswith("/"):
+        url = url.strip("/")
     return NORMALIZED_LICENSES.get(url, url)

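The trailing-slash handling removed from normalize_url reappears here because the NORMALIZED_LICENSES table is keyed by slash-less URLs. A small sketch of the normalization it protects (the URL is illustrative):

url = "https://creativecommons.org/licenses/by/4.0/"
# Strip the trailing slash so the license table lookup can match.
if url.endswith("/"):
    url = url.strip("/")
print(url)  # https://creativecommons.org/licenses/by/4.0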
@@ -333,6 +285,115 @@ def validate_isni(isni: Optional[str]) -> Optional[str]:
     return isni


+def validate_mag(mag: Optional[str]) -> Optional[str]:
+    """Validate Microsoft Academic Graph ID (mag)"""
+    if mag is None or not isinstance(mag, str):
+        return None
+    match = re.search(
+        r"\A(\d{4,10})\Z",
+        mag,
+    )
+    if match is None:
+        return None
+    return match.group(1)
+
+
+def validate_openalex(openalex: Optional[str]) -> Optional[str]:
+    """Validate OpenAlex ID"""
+    if openalex is None or not isinstance(openalex, str):
+        return None
+    match = re.search(
+        r"\A(?:(?:http|https)://openalex\.org/)?([AFIPSW]\d{8,10})\Z",
+        openalex,
+    )
+    if match is None:
+        return None
+    return match.group(1)
+
+
+def validate_pmid(pmid: Optional[str]) -> Optional[str]:
+    """Validate PubMed ID (pmid)"""
+    if pmid is None or not isinstance(pmid, str):
+        return None
+    match = re.search(
+        r"\A(?:(?:http|https)://pubmed\.ncbi\.nlm\.nih\.gov/)?(\d{4,8})\Z",
+        pmid,
+    )
+    if match is None:
+        return None
+    return match.group(1)
+
+
+def validate_pmcid(pmcid: Optional[str]) -> Optional[str]:
+    """Validate PubMed Central ID (pmcid)"""
+    if pmcid is None or not isinstance(pmcid, str):
+        return None
+    match = re.search(
+        r"\A(?:(?:http|https)://www\.ncbi\.nlm\.nih\.gov/pmc/articles/)?(\d{4,8})\Z",
+        pmcid,
+    )
+    if match is None:
+        return None
+    return match.group(1)
+
+
+def validate_id(id: Optional[str]) -> tuple[Optional[str], Optional[str]]:
+    """
+    Validate an identifier and return the validated identifier and its type.
+
+    Args:
+        id: The identifier string to validate
+
+    Returns:
+        A tuple containing (validated_id, id_type) or (None, None) if invalid
+    """
+    if id is None:
+        return None, None
+
+    # Check if it's a DOI
+    doi = validate_doi(id)
+    if doi:
+        return normalize_doi(id), "DOI"
+
+    # Check if it's an ORCID
+    orcid = validate_orcid(id)
+    if orcid:
+        return normalize_orcid(id), "ORCID"
+
+    # Check if it's a ROR
+    ror = validate_ror(id)
+    if ror:
+        return normalize_ror(id), "ROR"
+
+    # Check if it's an ISNI
+    isni = validate_isni(id)
+    if isni:
+        return normalize_isni(id), "ISNI"
+
+    # Check if it's an OpenAlex ID
+    openalex = validate_openalex(id)
+    if openalex:
+        return f"https://openalex.org/{openalex}", "OpenAlex"
+
+    # Check if it's a PubMed ID
+    pmid = validate_pmid(id)
+    if pmid:
+        return f"https://pubmed.ncbi.nlm.nih.gov/{pmid}", "PMID"
+
+    # Check if it's a PubMed Central ID
+    pmcid = validate_pmcid(id)
+    if pmcid:
+        return f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}", "PMCID"
+
+    # Check if it's a URL
+    url_type = validate_url(id)
+    if url_type:
+        return normalize_url(id), url_type
+
+    # No known valid identifier type was found
+    return None, None
+
+
 def normalize_isni(isni: Optional[str]) -> Optional[str]:
     """Normalize ISNI"""
     if isni is None or not isinstance(isni, str):
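The new validators share one pattern: accept either the bare identifier or its canonical URL form, and return the bare identifier. A quick check of the OpenAlex regex (the ID values are illustrative):

import re

OPENALEX_PATTERN = r"\A(?:(?:http|https)://openalex\.org/)?([AFIPSW]\d{8,10})\Z"
for value in ["W2741809807", "https://openalex.org/W2741809807", "X123"]:
    match = re.search(OPENALEX_PATTERN, value)
    print(value, "->", match.group(1) if match else None)
# W2741809807 -> W2741809807
# https://openalex.org/W2741809807 -> W2741809807
# X123 -> None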
@@ -1129,7 +1190,9 @@ def replace_curie(string: Optional[str]) -> Optional[str]:
     if string is None:
         return None
     match = re.sub(
-        r"((?:doi|DOI):\s?([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))",
+        r"((?:doi|DOI):\s?([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))",
+        r"https://doi.org/\2",
+        string,
     )
     if match is None:
         return None
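With the replacement string and input now passed to re.sub, DOI CURIEs in free text are expanded to doi.org URLs; capture group 2 holds the bare DOI. A sketch using the example DOI prefix 10.5555:

import re

pattern = r"((?:doi|DOI):\s?([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))"
text = "See doi:10.5555/12345678 for details."
print(re.sub(pattern, r"https://doi.org/\2", text))
# See https://doi.org/10.5555/12345678 for details.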
commonmeta/writers/csl_writer.py
CHANGED