commonmeta-py 0.23__py3-none-any.whl → 0.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- commonmeta/__init__.py +96 -0
- commonmeta/api_utils.py +77 -0
- commonmeta/author_utils.py +260 -0
- commonmeta/base_utils.py +121 -0
- commonmeta/cli.py +200 -0
- commonmeta/constants.py +587 -0
- commonmeta/crossref_utils.py +575 -0
- commonmeta/date_utils.py +193 -0
- commonmeta/doi_utils.py +273 -0
- commonmeta/metadata.py +320 -0
- commonmeta/readers/__init__.py +1 -0
- commonmeta/readers/cff_reader.py +199 -0
- commonmeta/readers/codemeta_reader.py +112 -0
- commonmeta/readers/commonmeta_reader.py +13 -0
- commonmeta/readers/crossref_reader.py +409 -0
- commonmeta/readers/crossref_xml_reader.py +505 -0
- commonmeta/readers/csl_reader.py +98 -0
- commonmeta/readers/datacite_reader.py +390 -0
- commonmeta/readers/datacite_xml_reader.py +359 -0
- commonmeta/readers/inveniordm_reader.py +218 -0
- commonmeta/readers/json_feed_reader.py +420 -0
- commonmeta/readers/kbase_reader.py +205 -0
- commonmeta/readers/ris_reader.py +103 -0
- commonmeta/readers/schema_org_reader.py +506 -0
- commonmeta/resources/cff_v1.2.0.json +1827 -0
- commonmeta/resources/commonmeta_v0.12.json +601 -0
- commonmeta/resources/commonmeta_v0.13.json +559 -0
- commonmeta/resources/commonmeta_v0.14.json +573 -0
- commonmeta/resources/crossref/AccessIndicators.xsd +47 -0
- commonmeta/resources/crossref/JATS-journalpublishing1-3d2-mathml3-elements.xsd +10130 -0
- commonmeta/resources/crossref/JATS-journalpublishing1-3d2-mathml3.xsd +48 -0
- commonmeta/resources/crossref/JATS-journalpublishing1-elements.xsd +8705 -0
- commonmeta/resources/crossref/JATS-journalpublishing1-mathml3-elements.xsd +8608 -0
- commonmeta/resources/crossref/JATS-journalpublishing1-mathml3.xsd +49 -0
- commonmeta/resources/crossref/JATS-journalpublishing1.xsd +6176 -0
- commonmeta/resources/crossref/clinicaltrials.xsd +61 -0
- commonmeta/resources/crossref/common5.3.1.xsd +1538 -0
- commonmeta/resources/crossref/crossref5.3.1.xsd +1949 -0
- commonmeta/resources/crossref/crossref_query_output3.0.xsd +1097 -0
- commonmeta/resources/crossref/fundref.xsd +49 -0
- commonmeta/resources/crossref/module-ali.xsd +39 -0
- commonmeta/resources/crossref/relations.xsd +444 -0
- commonmeta/resources/crossref-v0.2.json +60 -0
- commonmeta/resources/csl-data.json +538 -0
- commonmeta/resources/datacite-v4.5.json +829 -0
- commonmeta/resources/datacite-v4.5pr.json +608 -0
- commonmeta/resources/ietf-bcp-47.json +3025 -0
- commonmeta/resources/iso-8601.json +3182 -0
- commonmeta/resources/spdx/licenses.json +4851 -0
- commonmeta/resources/spdx-schema..json +903 -0
- commonmeta/resources/styles/apa.csl +1697 -0
- commonmeta/resources/styles/chicago-author-date.csl +684 -0
- commonmeta/resources/styles/harvard-cite-them-right.csl +321 -0
- commonmeta/resources/styles/ieee.csl +468 -0
- commonmeta/resources/styles/modern-language-association.csl +341 -0
- commonmeta/resources/styles/vancouver.csl +376 -0
- commonmeta/schema_utils.py +27 -0
- commonmeta/translators.py +47 -0
- commonmeta/utils.py +1108 -0
- commonmeta/writers/__init__.py +1 -0
- commonmeta/writers/bibtex_writer.py +149 -0
- commonmeta/writers/citation_writer.py +70 -0
- commonmeta/writers/commonmeta_writer.py +68 -0
- commonmeta/writers/crossref_xml_writer.py +17 -0
- commonmeta/writers/csl_writer.py +79 -0
- commonmeta/writers/datacite_writer.py +193 -0
- commonmeta/writers/inveniordm_writer.py +94 -0
- commonmeta/writers/ris_writer.py +58 -0
- commonmeta/writers/schema_org_writer.py +146 -0
- {commonmeta_py-0.23.dist-info → commonmeta_py-0.25.dist-info}/METADATA +56 -45
- commonmeta_py-0.25.dist-info/RECORD +75 -0
- {commonmeta_py-0.23.dist-info → commonmeta_py-0.25.dist-info}/WHEEL +1 -1
- commonmeta_py-0.25.dist-info/entry_points.txt +3 -0
- commonmeta_py-0.23.dist-info/RECORD +0 -5
- /commonmeta_py/__init__.py → /commonmeta/readers/bibtex_reader.py +0 -0
- {commonmeta_py-0.23.dist-info/licenses → commonmeta_py-0.25.dist-info}/LICENSE +0 -0
commonmeta/utils.py
ADDED
@@ -0,0 +1,1108 @@
|
|
1
|
+
"""Utils module for commonmeta-py"""
|
2
|
+
|
3
|
+
import os
|
4
|
+
import orjson as json
|
5
|
+
import re
|
6
|
+
import time
|
7
|
+
from typing import Optional
|
8
|
+
from urllib.parse import urlparse
|
9
|
+
import yaml
|
10
|
+
from furl import furl
|
11
|
+
import bibtexparser
|
12
|
+
from bs4 import BeautifulSoup
|
13
|
+
from pydash import py_
|
14
|
+
import base32_lib as base32
|
15
|
+
import pycountry
|
16
|
+
|
17
|
+
from .base_utils import wrap, compact, parse_attributes
|
18
|
+
from .doi_utils import normalize_doi, doi_from_url, get_doi_ra, validate_doi, doi_as_url
|
19
|
+
from .constants import DATACITE_CONTRIBUTOR_TYPES
|
20
|
+
|
21
|
+
|
22
|
+
# Maps a Creative Commons license URL (https scheme, no trailing slash) to the
# URL it is normalized to. Looked up by normalize_cc_url().
NORMALIZED_LICENSES = {
    "https://creativecommons.org/licenses/by/1.0": "https://creativecommons.org/licenses/by/1.0/legalcode",
    "https://creativecommons.org/licenses/by/2.0": "https://creativecommons.org/licenses/by/2.0/legalcode",
    "https://creativecommons.org/licenses/by/2.5": "https://creativecommons.org/licenses/by/2.5/legalcode",
    "https://creativecommons.org/licenses/by/3.0": "https://creativecommons.org/licenses/by/3.0/legalcode",
    "https://creativecommons.org/licenses/by/3.0/us": "https://creativecommons.org/licenses/by/3.0/legalcode",
    "https://creativecommons.org/licenses/by/4.0": "https://creativecommons.org/licenses/by/4.0/legalcode",
    "https://creativecommons.org/licenses/by-nc/1.0": "https://creativecommons.org/licenses/by-nc/1.0/legalcode",
    "https://creativecommons.org/licenses/by-nc/2.0": "https://creativecommons.org/licenses/by-nc/2.0/legalcode",
    "https://creativecommons.org/licenses/by-nc/2.5": "https://creativecommons.org/licenses/by-nc/2.5/legalcode",
    "https://creativecommons.org/licenses/by-nc/3.0": "https://creativecommons.org/licenses/by-nc/3.0/legalcode",
    "https://creativecommons.org/licenses/by-nc/4.0": "https://creativecommons.org/licenses/by-nc/4.0/legalcode",
    "https://creativecommons.org/licenses/by-nd-nc/1.0": "https://creativecommons.org/licenses/by-nd-nc/1.0/legalcode",
    "https://creativecommons.org/licenses/by-nd-nc/2.0": "https://creativecommons.org/licenses/by-nd-nc/2.0/legalcode",
    "https://creativecommons.org/licenses/by-nd-nc/2.5": "https://creativecommons.org/licenses/by-nd-nc/2.5/legalcode",
    "https://creativecommons.org/licenses/by-nd-nc/3.0": "https://creativecommons.org/licenses/by-nd-nc/3.0/legalcode",
    "https://creativecommons.org/licenses/by-nd-nc/4.0": "https://creativecommons.org/licenses/by-nd-nc/4.0/legalcode",
    "https://creativecommons.org/licenses/by-nc-sa/1.0": "https://creativecommons.org/licenses/by-nc-sa/1.0/legalcode",
    "https://creativecommons.org/licenses/by-nc-sa/2.0": "https://creativecommons.org/licenses/by-nc-sa/2.0/legalcode",
    "https://creativecommons.org/licenses/by-nc-sa/2.5": "https://creativecommons.org/licenses/by-nc-sa/2.5/legalcode",
    "https://creativecommons.org/licenses/by-nc-sa/3.0": "https://creativecommons.org/licenses/by-nc-sa/3.0/legalcode",
    "https://creativecommons.org/licenses/by-nc-sa/3.0/us": "https://creativecommons.org/licenses/by-nc-sa/3.0/legalcode",
    "https://creativecommons.org/licenses/by-nc-sa/4.0": "https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode",
    "https://creativecommons.org/licenses/by-nd/1.0": "https://creativecommons.org/licenses/by-nd/1.0/legalcode",
    "https://creativecommons.org/licenses/by-nd/2.0": "https://creativecommons.org/licenses/by-nd/2.0/legalcode",
    "https://creativecommons.org/licenses/by-nd/2.5": "https://creativecommons.org/licenses/by-nd/2.5/legalcode",
    "https://creativecommons.org/licenses/by-nd/3.0": "https://creativecommons.org/licenses/by-nd/3.0/legalcode",
    # was mapped to the 2.0 legalcode by mistake
    "https://creativecommons.org/licenses/by-nd/4.0": "https://creativecommons.org/licenses/by-nd/4.0/legalcode",
    "https://creativecommons.org/licenses/by-sa/1.0": "https://creativecommons.org/licenses/by-sa/1.0/legalcode",
    "https://creativecommons.org/licenses/by-sa/2.0": "https://creativecommons.org/licenses/by-sa/2.0/legalcode",
    "https://creativecommons.org/licenses/by-sa/2.5": "https://creativecommons.org/licenses/by-sa/2.5/legalcode",
    "https://creativecommons.org/licenses/by-sa/3.0": "https://creativecommons.org/licenses/by-sa/3.0/legalcode",
    "https://creativecommons.org/licenses/by-sa/4.0": "https://creativecommons.org/licenses/by-sa/4.0/legalcode",
    "https://creativecommons.org/licenses/by-nc-nd/1.0": "https://creativecommons.org/licenses/by-nc-nd/1.0/legalcode",
    "https://creativecommons.org/licenses/by-nc-nd/2.0": "https://creativecommons.org/licenses/by-nc-nd/2.0/legalcode",
    "https://creativecommons.org/licenses/by-nc-nd/2.5": "https://creativecommons.org/licenses/by-nc-nd/2.5/legalcode",
    "https://creativecommons.org/licenses/by-nc-nd/3.0": "https://creativecommons.org/licenses/by-nc-nd/3.0/legalcode",
    "https://creativecommons.org/licenses/by-nc-nd/4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode",
    "https://creativecommons.org/licenses/publicdomain": "https://creativecommons.org/licenses/publicdomain/",
    "https://creativecommons.org/publicdomain/zero/1.0": "https://creativecommons.org/publicdomain/zero/1.0/legalcode",
}
|
63
|
+
|
64
|
+
# Placeholder codes (each starting with ":") mapped to a short human-readable
# explanation of what the code stands for.
UNKNOWN_INFORMATION = {
    ":unac": "temporarily inaccessible",
    ":unal": "unallowed, suppressed intentionally",
    ":unap": "not applicable, makes no sense",
    ":unas": "value unassigned (e.g., Untitled)",
    ":unav": "value unavailable, possibly unknown",
    ":unkn": "known to be unknown (e.g., Anonymous, Inconnue)",
    ":none": "never had a value, never will",
    ":null": "explicitly and meaningfully empty",
    ":tba": "to be assigned or announced later",
    ":etal": "too numerous to list (et alia)",
}

# URL scheme prefixes; used when upgrading http:// URLs to https://.
HTTP_SCHEME = "http://"
HTTPS_SCHEME = "https://"
|
79
|
+
|
80
|
+
|
81
|
+
def normalize_id(pid: Optional[str], **kwargs) -> Optional[str]:
    """Return *pid* as a normalized DOI or HTTPS URL, or None if it is neither.

    Args:
        pid: Identifier to normalize; bytes/bytearray input is decoded first.
        **kwargs: Passed through unchanged to normalize_doi().

    Returns:
        The result of normalize_doi() when that is not None. Otherwise, for
        an http(s) URL with a network location, the URL with a leading
        ``http://`` upgraded to ``https://`` and trailing slashes removed.
        None for everything else.
    """
    if pid is None:
        return None

    # accept bytes-like input
    if isinstance(pid, (bytes, bytearray)):
        pid = pid.decode()

    # a valid DOI takes precedence over generic URL handling
    doi = normalize_doi(pid, **kwargs)
    if doi is not None:
        return doi

    # check for valid HTTP uri and ensure https
    uri = urlparse(pid)
    if not uri.netloc or uri.scheme not in ["http", "https"]:
        return None
    if uri.scheme == "http" and pid.startswith(HTTP_SCHEME):
        # Upgrade only the leading scheme: an "http://" that appears later in
        # the string (e.g. inside a query parameter) must stay untouched.
        pid = HTTPS_SCHEME + pid[len(HTTP_SCHEME):]

    # remove trailing slash(es)
    return pid.rstrip("/")
|
107
|
+
|
108
|
+
|
109
|
+
def normalize_ids(ids: list, relation_type=None) -> list:
    """Turn a list of ``{"id": ...}`` dicts into related-identifier dicts.

    Args:
        ids: Dicts that may carry an ``id`` key.
        relation_type: Value stored under ``relationType`` in every result.

    Returns:
        One entry per input item, in input order: the compact()-ed dict with
        ``relatedIdentifier``, ``relationType`` and ``relatedIdentifierType``
        ("DOI" when doi_from_url() finds a DOI, else "URL"), or None for
        items without a truthy ``id``.
    """

    def format_id(i):
        if not i.get("id", None):
            return None
        idn = normalize_id(i["id"])
        doi = doi_from_url(idn)
        return compact(
            {
                # prefer the bare DOI over the normalized URL form
                "relatedIdentifier": doi or idn,
                "relationType": relation_type,
                "relatedIdentifierType": "DOI" if doi is not None else "URL",
            }
        )

    return [format_id(i) for i in ids]
|
133
|
+
|
134
|
+
|
135
|
+
def normalize_url(url: Optional[str], secure=False, lower=False) -> Optional[str]:
    """Normalize a URL string.

    Args:
        url: URL to normalize; anything that is not a str yields None.
        secure: When True, upgrade a leading ``http://`` to ``https://``.
        lower: When True, lowercase the whole result.

    Returns:
        The URL without trailing slashes (leading characters are preserved),
        optionally upgraded to https and lowercased; None for non-str input.
    """
    if url is None or not isinstance(url, str):
        return None
    # only trailing slashes are removed; a leading "/" is part of the value
    url = url.rstrip("/")
    if secure is True and url.startswith(HTTP_SCHEME):
        # swap the prefix only, so an "http://" embedded later is kept
        url = HTTPS_SCHEME + url[len(HTTP_SCHEME):]
    if lower is True:
        return url.lower()
    return url
|
146
|
+
|
147
|
+
|
148
|
+
def normalize_cc_url(url: Optional[str]):
    """Normalize a Creative Commons URL.

    The URL is first passed through normalize_url() with ``secure=True`` and
    then looked up in NORMALIZED_LICENSES; when there is no entry, the
    normalized URL itself is returned. Non-str input yields None.
    """
    if not isinstance(url, str):
        return None
    secure_url = normalize_url(url, secure=True)
    if secure_url in NORMALIZED_LICENSES:
        return NORMALIZED_LICENSES[secure_url]
    return secure_url
|
154
|
+
|
155
|
+
|
156
|
+
def normalize_ror(ror: Optional[str]) -> Optional[str]:
    """Return the ROR ID as a ``https://ror.org/`` URL.

    Returns None when validate_ror() rejects the input.
    """
    ror_id = validate_ror(ror)
    # turn the bare ROR ID into a URL
    return None if ror_id is None else "https://ror.org/" + ror_id
|
164
|
+
|
165
|
+
|
166
|
+
def validate_ror(ror: Optional[str]) -> Optional[str]:
    """Extract the bare ROR ID from a ROR ID or ``http(s)://ror.org/`` URL.

    Returns the nine-character ID (seven characters from ``[0-9a-z]``
    followed by two digits), or None when the input is not a str or does
    not match.
    """
    if not isinstance(ror, str):
        return None
    found = re.search(
        r"\A(?:(?:http|https)://ror\.org/)?([0-9a-z]{7}\d{2})\Z",
        ror,
    )
    # the captured group can never contain a space, so it is returned as is
    return found.group(1) if found is not None else None
|
178
|
+
|
179
|
+
|
180
|
+
def validate_url(url: str) -> Optional[str]:
    """Classify *url* as "DOI", "URL" or "ISSN".

    Checks run in that order: validate_doi() is truthy, the scheme parsed by
    furl is http/https, or the string has the form ``ISSN xxxx-xxxx`` /
    ``eISSN xxxx-xxxx``. Returns None for None input or when nothing matches.
    """
    if url is None:
        return None
    if validate_doi(url):
        return "DOI"
    parsed = furl(url)
    if parsed and parsed.scheme in ["http", "https"]:
        return "URL"
    issn_match = re.search(
        r"\A(ISSN|eISSN) (\d{4}-\d{3}[0-9X]+)\Z",
        url,
    )
    return "ISSN" if issn_match is not None else None
|
195
|
+
|
196
|
+
|
197
|
+
def normalize_orcid(orcid: Optional[str]) -> Optional[str]:
    """Return the ORCID as a ``https://orcid.org/`` URL.

    Returns None for non-str input or when validate_orcid() rejects it.
    """
    if not isinstance(orcid, str):
        return None
    orcid_id = validate_orcid(orcid)
    return ("https://orcid.org/" + orcid_id) if orcid_id is not None else None
|
205
|
+
|
206
|
+
|
207
|
+
def validate_orcid(orcid: Optional[str]) -> Optional[str]:
    """Extract the bare ORCID iD from an iD or an orcid.org URL.

    Accepts the iD on its own or prefixed with ``http(s)://orcid.org/``,
    ``www.orcid.org`` or ``sandbox.orcid.org``. The four blocks may be
    separated by hyphens or spaces.

    Returns:
        The iD with spaces replaced by hyphens (``xxxx-xxxx-xxxx-xxxx``), or
        None when the input is not a str or does not match.
    """
    if orcid is None or not isinstance(orcid, str):
        return None
    match = re.search(
        # exactly one trailing check character (digit or X); the previous
        # "[0-9X]+" also accepted identifiers that were too long
        r"\A(?:(?:http|https)://(?:(?:www|sandbox)?\.)?orcid\.org/)?(\d{4}[ -]\d{4}[ -]\d{4}[ -]\d{3}[0-9X])\Z",
        orcid,
    )
    if match is None:
        return None
    return match.group(1).replace(" ", "-")
|
219
|
+
|
220
|
+
|
221
|
+
def validate_isni(isni: Optional[str]) -> Optional[str]:
    """Extract the bare ISNI from an ISNI or an ``http(s)://isni.org/isni/`` URL.

    The four blocks may optionally be separated by a space or a hyphen.

    Returns:
        The matched identifier with spaces removed (hyphens, if present, are
        kept), or None when the input is not a str or does not match.
    """
    if isni is None or not isinstance(isni, str):
        return None
    match = re.search(
        # exactly one trailing check character (digit or X); the previous
        # "[0-9X]+" also accepted identifiers that were too long
        r"\A(?:(?:http|https)://isni\.org/isni/)?(\d{4}([ -])?\d{4}([ -])?\d{4}([ -])?\d{3}[0-9X])\Z",
        isni,
    )
    if match is None:
        return None
    return match.group(1).replace(" ", "")
|
233
|
+
|
234
|
+
|
235
|
+
def normalize_isni(isni: Optional[str]) -> Optional[str]:
    """Return the ISNI as a ``https://isni.org/isni/`` URL.

    Returns None for non-str input or when validate_isni() rejects it.
    """
    if not isinstance(isni, str):
        return None
    isni_id = validate_isni(isni)
    return ("https://isni.org/isni/" + isni_id) if isni_id is not None else None
|
243
|
+
|
244
|
+
|
245
|
+
def normalize_name_identifier(ni: Optional[str]) -> Optional[str]:
    """Normalize a name identifier given as str, dict or list.

    Args:
        ni: A single identifier (str or dict, handed to
            format_name_identifier()) or a list of such identifiers.

    Returns:
        The formatted identifier; for a list, the first entry that
        format_name_identifier() can format. None when nothing can be
        formatted or the input type is not supported.
    """
    if ni is None:
        return None
    if isinstance(ni, (str, dict)):
        return format_name_identifier(ni)
    if isinstance(ni, list):
        # the list itself holds the identifiers; take the first usable one
        for candidate in ni:
            formatted = format_name_identifier(candidate)
            if formatted is not None:
                return formatted
    return None
|
259
|
+
|
260
|
+
|
261
|
+
def format_name_identifier(ni):
    """Format a name identifier given as a str or a dict.

    A str is tried as ORCID, then ROR, then ISNI. For a dict, the identifier
    is read from ``nameIdentifier``/``identifier``/``publisherIdentifier``
    and the scheme from ``nameIdentifierScheme``/``scheme``/
    ``publisherIdentifierScheme``. The scheme selects the normalizer; without
    a known scheme, an identifier that validate_url() classifies as "URL" is
    returned unchanged, and any other str is prefixed with ``schemeURI``/
    ``schemeUri`` when one is given. Returns None otherwise.
    """
    if ni is None:
        return None
    if isinstance(ni, str):
        return normalize_orcid(ni) or normalize_ror(ni) or normalize_isni(ni)

    identifier = (
        ni.get("nameIdentifier", None)
        or ni.get("identifier", None)
        or ni.get("publisherIdentifier", None)
    )
    scheme = (
        ni.get("nameIdentifierScheme", None)
        or ni.get("scheme", None)
        or ni.get("publisherIdentifierScheme", None)
    )
    scheme_uri = ni.get("schemeURI", None) or ni.get("schemeUri", None)

    if identifier is None:
        return None
    if scheme in ["ORCID", "orcid"]:
        return normalize_orcid(identifier)
    if scheme == "ISNI":
        return normalize_isni(identifier)
    if scheme == "ROR":
        return normalize_ror(identifier)
    if validate_url(identifier) == "URL":
        return identifier
    if isinstance(identifier, str) and scheme_uri is not None:
        return scheme_uri + identifier
    return None
|
291
|
+
|
292
|
+
|
293
|
+
def normalize_issn(string, **kwargs):
    """Normalize ISSN

    Pick electronic issn if there are multiple
    Format issn as xxxx-xxxx

    Args:
        string: The ISSN as a str, a dict holding it under the ``content``
            key, or a list of such dicts (the first one whose ``media_type``
            is "electronic" is used).
        **kwargs: ``content`` names the dict key to read (default "#text").

    Returns:
        A nine-character value unchanged, an eight-character value with a
        hyphen inserted after the fourth character, otherwise None. None is
        also returned for unsupported input types and non-str values.
    """
    content = kwargs.get("content", "#text")
    if isinstance(string, str):
        issn = string
    elif isinstance(string, dict):
        issn = string.get(content, None)
    elif isinstance(string, list):
        issn = next(
            (i for i in string if i.get("media_type", None) == "electronic"), {}
        ).get(content, None)
    else:
        # None and unsupported types: nothing to normalize
        issn = None
    if not isinstance(issn, str):
        return None
    if len(issn) == 9:
        return issn
    if len(issn) == 8:
        return issn[0:4] + "-" + issn[4:8]
    return None
|
315
|
+
|
316
|
+
|
317
|
+
def dict_to_spdx(dct: dict) -> dict:
    """Convert a license dict to its SPDX ``id``/``url`` form.

    The ``url`` is normalized with normalize_cc_url(), then the bundled
    ``resources/spdx/licenses.json`` is searched for the first license whose
    ``licenseId`` equals ``id`` (case-insensitively) or whose first
    ``seeAlso`` entry equals the normalized URL.

    Args:
        dct: Dict with optional ``id`` and ``url`` keys. It is not modified.

    Returns:
        ``{"id": ..., "url": ...}`` taken from the matching license, or a
        copy of the input with the normalized ``url`` when nothing matches;
        either way passed through compact().
    """
    # work on a copy so the caller's dict is left untouched
    dct = {**dct, "url": normalize_cc_url(dct.get("url", None))}
    file_path = os.path.join(
        os.path.dirname(__file__), "resources", "spdx", "licenses.json"
    )
    with open(file_path, encoding="utf-8") as file:
        spdx = json.loads(file.read()).get("licenses")

    # computed once instead of once per license; tolerates an explicit None id
    wanted_id = (dct.get("id", None) or "").casefold()
    wanted_url = dct.get("url", None)
    license_ = next(
        (
            lic
            for lic in spdx
            if lic["licenseId"].casefold() == wanted_id
            # guard against licenses with an empty seeAlso list
            or (lic["seeAlso"] and lic["seeAlso"][0] == wanted_url)
        ),
        None,
    )
    if license_ is None:
        return compact(dct)
    see_also = license_["seeAlso"]
    return compact(
        {
            "id": license_["licenseId"],
            "url": see_also[0] if see_also else None,
        }
    )
|
346
|
+
|
347
|
+
# else
|
348
|
+
# {
|
349
|
+
# 'rights': hsh['#text'] || hsh['rights'],
|
350
|
+
# 'rightsUri': hsh['rightsUri'] || hsh['rightsUri'],
|
351
|
+
# 'rightsIdentifier': hsh['rightsIdentifier'].present? ? hsh['rightsIdentifier'].downcase : None,
|
352
|
+
# 'rightsIdentifierScheme': hsh['rightsIdentifierScheme'],
|
353
|
+
# 'schemeUri': hsh['schemeUri'],
|
354
|
+
# 'lang': hsh['lang']
|
355
|
+
# }.compact
|
356
|
+
# end
|
357
|
+
# end
|
358
|
+
|
359
|
+
|
360
|
+
def from_json_feed(elements: list) -> list:
    """Convert JSON Feed elements by renaming their ``url`` key to ``id``.

    Dict elements are modified in place and returned; any element that is
    not a dict becomes None in the result.
    """
    renames = {"url": "id"}

    def format_element(item):
        """Rename keys on one element."""
        if not isinstance(item, dict):
            return None
        for old_key, new_key in renames.items():
            # keys whose value is None are left where they are
            if item.get(old_key, None) is not None:
                item[new_key] = item.pop(old_key)
        return item

    return list(map(format_element, elements))
|
374
|
+
|
375
|
+
|
376
|
+
def from_inveniordm(elements: list) -> list:
    """Convert from inveniordm elements.

    Each element may be wrapped as ``{"person_or_org": {...}}``; the nested
    value is used when that key is present. The ``orcid`` key is renamed to
    ``ORCID``. Dict elements are modified in place; anything that is not a
    dict (before or after unwrapping) becomes None in the result.
    """

    def format_element(element):
        """format element"""
        # type check first: non-dict elements have no keys to look at
        if isinstance(element, dict) and "person_or_org" in element:
            element = element["person_or_org"]
        if not isinstance(element, dict):
            return None
        mapping = {"orcid": "ORCID"}
        for key, value in mapping.items():
            if element.get(key, None) is not None:
                element[value] = element.pop(key)
        return element

    return [format_element(i) for i in elements]
|
393
|
+
|
394
|
+
|
395
|
+
def to_inveniordm(elements: list) -> list:
    """Convert elements to InvenioRDM.

    Only ``familyName``, ``givenName``, ``name``, ``type`` and ``ORCID`` are
    carried over; each result is passed through compact().
    """
    kept_keys = ("familyName", "givenName", "name", "type", "ORCID")

    def format_element(i):
        """Copy the kept keys of one element."""
        return compact({key: i.get(key, None) for key in kept_keys})

    return [format_element(i) for i in elements]
|
409
|
+
|
410
|
+
|
411
|
+
def from_crossref_xml(elements: list) -> list:
    """Convert from crossref_xml elements.

    An element with a ``name`` is typed "Organization", all others "Person".
    ``given_name``/``surname`` are copied to ``givenName``/``familyName``,
    ``contributor_role`` (default "author") becomes the capitalized
    ``contributorType``, and an ``ORCID`` value is parsed with
    parse_attributes() and normalized. The crossref-specific keys are then
    dropped and the result is passed through compact().
    """

    def format_element(element):
        """format element"""
        has_name = element.get("name", None) is not None
        element["type"] = "Organization" if has_name else "Person"
        element["givenName"] = element.get("given_name", None)
        element["familyName"] = element.get("surname", None)
        role = element.get("contributor_role", "author")
        element["contributorType"] = role.capitalize()
        raw_orcid = element.get("ORCID", None)
        if raw_orcid is not None:
            element["ORCID"] = normalize_orcid(parse_attributes(raw_orcid))
        trimmed = py_.omit(
            element, "given_name", "surname", "sequence", "contributor_role"
        )
        return compact(trimmed)

    return [format_element(i) for i in elements]
|
439
|
+
|
440
|
+
|
441
|
+
def from_kbase(elements: list) -> list:
    """Convert from kbase elements.

    For each dict element, ``contributor_id`` is converted with from_curie()
    and stored as ``ORCID``, every entry of ``contributor_roles`` is mapped
    with map_contributor_role(), ``contributor_id`` is dropped and the result
    is passed through compact(). Elements that are not dicts become None.
    """

    def map_contributor_role(role):
        """Map a ``vocabulary:term`` role string to a contributor role."""
        parts = role.split(":")
        if len(parts) < 2:
            # no vocabulary prefix to interpret: keep the role as given
            return role
        vocabulary, term = parts[0], parts[1]
        if vocabulary == "CRediT":
            return py_.pascal_case(term)
        if vocabulary == "DataCite":
            return DATACITE_CONTRIBUTOR_TYPES.get(term, "Other")
        return term

    def format_element(element):
        """format element"""
        if not isinstance(element, dict):
            return None
        if element.get("contributor_id", None) is not None:
            element["ORCID"] = from_curie(element["contributor_id"])
        element["contributor_roles"] = [
            map_contributor_role(i)
            for i in wrap(element.get("contributor_roles", None))
        ]
        element = py_.omit(element, "contributor_id")
        return compact(element)

    return [format_element(i) for i in elements]
|
466
|
+
|
467
|
+
|
468
|
+
def from_csl(elements: list) -> list:
    """Convert from csl elements.

    An element with ``literal`` or ``name`` is typed "Organization" (the
    literal becomes the name); otherwise it is typed "Person" and its name
    is ``given`` and ``family`` joined by a space. ``given``/``family`` are
    copied to ``givenName``/``familyName``, the CSL-specific keys are dropped
    and the result is passed through compact().
    """

    def format_element(element):
        """format element"""
        literal = element.get("literal", None)
        if literal is not None:
            element["type"] = "Organization"
            element["name"] = literal
        elif element.get("name", None) is not None:
            element["type"] = "Organization"
        else:
            element["type"] = "Person"
            given, family = element.get("given", ""), element.get("family", "")
            element["name"] = " ".join([given, family])
        copies = (
            ("given", "givenName"),
            ("family", "familyName"),
            ("affiliation", "affiliation"),
        )
        for source_key, target_key in copies:
            element[target_key] = element.get(source_key, None)
        trimmed = py_.omit(element, "given", "family", "literal", "sequence")
        return compact(trimmed)

    return [format_element(i) for i in elements]
|
491
|
+
|
492
|
+
|
493
|
+
def to_csl(elements: list) -> list:
    """Convert elements to CSL-JSON.

    ``familyName``/``givenName`` become ``family``/``given``; ``name`` is
    emitted as ``literal`` only when there is no ``familyName``. Each result
    is passed through compact().
    """

    def format_element(i):
        """format element"""
        family = i.get("familyName", None)
        literal = i.get("name", None) if family is None else None
        return compact(
            {
                "family": family,
                "given": i.get("givenName", None),
                "literal": literal,
            }
        )

    return [format_element(i) for i in elements]
|
507
|
+
|
508
|
+
|
509
|
+
def to_ris(elements: Optional[list]) -> list:
    """Convert elements to RIS name strings.

    Args:
        elements: Dicts with ``familyName``/``givenName`` and/or ``name``;
            None is treated as an empty list.

    Returns:
        One string per element that has a ``name`` or ``familyName``:
        "familyName, givenName" when both parts are present, otherwise the
        ``name``, otherwise the ``familyName`` alone.
    """
    if elements is None:
        return []

    def format_element(i):
        """format element"""
        family = i.get("familyName", None)
        given = i.get("givenName", None)
        if family and given:
            return ", ".join([family, given])
        # fall back to the family name so the entry is never None
        return i.get("name", None) or family

    return [
        format_element(i)
        for i in elements
        if i.get("name", None) or i.get("familyName", None)
    ]
|
527
|
+
|
528
|
+
|
529
|
+
def to_schema_org(element: Optional[dict]) -> Optional[dict]:
    """Convert a metadata element to Schema.org.

    Renames ``type`` to ``@type``, ``id`` to ``@id`` and ``title`` to
    ``name`` on the given dict (in place) and returns it. Keys whose value is
    None are left as they are. Returns None when *element* is not a dict.
    """
    if not isinstance(element, dict):
        return None
    renames = (("type", "@type"), ("id", "@id"), ("title", "name"))
    for source_key, target_key in renames:
        if element.get(source_key, None) is not None:
            element[target_key] = element.pop(source_key)
    return element
|
538
|
+
|
539
|
+
|
540
|
+
def to_schema_org_creators(elements: list) -> list:
    """Convert creators to Schema.org.

    An element with a ``familyName`` and no ``name`` is typed "Person" and
    gets a ``name`` built from ``givenName`` and ``familyName``; every other
    element is typed "Organization". ``type`` and ``contributorRoles`` are
    dropped and the result is passed through compact().
    """

    def format_element(element):
        """format element"""
        if element.get("familyName", None) and element.get("name", None) is None:
            name_parts = [element.get("givenName", None), element.get("familyName")]
            # skip a missing given name instead of failing on None
            element["name"] = " ".join(part for part in name_parts if part)
            element["@type"] = "Person"
        else:
            element["@type"] = "Organization"
        element = py_.omit(element, "type", "contributorRoles")
        return compact(element)

    return [format_element(i) for i in elements]
|
557
|
+
|
558
|
+
|
559
|
+
def to_schema_org_container(element: Optional[dict], **kwargs) -> Optional[dict]:
    """Convert CSL container to Schema.org container.

    Keyword arguments read: ``type`` ("DataRepository" selects the
    "DataCatalog" type, anything else "Periodical") and ``container_title``
    (used as the name when the element has no ``title``).
    Returns None when *element* is not a dict.
    """
    # NOTE(review): a None element returns None even when container_title is
    # given, so the title fallback below only applies to dict elements.
    if not isinstance(element, dict):
        return None

    if kwargs.get("type", None) == "DataRepository":
        schema_type = "DataCatalog"
    else:
        schema_type = "Periodical"
    return compact(
        {
            "@id": element.get("identifier", None),
            "@type": schema_type,
            "name": element.get("title", None) or kwargs.get("container_title", None),
        }
    )
|
575
|
+
|
576
|
+
|
577
|
+
def to_schema_org_identifiers(elements: list) -> list:
    """Convert identifiers to Schema.org ``PropertyValue`` dicts.

    ``identifierType`` becomes ``propertyID`` and ``identifier`` becomes
    ``value``; each result is passed through compact().
    """

    def format_element(i):
        """format element"""
        return compact(
            {
                "@type": "PropertyValue",
                "propertyID": i.get("identifierType", None),
                "value": i.get("identifier", None),
            }
        )

    return [format_element(i) for i in elements]
|
589
|
+
|
590
|
+
|
591
|
+
def to_schema_org_relations(related_items: list, relation_type=None):
    """Convert relatedItems to Schema.org relations.

    Args:
        related_items: Related-item dicts (a single item is wrapped).
        relation_type: Only items with this ``relationType`` are converted;
            "References" also selects "Cites".

    Returns:
        A list with one dict per selected item: a ``Periodical`` with its
        ``issn`` for ISSN items that are "IsPartOf", otherwise an ``@id``
        built from the normalized ``relatedIdentifier``.
    """

    def format_element(i):
        """format element"""
        # .get() throughout: items carry either the relatedItem* keys or
        # relatedIdentifier, so a missing key must not raise
        if (
            i.get("relatedItemIdentifierType") == "ISSN"
            and i.get("relationType") == "IsPartOf"
        ):
            return compact(
                {"@type": "Periodical", "issn": i.get("relatedItemIdentifier")}
            )
        return compact({"@id": normalize_id(i.get("relatedIdentifier"))})

    # consolidate different relation types
    if relation_type == "References":
        relation_types = ["References", "Cites"]
    else:
        relation_types = [relation_type]

    selected = py_.filter(
        wrap(related_items), lambda ri: ri.get("relationType") in relation_types
    )
    return [format_element(i) for i in selected]
|
610
|
+
|
611
|
+
|
612
|
+
def find_from_format(pid=None, string=None, ext=None, dct=None, filename=None):
    """Find the reader format for the given input.

    The first applicable source decides, in this order: *pid*; *string*
    together with *ext* (by extension); *dct*; *string* alone; *filename*.
    Without any input the result is "datacite".
    """
    if pid is not None:
        detected = find_from_format_by_id(pid)
    elif string is not None and ext is not None:
        detected = find_from_format_by_ext(ext)
    elif dct is not None:
        detected = find_from_format_by_dict(dct)
    elif string is not None:
        detected = find_from_format_by_string(string)
    elif filename is not None:
        detected = find_from_format_by_filename(filename)
    else:
        detected = "datacite"
    return detected
|
625
|
+
|
626
|
+
|
627
|
+
def find_from_format_by_id(pid: str) -> Optional[str]:
    """Find the reader format from an identifier.

    A valid DOI with a known registration agency yields that agency's name
    in lowercase. Otherwise the identifier is matched against a list of URL
    patterns, first match wins; "schema_org" is the fallback.
    """
    doi = validate_doi(pid)
    if doi:
        registration_agency = get_doi_ra(doi)
        if registration_agency is not None:
            return registration_agency.lower()

    # order matters: the specific GitHub file patterns come before the
    # generic GitHub repository pattern
    url_patterns = (
        (r"\A(http|https):/(/)?github\.com/(.+)/CITATION.cff\Z", "cff"),
        (r"\A(http|https):/(/)?github\.com/(.+)/codemeta.json\Z", "codemeta"),
        (r"\A(http|https):/(/)?github\.com/(.+)\Z", "cff"),
        (r"\Ahttps:/(/)?api\.rogue-scholar\.org/posts/(.+)\Z", "json_feed_item"),
        (r"\Ahttps:/(/)(.+)/api/records/(.+)\Z", "inveniordm"),
    )
    for pattern, reader in url_patterns:
        if re.match(pattern, pid) is not None:
            return reader
    return "schema_org"
|
649
|
+
|
650
|
+
|
651
|
+
def find_from_format_by_ext(ext: str) -> Optional[str]:
    """Find the reader format from a file extension.

    ".bib" maps to "bibtex" and ".ris" to "ris"; anything else yields None.
    """
    known_extensions = ((".bib", "bibtex"), (".ris", "ris"))
    for suffix, reader in known_extensions:
        if ext == suffix:
            return reader
    return None
|
658
|
+
|
659
|
+
|
660
|
+
def find_from_format_by_dict(dct: dict) -> Optional[str]:
    """Find the reader format from the keys and values of a dict.

    Checks run in a fixed order and the first hit wins: commonmeta
    ``schema_version``, schema.org or codemeta ``@context``, ``guid``
    (JSON Feed item), DataCite ``schemaVersion``, Crossref ``source``, then
    the presence of ``issued.date-parts`` (csl), ``conceptdoi`` (inveniordm)
    or ``credit_metadata`` (kbase). Returns None for non-dict input or when
    nothing matches.
    """
    if not isinstance(dct, dict):
        return None
    if dct.get("schema_version", "").startswith("https://commonmeta.org"):
        return "commonmeta"
    context = dct.get("@context", None)
    if context == "http://schema.org":
        return "schema_org"
    if context in [
        "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld"
    ]:
        return "codemeta"
    if dct.get("guid", None) is not None:
        return "json_feed_item"
    if dct.get("schemaVersion", "").startswith("http://datacite.org/schema/kernel"):
        return "datacite"
    if dct.get("source", None) == "Crossref":
        return "crossref"
    # remaining formats are recognized by the presence of a characteristic key
    marker_paths = (
        ("issued.date-parts", "csl"),
        ("conceptdoi", "inveniordm"),
        ("credit_metadata", "kbase"),
    )
    for path, reader in marker_paths:
        if py_.get(dct, path) is not None:
            return reader
    return None
|
685
|
+
|
686
|
+
|
687
|
+
def find_from_format_by_string(string: str) -> Optional[str]:
    """Find reader from format by string.

    Tries, in order: JSON (commonmeta, schema.org, codemeta, JSON Feed item,
    DataCite, Crossref, CSL, InvenioRDM, KBase), XML (Crossref, DataCite),
    HTML with embedded metadata (treated as schema.org), YAML (CFF), and
    finally the plain-text RIS and BibTeX prefixes.

    Returns the reader name, or None when `string` is None or nothing matches.
    """
    if string is None:
        return None
    try:
        data = json.loads(string)
        if not isinstance(data, dict):
            raise TypeError

        # Only call startswith on real strings; the field may be null.
        schema = data.get("schema", None)
        if isinstance(schema, str) and schema.startswith("https://commonmeta.org"):
            return "commonmeta"

        # A list response wraps the records in "items": inspect the first one.
        # An empty or non-list "items" leaves the top-level object in place
        # instead of raising IndexError/KeyError.
        items = data.get("items", None)
        if isinstance(items, list) and items:
            data = items[0]
            if not isinstance(data, dict):
                raise TypeError

        context = data.get("@context", None)
        if context == "http://schema.org":
            return "schema_org"
        if context in [
            "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld"
        ]:
            return "codemeta"
        if data.get("guid", None) is not None:
            return "json_feed_item"
        datacite_version = data.get("schemaVersion", None)
        if isinstance(datacite_version, str) and datacite_version.startswith(
            "http://datacite.org/schema/kernel"
        ):
            return "datacite"
        if data.get("source", None) == "Crossref":
            return "crossref"
        if py_.get(data, "issued.date-parts") is not None:
            return "csl"
        if py_.get(data, "conceptdoi") is not None:
            return "inveniordm"
        if py_.get(data, "credit_metadata") is not None:
            return "kbase"
    except (TypeError, json.JSONDecodeError):
        # not JSON (or not a JSON object): fall through to the other formats
        pass

    try:
        data = BeautifulSoup(string, "xml")
        if data.find("doi_record"):
            return "crossref_xml"
        if data.find("resource"):
            return "datacite_xml"
    except ValueError:
        pass

    try:
        data = BeautifulSoup(string, "html.parser")
        if (
            data.find("script", type="application/ld+json")
            or data.find("meta", {"name": "citation_doi"})
            or data.find("meta", {"name": "dc.identifier"})
        ):
            return "schema_org"
    except ValueError:
        pass

    try:
        data = yaml.safe_load(string)
        # safe_load may return a scalar or list; the AttributeError from
        # .get is what signals "not a CFF mapping" in that case.
        if data.get("cff-version", None):
            return "cff"
    except (yaml.YAMLError, AttributeError):
        pass

    # RIS tags are two letters, two spaces, a hyphen and a space ("TY  - ").
    # The single-space form is also accepted for backward compatibility.
    if string.startswith(("TY  - ", "TY - ")):
        return "ris"
    if any(string.startswith(f"@{t}") for t in bibtexparser.bibdatabase.STANDARD_TYPES):
        return "bibtex"

    # no format found
    return None
|
753
|
+
|
754
|
+
|
755
|
+
def find_from_format_by_filename(filename):
    """Return "cff" when the filename is exactly "CITATION.cff", else None."""
    return "cff" if filename == "CITATION.cff" else None
|
760
|
+
|
761
|
+
|
762
|
+
def from_schema_org(element):
    """Rename the JSON-LD keys "@type" and "@id" to "type" and "id".

    Returns a new mapping, passed through `compact`, in which the values of
    "@type" and "@id" are stored under "type" and "id" and the "@"-prefixed
    keys are removed. Returns None when `element` is None.

    The input mapping is left untouched.
    """
    if element is None:
        return None
    # Build the renamed keys on a shallow copy so the caller's dict does not
    # silently gain "type"/"id" entries.
    converted = {
        **element,
        "type": element.get("@type", None),
        "id": element.get("@id", None),
    }
    return compact(py_.omit(converted, ["@type", "@id"]))
|
769
|
+
|
770
|
+
|
771
|
+
def from_schema_org_creators(elements: list) -> list:
    """Convert schema.org creators to commonmeta.

    Each entry of `elements` (wrapped into a list first) is either a plain
    string, returned as {"name": <string>}, or a schema.org Person or
    Organization mapping. For mappings the function derives "id"/"type",
    "givenName"/"familyName" (taken from the entry, or split from "name"),
    a "creatorName" fallback when no family name could be determined, and a
    normalized "affiliation".

    The input mappings are not modified.
    """
    # Suffixes after ", " that are stripped from a name ("Jane Doe, PhD").
    academic_titles = {
        "MD",
        "PhD",
        "DVM",
        "DDS",
        "DMD",
        "JD",
        "MBA",
        "MPH",
        "MS",
        "MA",
        "MFA",
        "MSc",
        "MEd",
        "MEng",
        "MPhil",
        "MRes",
        "LLM",
        "LLB",
        "BSc",
        "BA",
        "BFA",
        "BEd",
        "BEng",
        "BPhil",
    }

    def format_element(i):
        """format element"""
        if isinstance(i, str):
            return {"name": i}

        element = {}
        if urlparse(i.get("@id", None)).hostname == "orcid.org":
            element["id"] = i.get("@id")
            element["type"] = "Person"
        elif isinstance(i.get("@type", None), str):
            element["type"] = i.get("@type")
        elif isinstance(i.get("@type", None), list):
            element["type"] = py_.find(
                i["@type"], lambda x: x in ["Person", "Organization"]
            )

        # Work on local copies of the name so the caller's data is untouched.
        # A missing name is treated like an empty one instead of raising.
        raw_name = i.get("name", None)
        name = "" if raw_name is None else str(raw_name)
        name_text = raw_name

        # strip text after comma if suffix is an academic title
        head, separator, suffix = name.partition(", ")
        if separator and suffix in academic_titles:
            name = head
            name_text = head

        length = len(name.split(" "))
        if i.get("givenName", None):
            element["givenName"] = i.get("givenName", None)
        if i.get("familyName", None):
            element["familyName"] = i.get("familyName", None)
            element["type"] = "Person"
        # parentheses around the last word indicate an organization
        elif length > 1 and not name.rsplit(" ", maxsplit=1)[-1].startswith("("):
            given_name, family_name = name.rsplit(" ", maxsplit=1)
            element["givenName"] = given_name
            # a plain string, matching the type of givenName
            element["familyName"] = family_name
        if not element.get("familyName", None):
            element["creatorName"] = compact(
                {
                    "type": i.get("@type", None),
                    "#text": name_text,
                }
            )

        if isinstance(i.get("affiliation", None), str):
            element["affiliation"] = {"type": "Organization", "name": i["affiliation"]}
        elif urlparse(py_.get(i, "affiliation.@id", "")).hostname in [
            "ror.org",
            "isni.org",
        ]:
            element["affiliation"] = {
                "id": i["affiliation"]["@id"],
                "type": "Organization",
                "name": i["affiliation"]["name"],
            }
        return compact(element)

    return [format_element(i) for i in wrap(elements)]
|
851
|
+
|
852
|
+
|
853
|
+
def github_from_url(url: str) -> dict:
    """Get github owner, repo, release and path from url.

    Accepts https URLs on github.com and raw.githubusercontent.com and
    returns a dict with the keys "owner", "repo", "release" and "path",
    passed through `compact`. Any other URL yields an empty dict.

    The two hosts lay out their paths differently:
      github.com:                 /owner/repo/<tree|blob>/release/path...
      raw.githubusercontent.com:  /owner/repo/release/path...
    """

    match = re.match(
        r"\Ahttps://(github|raw\.githubusercontent)\.com/(.+)(?:/)?(.+)?(?:/tree/)?(.*)\Z",
        url,
    )
    if match is None:
        return {}

    words = urlparse(url).path.lstrip("/").split("/")
    owner = words[0] if len(words) > 0 else None
    repo = words[1] if len(words) > 1 else None
    if match.group(1) == "raw.githubusercontent":
        # raw URLs have no "tree"/"blob" segment before the ref
        release = words[2] if len(words) > 2 else None
        path = "/".join(words[3:])
    else:
        release = words[3] if len(words) > 3 else None
        path = "/".join(words[4:])
    if len(path) == 0:
        path = None

    return compact({"owner": owner, "repo": repo, "release": release, "path": path})
|
871
|
+
|
872
|
+
|
873
|
+
def github_repo_from_url(url: str) -> Optional[str]:
    """Return the "repo" entry that github_from_url extracts, or None."""
    parts = github_from_url(url)
    return parts.get("repo", None)
|
876
|
+
|
877
|
+
|
878
|
+
def github_release_from_url(url: str) -> Optional[str]:
    """Return the "release" entry that github_from_url extracts, or None."""
    parts = github_from_url(url)
    return parts.get("release", None)
|
881
|
+
|
882
|
+
|
883
|
+
def github_owner_from_url(url: str) -> Optional[str]:
    """Return the "owner" entry that github_from_url extracts, or None."""
    parts = github_from_url(url)
    return parts.get("owner", None)
|
886
|
+
|
887
|
+
|
888
|
+
def github_as_owner_url(url: str) -> Optional[str]:
    """Build the https://github.com/<owner> URL for a GitHub URL.

    Returns None when github_from_url finds no owner.
    """
    owner = github_from_url(url).get("owner", None)
    if owner is None:
        return None
    return f"https://github.com/{owner}"
|
894
|
+
|
895
|
+
|
896
|
+
def github_as_repo_url(url) -> Optional[str]:
    """Build the https://github.com/<owner>/<repo> URL for a GitHub URL.

    Returns None when github_from_url finds no repo.
    """
    parts = github_from_url(url)
    repo = parts.get("repo", None)
    if repo is None:
        return None
    owner = parts.get("owner")
    return f"https://github.com/{owner}/{repo}"
|
902
|
+
|
903
|
+
|
904
|
+
def github_as_release_url(url: str) -> Optional[str]:
    """Build the https://github.com/<owner>/<repo>/tree/<release> URL.

    Returns None when github_from_url finds no release.
    """
    parts = github_from_url(url)
    release = parts.get("release", None)
    if release is None:
        return None
    owner = parts.get("owner")
    repo = parts.get("repo")
    return f"https://github.com/{owner}/{repo}/tree/{release}"
|
910
|
+
|
911
|
+
|
912
|
+
def github_as_codemeta_url(url: str) -> Optional[str]:
    """Build the raw.githubusercontent.com URL of a codemeta.json file.

    When the parsed path already ends with "codemeta.json", that file is
    addressed at the parsed release. Otherwise, if an owner was found, the
    URL points at codemeta.json on the "master" branch. Returns None when
    no owner could be parsed.
    """
    parts = github_from_url(url)
    owner = parts.get("owner")
    repo = parts.get("repo")
    path = parts.get("path", None)

    if path and path.endswith("codemeta.json"):
        release = parts.get("release")
        return f"https://raw.githubusercontent.com/{owner}/{repo}/{release}/{path}"
    if parts.get("owner", None):
        return f"https://raw.githubusercontent.com/{owner}/{repo}/master/codemeta.json"
    return None
|
924
|
+
|
925
|
+
|
926
|
+
def github_as_cff_url(url: str) -> Optional[str]:
    """Build the raw.githubusercontent.com URL of a CITATION.cff file.

    When the parsed path already ends with "CITATION.cff", that file is
    addressed at the parsed release. Otherwise, if an owner was found, the
    URL points at CITATION.cff on the "main" branch. Returns None when no
    owner could be parsed.
    """
    parts = github_from_url(url)
    owner = parts.get("owner")
    repo = parts.get("repo")
    path = parts.get("path", None)

    if path and path.endswith("CITATION.cff"):
        release = parts.get("release")
        return f"https://raw.githubusercontent.com/{owner}/{repo}/{release}/{path}"
    if parts.get("owner", None):
        return f"https://raw.githubusercontent.com/{owner}/{repo}/main/CITATION.cff"
    return None
|
937
|
+
|
938
|
+
|
939
|
+
def pages_as_string(
    container: Optional[dict], page_range_separator="-"
) -> Optional[str]:
    """Format the page range of a container, e.g. for BibTeX.

    Returns None without a container or without a "firstPage", the first
    page alone when there is no "lastPage", and otherwise both pages joined
    by `page_range_separator`.
    """
    if container is None:
        return None

    first_page = container.get("firstPage", None)
    if first_page is None:
        return None

    last_page = container.get("lastPage", None)
    if last_page is None:
        return first_page

    return page_range_separator.join([first_page, last_page])
|
953
|
+
|
954
|
+
|
955
|
+
def subjects_as_string(subjects):
    """Convert subject list to string, e.g. for bibtex.

    Collects the "subject" value of every entry (after wrapping `subjects`
    into a list) and joins them with ", ". Entries without a "subject"
    value are skipped. Returns None when `subjects` is None.
    """
    if subjects is None:
        return None

    # Drop missing values: str.join raises TypeError on None.
    keywords = [
        keyword
        for keyword in (subject.get("subject", None) for subject in wrap(subjects))
        if keyword is not None
    ]
    return ", ".join(keywords)
|
964
|
+
|
965
|
+
|
966
|
+
# def reverse():
|
967
|
+
# return { 'citation': wrap(related_identifiers).select do |ri|
|
968
|
+
# ri['relationType'] == 'IsReferencedBy'
|
969
|
+
# end.map do |r|
|
970
|
+
# { '@id': normalize_doi(r['relatedIdentifier']),
|
971
|
+
#         '@type': r['resourceTypeGeneral'] or 'ScholarlyArticle',
|
972
|
+
# 'identifier': r['relatedIdentifierType'] == 'DOI' ? nil : to_identifier(r) }.compact
|
973
|
+
# end.unwrap,
|
974
|
+
# 'isBasedOn': wrap(related_identifiers).select do |ri|
|
975
|
+
# ri['relationType'] == 'IsSupplementTo'
|
976
|
+
# end.map do |r|
|
977
|
+
# { '@id': normalize_doi(r['relatedIdentifier']),
|
978
|
+
# '@type': r['resourceTypeGeneral'] or 'ScholarlyArticle',
|
979
|
+
# 'identifier': r['relatedIdentifierType'] == 'DOI' ? nil : to_identifier(r) }.compact
|
980
|
+
# end.unwrap }.compact
|
981
|
+
|
982
|
+
|
983
|
+
def name_to_fos(name: str) -> Optional[dict]:
    """Wrap a subject name in a {"subject": ...} dict.

    The lookup against the Fields of Science (OECD) mappings is not
    implemented; the commented-out Ruby code below is kept as the reference
    for a future port. For now the name is only stripped of surrounding
    whitespace.
    """
    #   # first find subject in Fields of Science (OECD)
    #   fos = JSON.load(File.read(File.expand_path('../../resources/oecd/fos-mappings.json',
    #                                              __dir__))).fetch('fosFields')

    #   subject = fos.find { |l| l['fosLabel'] == name || 'FOS: ' + l['fosLabel'] == name }

    #   if subject
    #     return [{
    #       'subject': sanitize(name).downcase
    #     },
    #             {
    #               'subject': 'FOS: ' + subject['fosLabel'],
    #               'subjectScheme': 'Fields of Science and Technology (FOS)',
    #               'schemeUri': 'http://www.oecd.org/science/inno/38235147.pdf'
    #             }]
    #   end

    #   # if not found, look in Fields of Research (Australian and New Zealand Standard Research Classification)
    #   # and map to Fields of Science. Add an extra entry for the latter
    #   fores = JSON.load(File.read(File.expand_path('../../resources/oecd/for-mappings.json',
    #                                                __dir__)))
    #   for_fields = fores.fetch('forFields')
    #   for_disciplines = fores.fetch('forDisciplines')

    #   subject = for_fields.find { |l| l['forLabel'] == name } ||
    #             for_disciplines.find { |l| l['forLabel'] == name }

    #   if subject
    #     [{
    #       'subject': sanitize(name).downcase
    #     },
    #      {
    #        'subject': 'FOS: ' + subject['fosLabel'],
    #        'subjectScheme': 'Fields of Science and Technology (FOS)',
    #        'schemeUri': 'http://www.oecd.org/science/inno/38235147.pdf'
    #      }]
    #   else

    subject = name.strip()
    return {"subject": subject}
|
1024
|
+
|
1025
|
+
|
1026
|
+
def encode_doi(prefix):
    """Build a https://doi.org/ URL from a DOI prefix and a generated suffix.

    The suffix comes from base32.generate with length 10, split every 5
    characters, and a checksum.
    """
    random_suffix = base32.generate(length=10, split_every=5, checksum=True)
    return "https://doi.org/" + f"{prefix}/{random_suffix}"
|
1030
|
+
|
1031
|
+
|
1032
|
+
def decode_doi(doi: str) -> int:
    """Decode the suffix of a DOI to a number.

    The suffix is the last piece of the DOI after splitting on "/" (at most
    five splits); it is passed to base32.decode.
    """
    *_, suffix = doi.split("/", maxsplit=5)
    return base32.decode(suffix)
|
1036
|
+
|
1037
|
+
|
1038
|
+
def from_curie(id: Optional[str]) -> Optional[str]:
    """Expand a CURIE ("PREFIX:value") into a URL or identifier.

    Supported prefixes are DOI, ROR, ISNI, ORCID, URL and JDP. Returns None
    for None, for a string without a ":" separator, and for any other prefix.
    """
    if id is None:
        return None

    # Split on the first colon only: the value itself may contain colons
    # (e.g. "URL:https://example.org").
    _type, separator, value = id.partition(":")
    if not separator:
        return None

    if _type == "DOI":
        return doi_as_url(value)
    if _type == "ROR":
        return "https://ror.org/" + value
    if _type == "ISNI":
        return "https://isni.org/isni/" + value
    if _type == "ORCID":
        return normalize_orcid(value)
    if _type == "URL":
        return normalize_url(value)
    if _type == "JDP":
        return value
    # TODO: resolvable url for other identifier types
    return None
|
1057
|
+
|
1058
|
+
|
1059
|
+
def issn_as_url(issn: str) -> Optional[str]:
    """Return the portal.issn.org resource URL for an ISSN, or None for None."""
    return None if issn is None else f"https://portal.issn.org/resource/ISSN/{issn}"
|
1064
|
+
|
1065
|
+
|
1066
|
+
def get_language(lang: str, format: str = "alpha_2") -> Optional[str]:
    """Provide a language string based on ISO 639, with either a name in English,
    ISO 639-1, or ISO 639-3 code as input. Optionally format as alpha_2 (default),
    alpha_3, or name.

    A two-character input is looked up as an alpha_2 code, a three-character
    input as an alpha_3 code, and anything else as a language name. Returns
    None for empty input, for an unknown language, and when alpha_2 is
    requested for a language that has no two-letter code.
    """
    if not lang:
        return None

    if len(lang) == 2:
        language = pycountry.languages.get(alpha_2=lang)
    elif len(lang) == 3:
        language = pycountry.languages.get(alpha_3=lang)
    else:
        language = pycountry.languages.get(name=lang)

    if language is None:
        return None
    if format == "name":
        return language.name
    if format == "alpha_3":
        return language.alpha_3
    # Many ISO 639-3 languages have no two-letter code; report None for them
    # rather than letting the attribute lookup raise.
    return getattr(language, "alpha_2", None)
|
1089
|
+
|
1090
|
+
|
1091
|
+
def start_case(content: str) -> str:
    """Capitalize first letter of each word without lowercasing the rest.

    Words are separated by single spaces. Empty words (from an empty string
    or from consecutive spaces) are kept as they are, so the spacing of the
    input is preserved.
    """
    # word[:1] is "" for an empty word, where word[0] would raise IndexError.
    return " ".join(word[:1].upper() + word[1:] for word in content.split(" "))
|
1096
|
+
|
1097
|
+
|
1098
|
+
def timer_func(func):
    """Decorator that prints how long each call to `func` takes.

    The wrapped function's return value is passed through unchanged, and
    its name and docstring are preserved on the wrapper.
    """
    from functools import wraps

    @wraps(func)
    def function_timer(*args, **kwargs):
        start = time.time()
        value = func(*args, **kwargs)
        end = time.time()
        runtime = end - start
        msg = "{func} took {time} seconds to complete its execution."
        print(msg.format(func=func.__name__, time=runtime))
        return value

    return function_timer
|