commonmeta-py 0.17.3__py3-none-any.whl → 0.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. commonmeta_py/__init__.py +2 -0
  2. {commonmeta_py-0.17.3.dist-info → commonmeta_py-0.20.dist-info}/METADATA +45 -56
  3. commonmeta_py-0.20.dist-info/RECORD +5 -0
  4. {commonmeta_py-0.17.3.dist-info → commonmeta_py-0.20.dist-info}/WHEEL +1 -1
  5. {commonmeta_py-0.17.3.dist-info → commonmeta_py-0.20.dist-info/licenses}/LICENSE +1 -1
  6. commonmeta/__init__.py +0 -96
  7. commonmeta/api_utils.py +0 -77
  8. commonmeta/author_utils.py +0 -261
  9. commonmeta/base_utils.py +0 -121
  10. commonmeta/cli.py +0 -200
  11. commonmeta/constants.py +0 -576
  12. commonmeta/crossref_utils.py +0 -575
  13. commonmeta/date_utils.py +0 -193
  14. commonmeta/doi_utils.py +0 -260
  15. commonmeta/metadata.py +0 -317
  16. commonmeta/readers/__init__.py +0 -1
  17. commonmeta/readers/bibtex_reader.py +0 -0
  18. commonmeta/readers/cff_reader.py +0 -199
  19. commonmeta/readers/codemeta_reader.py +0 -112
  20. commonmeta/readers/commonmeta_reader.py +0 -13
  21. commonmeta/readers/crossref_reader.py +0 -409
  22. commonmeta/readers/crossref_xml_reader.py +0 -508
  23. commonmeta/readers/csl_reader.py +0 -98
  24. commonmeta/readers/datacite_reader.py +0 -384
  25. commonmeta/readers/datacite_xml_reader.py +0 -357
  26. commonmeta/readers/inveniordm_reader.py +0 -199
  27. commonmeta/readers/json_feed_reader.py +0 -422
  28. commonmeta/readers/kbase_reader.py +0 -205
  29. commonmeta/readers/ris_reader.py +0 -103
  30. commonmeta/readers/schema_org_reader.py +0 -493
  31. commonmeta/resources/cff_v1.2.0.json +0 -1827
  32. commonmeta/resources/commonmeta_v0.12.json +0 -601
  33. commonmeta/resources/commonmeta_v0.13.json +0 -571
  34. commonmeta/resources/crossref/AccessIndicators.xsd +0 -47
  35. commonmeta/resources/crossref/JATS-journalpublishing1-3d2-mathml3-elements.xsd +0 -10130
  36. commonmeta/resources/crossref/JATS-journalpublishing1-3d2-mathml3.xsd +0 -48
  37. commonmeta/resources/crossref/JATS-journalpublishing1-elements.xsd +0 -8705
  38. commonmeta/resources/crossref/JATS-journalpublishing1-mathml3-elements.xsd +0 -8608
  39. commonmeta/resources/crossref/JATS-journalpublishing1-mathml3.xsd +0 -49
  40. commonmeta/resources/crossref/JATS-journalpublishing1.xsd +0 -6176
  41. commonmeta/resources/crossref/clinicaltrials.xsd +0 -61
  42. commonmeta/resources/crossref/common5.3.1.xsd +0 -1538
  43. commonmeta/resources/crossref/crossref5.3.1.xsd +0 -1949
  44. commonmeta/resources/crossref/crossref_query_output3.0.xsd +0 -1097
  45. commonmeta/resources/crossref/fundref.xsd +0 -49
  46. commonmeta/resources/crossref/module-ali.xsd +0 -39
  47. commonmeta/resources/crossref/relations.xsd +0 -444
  48. commonmeta/resources/crossref-v0.2.json +0 -60
  49. commonmeta/resources/csl-data.json +0 -538
  50. commonmeta/resources/datacite-v4.5.json +0 -829
  51. commonmeta/resources/ietf-bcp-47.json +0 -3025
  52. commonmeta/resources/iso-8601.json +0 -3182
  53. commonmeta/resources/spdx/licenses.json +0 -4851
  54. commonmeta/resources/spdx-schema..json +0 -903
  55. commonmeta/resources/styles/apa.csl +0 -1697
  56. commonmeta/resources/styles/chicago-author-date.csl +0 -684
  57. commonmeta/resources/styles/harvard-cite-them-right.csl +0 -321
  58. commonmeta/resources/styles/ieee.csl +0 -468
  59. commonmeta/resources/styles/modern-language-association.csl +0 -341
  60. commonmeta/resources/styles/vancouver.csl +0 -376
  61. commonmeta/schema_utils.py +0 -27
  62. commonmeta/translators.py +0 -47
  63. commonmeta/utils.py +0 -1075
  64. commonmeta/writers/__init__.py +0 -1
  65. commonmeta/writers/bibtex_writer.py +0 -149
  66. commonmeta/writers/citation_writer.py +0 -70
  67. commonmeta/writers/commonmeta_writer.py +0 -68
  68. commonmeta/writers/crossref_xml_writer.py +0 -17
  69. commonmeta/writers/csl_writer.py +0 -78
  70. commonmeta/writers/datacite_writer.py +0 -190
  71. commonmeta/writers/ris_writer.py +0 -58
  72. commonmeta/writers/schema_org_writer.py +0 -146
  73. commonmeta_py-0.17.3.dist-info/RECORD +0 -72
  74. commonmeta_py-0.17.3.dist-info/entry_points.txt +0 -3
commonmeta/utils.py DELETED
@@ -1,1075 +0,0 @@
1
- """Utils module for commonmeta-py"""
2
-
3
- import os
4
- import orjson as json
5
- import re
6
- import time
7
- from typing import Optional
8
- from urllib.parse import urlparse
9
- import yaml
10
- from furl import furl
11
- import bibtexparser
12
- from bs4 import BeautifulSoup
13
- from pydash import py_
14
- import base32_lib as base32
15
- import pycountry
16
-
17
- from .base_utils import wrap, compact, parse_attributes
18
- from .doi_utils import normalize_doi, doi_from_url, get_doi_ra, validate_doi, doi_as_url
19
- from .constants import DATACITE_CONTRIBUTOR_TYPES
20
-
21
-
22
# Maps a Creative Commons license URL (https, no trailing slash) to the
# canonical URL used for that license. Looked up by normalize_cc_url().
# The "/us" jurisdiction ports map to the unported legal code.
NORMALIZED_LICENSES = {
    "https://creativecommons.org/licenses/by/1.0": "https://creativecommons.org/licenses/by/1.0/legalcode",
    "https://creativecommons.org/licenses/by/2.0": "https://creativecommons.org/licenses/by/2.0/legalcode",
    "https://creativecommons.org/licenses/by/2.5": "https://creativecommons.org/licenses/by/2.5/legalcode",
    "https://creativecommons.org/licenses/by/3.0": "https://creativecommons.org/licenses/by/3.0/legalcode",
    "https://creativecommons.org/licenses/by/3.0/us": "https://creativecommons.org/licenses/by/3.0/legalcode",
    "https://creativecommons.org/licenses/by/4.0": "https://creativecommons.org/licenses/by/4.0/legalcode",
    "https://creativecommons.org/licenses/by-nc/1.0": "https://creativecommons.org/licenses/by-nc/1.0/legalcode",
    "https://creativecommons.org/licenses/by-nc/2.0": "https://creativecommons.org/licenses/by-nc/2.0/legalcode",
    "https://creativecommons.org/licenses/by-nc/2.5": "https://creativecommons.org/licenses/by-nc/2.5/legalcode",
    "https://creativecommons.org/licenses/by-nc/3.0": "https://creativecommons.org/licenses/by-nc/3.0/legalcode",
    "https://creativecommons.org/licenses/by-nc/4.0": "https://creativecommons.org/licenses/by-nc/4.0/legalcode",
    "https://creativecommons.org/licenses/by-nd-nc/1.0": "https://creativecommons.org/licenses/by-nd-nc/1.0/legalcode",
    "https://creativecommons.org/licenses/by-nd-nc/2.0": "https://creativecommons.org/licenses/by-nd-nc/2.0/legalcode",
    "https://creativecommons.org/licenses/by-nd-nc/2.5": "https://creativecommons.org/licenses/by-nd-nc/2.5/legalcode",
    "https://creativecommons.org/licenses/by-nd-nc/3.0": "https://creativecommons.org/licenses/by-nd-nc/3.0/legalcode",
    "https://creativecommons.org/licenses/by-nd-nc/4.0": "https://creativecommons.org/licenses/by-nd-nc/4.0/legalcode",
    "https://creativecommons.org/licenses/by-nc-sa/1.0": "https://creativecommons.org/licenses/by-nc-sa/1.0/legalcode",
    "https://creativecommons.org/licenses/by-nc-sa/2.0": "https://creativecommons.org/licenses/by-nc-sa/2.0/legalcode",
    "https://creativecommons.org/licenses/by-nc-sa/2.5": "https://creativecommons.org/licenses/by-nc-sa/2.5/legalcode",
    "https://creativecommons.org/licenses/by-nc-sa/3.0": "https://creativecommons.org/licenses/by-nc-sa/3.0/legalcode",
    "https://creativecommons.org/licenses/by-nc-sa/3.0/us": "https://creativecommons.org/licenses/by-nc-sa/3.0/legalcode",
    "https://creativecommons.org/licenses/by-nc-sa/4.0": "https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode",
    "https://creativecommons.org/licenses/by-nd/1.0": "https://creativecommons.org/licenses/by-nd/1.0/legalcode",
    "https://creativecommons.org/licenses/by-nd/2.0": "https://creativecommons.org/licenses/by-nd/2.0/legalcode",
    "https://creativecommons.org/licenses/by-nd/2.5": "https://creativecommons.org/licenses/by-nd/2.5/legalcode",
    "https://creativecommons.org/licenses/by-nd/3.0": "https://creativecommons.org/licenses/by-nd/3.0/legalcode",
    # previously pointed at the by-nd/2.0 legal code (copy-paste error)
    "https://creativecommons.org/licenses/by-nd/4.0": "https://creativecommons.org/licenses/by-nd/4.0/legalcode",
    "https://creativecommons.org/licenses/by-sa/1.0": "https://creativecommons.org/licenses/by-sa/1.0/legalcode",
    "https://creativecommons.org/licenses/by-sa/2.0": "https://creativecommons.org/licenses/by-sa/2.0/legalcode",
    "https://creativecommons.org/licenses/by-sa/2.5": "https://creativecommons.org/licenses/by-sa/2.5/legalcode",
    "https://creativecommons.org/licenses/by-sa/3.0": "https://creativecommons.org/licenses/by-sa/3.0/legalcode",
    "https://creativecommons.org/licenses/by-sa/4.0": "https://creativecommons.org/licenses/by-sa/4.0/legalcode",
    "https://creativecommons.org/licenses/by-nc-nd/1.0": "https://creativecommons.org/licenses/by-nc-nd/1.0/legalcode",
    "https://creativecommons.org/licenses/by-nc-nd/2.0": "https://creativecommons.org/licenses/by-nc-nd/2.0/legalcode",
    "https://creativecommons.org/licenses/by-nc-nd/2.5": "https://creativecommons.org/licenses/by-nc-nd/2.5/legalcode",
    "https://creativecommons.org/licenses/by-nc-nd/3.0": "https://creativecommons.org/licenses/by-nc-nd/3.0/legalcode",
    "https://creativecommons.org/licenses/by-nc-nd/4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode",
    "https://creativecommons.org/licenses/publicdomain": "https://creativecommons.org/licenses/publicdomain/",
    "https://creativecommons.org/publicdomain/zero/1.0": "https://creativecommons.org/publicdomain/zero/1.0/legalcode",
}
63
-
64
# Placeholder codes (":unav", ":tba", ...) paired with a human-readable
# explanation of why a metadata value is missing.
UNKNOWN_INFORMATION = {
    ":unac": "temporarily inaccessible",
    ":unal": "unallowed, suppressed intentionally",
    ":unap": "not applicable, makes no sense",
    ":unas": "value unassigned (e.g., Untitled)",
    ":unav": "value unavailable, possibly unknown",
    ":unkn": "known to be unknown (e.g., Anonymous, Inconnue)",
    ":none": "never had a value, never will",
    ":null": "explicitly and meaningfully empty",
    ":tba": "to be assigned or announced later",
    ":etal": "too numerous to list (et alia)",
}

# URL scheme prefixes used when upgrading http links to https.
HTTP_SCHEME = "http://"
HTTPS_SCHEME = "https://"
79
-
80
-
81
def normalize_id(pid: Optional[str], **kwargs) -> Optional[str]:
    """Normalize a persistent identifier to a DOI or an https URL.

    Args:
        pid: identifier as ``str`` (``bytes``/``bytearray`` are decoded first).
        **kwargs: passed through unchanged to ``normalize_doi``.

    Returns:
        Whatever ``normalize_doi`` returns when it recognizes ``pid``;
        otherwise the http(s) URL upgraded to https with trailing slashes
        removed; ``None`` when ``pid`` is neither.
    """
    if pid is None:
        return None

    # check if pid is a bytes object
    if isinstance(pid, (bytes, bytearray)):
        pid = pid.decode()

    # check for valid DOI
    doi = normalize_doi(pid, **kwargs)
    if doi is not None:
        return doi

    # check for valid HTTP uri and ensure https
    uri = urlparse(pid)
    if not uri.netloc or uri.scheme not in ("http", "https"):
        return None
    if uri.scheme == "http":
        # count=1: only upgrade the scheme itself, not an "http://" that
        # happens to appear later (e.g. inside a query string)
        pid = pid.replace(HTTP_SCHEME, HTTPS_SCHEME, 1)

    # remove trailing slash(es) only; the start of the URL is left alone
    return pid.rstrip("/")
107
-
108
-
109
def normalize_ids(ids: list, relation_type=None) -> list:
    """Convert a list of ``{"id": ...}`` dicts into related-identifier dicts.

    Each entry with a truthy ``id`` becomes a dict with ``relatedIdentifier``,
    ``relationType`` (the ``relation_type`` argument) and
    ``relatedIdentifierType`` (``"DOI"`` when ``doi_from_url`` finds a DOI,
    else ``"URL"``). Entries without an ``id`` produce ``None`` in the result.
    """

    def format_id(i):
        if not i.get("id", None):
            return None
        idn = normalize_id(i["id"])
        doi = doi_from_url(idn)
        # The original also computed an unused ``_type`` value via
        # ``wrap(i.get("type"))[0]``; it had no effect on the result and
        # indexed ``[0]`` on the wrapped value even when "type" was absent
        # (presumably an IndexError if wrap(None) is empty), so it is removed.
        return compact(
            {
                "relatedIdentifier": doi or idn,
                "relationType": relation_type,
                "relatedIdentifierType": "DOI" if doi is not None else "URL",
            }
        )

    return [format_id(i) for i in ids]
133
-
134
-
135
def normalize_url(url: Optional[str], secure=False, lower=False) -> Optional[str]:
    """Normalize a URL string.

    Args:
        url: the URL; anything that is not a ``str`` yields ``None``.
        secure: when ``True``, upgrade a leading ``http://`` to ``https://``.
        lower: when ``True``, lowercase the whole result.

    Returns:
        The URL without trailing slashes, or ``None``.
    """
    if not isinstance(url, str):
        return None
    # rstrip, not strip: only trailing slashes are noise; a leading slash
    # (relative or protocol-relative URL) is significant
    url = url.rstrip("/")
    if secure is True and url.startswith(HTTP_SCHEME):
        # swap just the scheme prefix; an embedded "http://" stays untouched
        url = HTTPS_SCHEME + url[len(HTTP_SCHEME) :]
    if lower is True:
        return url.lower()
    return url
146
-
147
-
148
def normalize_cc_url(url: Optional[str]):
    """Return the canonical Creative Commons URL for ``url``.

    The URL is first passed through ``normalize_url(..., secure=True)``; if the
    result is a key of ``NORMALIZED_LICENSES`` the mapped value is returned,
    otherwise the normalized URL itself. Non-string input yields ``None``.
    """
    if not isinstance(url, str):
        return None
    secure_url = normalize_url(url, secure=True)
    return NORMALIZED_LICENSES.get(secure_url, secure_url)
154
-
155
-
156
def normalize_ror(ror: Optional[str]) -> Optional[str]:
    """Return ``ror`` as a ``https://ror.org/<id>`` URL, or ``None`` if invalid."""
    ror_id = validate_ror(ror)
    if ror_id is not None:
        # turn the bare ROR ID into its URL form
        return "https://ror.org/" + ror_id
    return None
164
-
165
-
166
def validate_ror(ror: Optional[str]) -> Optional[str]:
    """Validate a ROR ID and return it without the ``ror.org`` URL prefix.

    Accepts either the bare ID or an ``http(s)://ror.org/<id>`` URL, where
    ``<id>`` is seven characters from ``[0-9a-z]`` followed by two digits.
    Returns ``None`` for anything else, including non-string input.
    """
    if not isinstance(ror, str):
        return None
    match = re.search(
        r"\A(?:(?:http|https)://ror\.org/)?([0-9a-z]{7}\d{2})\Z",
        ror,
    )
    if match is None:
        return None
    # The captured group can only contain [0-9a-z], so the former
    # ``.replace(" ", "-")`` could never change anything and is dropped.
    return match.group(1)
178
-
179
-
180
def validate_url(url: str) -> Optional[str]:
    """Classify an identifier string.

    Returns:
        ``"DOI"`` when ``validate_doi`` accepts it, ``"URL"`` for an http(s)
        URL, ``"ISSN"`` for ``"ISSN 1234-567X"`` / ``"eISSN 1234-567X"``,
        and ``None`` otherwise (including ``None`` input).
    """
    if url is None:
        return None
    if validate_doi(url):
        return "DOI"
    f = furl(url)
    if f and f.scheme in ["http", "https"]:
        return "URL"
    # An ISSN ends in exactly one check character (digit or X); the previous
    # ``[0-9X]+`` also accepted arbitrarily long tails.
    match = re.search(
        r"\A(ISSN|eISSN) (\d{4}-\d{3}[0-9X])\Z",
        url,
    )
    if match is not None:
        return "ISSN"
    return None
195
-
196
-
197
def normalize_orcid(orcid: Optional[str]) -> Optional[str]:
    """Return ``orcid`` as a ``https://orcid.org/<id>`` URL, or ``None`` if invalid."""
    if not isinstance(orcid, str):
        return None
    bare_id = validate_orcid(orcid)
    if bare_id is None:
        return None
    return "https://orcid.org/" + bare_id
205
-
206
-
207
def validate_orcid(orcid: Optional[str]) -> Optional[str]:
    """Validate an ORCID iD and return it in ``0000-0000-0000-000X`` form.

    Accepts the bare iD (groups separated by hyphens or spaces) or an
    ``http(s)://[www.|sandbox.]orcid.org/<id>`` URL. Spaces between groups
    are converted to hyphens. Returns ``None`` for anything else. The
    checksum itself is not verified.
    """
    if not isinstance(orcid, str):
        return None
    # Four groups of four; the last character is a digit or X. The previous
    # pattern ended in ``[0-9X]+`` (accepting overlong iDs) and allowed an
    # empty subdomain such as "https://.orcid.org/...".
    match = re.search(
        r"\A(?:(?:http|https)://(?:(?:www|sandbox)\.)?orcid\.org/)?(\d{4}[ -]\d{4}[ -]\d{4}[ -]\d{3}[0-9X])\Z",
        orcid,
    )
    if match is None:
        return None
    return match.group(1).replace(" ", "-")
219
-
220
-
221
def validate_isni(isni: Optional[str]) -> Optional[str]:
    """Validate an ISNI and return it with spaces removed.

    Accepts the bare 16-character ISNI (groups of four optionally separated
    by a space or hyphen) or an ``http(s)://isni.org/isni/<id>`` URL.
    Returns ``None`` for anything else. Only spaces are removed from the
    result; hyphens, if present, are kept as in the input.
    """
    if not isinstance(isni, str):
        return None
    # The last character is a single digit or X; the previous ``[0-9X]+``
    # accepted identifiers of any length.
    match = re.search(
        r"\A(?:(?:http|https)://isni\.org/isni/)?(\d{4}[ -]?\d{4}[ -]?\d{4}[ -]?\d{3}[0-9X])\Z",
        isni,
    )
    if match is None:
        return None
    return match.group(1).replace(" ", "")
233
-
234
-
235
def normalize_isni(isni: Optional[str]) -> Optional[str]:
    """Return ``isni`` as a ``https://isni.org/isni/<id>`` URL, or ``None`` if invalid."""
    if not isinstance(isni, str):
        return None
    bare_id = validate_isni(isni)
    if bare_id is None:
        return None
    return "https://isni.org/isni/" + bare_id
243
-
244
-
245
def normalize_name_identifier(ni: Optional[str]) -> Optional[str]:
    """Normalize a name identifier given as a dict or a list of entries.

    * ``dict``: formatted with ``format_name_identifier``.
    * ``list``: the first entry that ``format_name_identifier`` can format
      (previously this branch called ``.get`` on the list and always raised
      ``AttributeError``).
    * anything else, including ``None`` and plain strings: ``None``.
    """
    if isinstance(ni, dict):
        return format_name_identifier(ni)
    if isinstance(ni, list):
        formatted = (format_name_identifier(entry) for entry in ni)
        return next((value for value in formatted if value is not None), None)
    # NOTE(review): strings have always yielded None here (bare ``return``)
    # although format_name_identifier can handle them; kept as-is so
    # existing callers see no change — confirm whether that is intended.
    return None
259
-
260
-
261
def format_name_identifier(ni):
    """Turn a name/publisher identifier into a normalized identifier URL.

    A string is tried as ORCID, then ROR, then ISNI. A dict is read via its
    ``nameIdentifier``/``publisherIdentifier``, the matching ``...Scheme``
    key and ``schemeURI``/``schemeUri``. Returns ``None`` when nothing fits.
    """
    if ni is None:
        return None
    if isinstance(ni, str):
        return normalize_orcid(ni) or normalize_ror(ni) or normalize_isni(ni)

    identifier = ni.get("nameIdentifier", None) or ni.get("publisherIdentifier", None)
    scheme = ni.get("nameIdentifierScheme", None) or ni.get(
        "publisherIdentifierScheme", None
    )
    scheme_uri = ni.get("schemeURI", None) or ni.get("schemeUri", None)

    if identifier is None:
        return None
    # a known scheme decides which normalizer applies
    if scheme == "ORCID":
        return normalize_orcid(identifier)
    if scheme == "ISNI":
        return normalize_isni(identifier)
    if scheme == "ROR":
        return normalize_ror(identifier)
    # otherwise accept a plain URL, or prefix the value with its scheme URI
    if validate_url(identifier) == "URL":
        return identifier
    if isinstance(identifier, str) and scheme_uri is not None:
        return scheme_uri + identifier
    return None
287
-
288
-
289
def normalize_issn(string, **kwargs):
    """Normalize an ISSN to the ``xxxx-xxxx`` form.

    Args:
        string: a ``str``, a dict holding the ISSN under the ``content`` key,
            or a list of such dicts, from which the entry with
            ``media_type == "electronic"`` is picked.
        **kwargs: ``content`` names the dict key to read (default ``"#text"``).

    Returns:
        The 9-character ISSN (a hyphen is inserted into 8-character input),
        or ``None`` when no usable ISSN is found. Unsupported input types and
        non-string values now yield ``None`` instead of raising.
    """
    content = kwargs.get("content", "#text")
    issn = None  # stays None for unsupported input types
    if isinstance(string, str):
        issn = string
    elif isinstance(string, dict):
        issn = string.get(content, None)
    elif isinstance(string, list):
        electronic = next(
            (
                i
                for i in string
                if isinstance(i, dict) and i.get("media_type", None) == "electronic"
            ),
            {},
        )
        issn = electronic.get(content, None)
    if not isinstance(issn, str):
        return None
    if len(issn) == 9:
        return issn
    if len(issn) == 8:
        return issn[0:4] + "-" + issn[4:8]
    return None
311
-
312
-
313
def dict_to_spdx(dct: dict) -> dict:
    """Resolve a license dict against the bundled SPDX license list.

    The ``url`` is normalized with ``normalize_cc_url``; a license matches
    when its ``licenseId`` equals ``dct["id"]`` (case-insensitively) or its
    first ``seeAlso`` entry equals the normalized URL.

    Returns:
        ``{"id", "url"}`` of the matching SPDX license, or the compacted
        input (with normalized ``url``) when nothing matches. The caller's
        dict is not modified.
    """
    # work on a copy so the caller's dict is left untouched
    dct = {**dct, "url": normalize_cc_url(dct.get("url", None))}
    file_path = os.path.join(
        os.path.dirname(__file__), "resources", "spdx", "licenses.json"
    )
    with open(file_path, encoding="utf-8") as file:
        string = file.read()
    spdx = json.loads(string).get("licenses")

    # ``or ""`` also covers an explicit ``"id": None``
    license_id = (dct.get("id", None) or "").casefold()
    url = dct.get("url", None)
    license_ = next(
        (
            lic
            for lic in spdx
            if lic["licenseId"].casefold() == license_id
            # slice instead of [0] so an empty seeAlso list cannot raise
            or (url is not None and url in lic["seeAlso"][:1])
        ),
        None,
    )
    if license_ is None:
        return compact(dct)
    return compact(
        {
            "id": license_["licenseId"],
            "url": license_["seeAlso"][0],
        }
    )
342
-
343
- # else
344
- # {
345
- # 'rights': hsh['#text'] || hsh['rights'],
346
- # 'rightsUri': hsh['rightsUri'] || hsh['rightsUri'],
347
- # 'rightsIdentifier': hsh['rightsIdentifier'].present? ? hsh['rightsIdentifier'].downcase : None,
348
- # 'rightsIdentifierScheme': hsh['rightsIdentifierScheme'],
349
- # 'schemeUri': hsh['schemeUri'],
350
- # 'lang': hsh['lang']
351
- # }.compact
352
- # end
353
- # end
354
-
355
-
356
def from_json_feed(elements: list) -> list:
    """Convert JSON Feed elements: rename each dict's ``url`` key to ``id``.

    Dicts are modified in place; non-dict entries become ``None``.
    """
    renames = (("url", "id"),)

    def format_element(element):
        """Rename the keys of a single element."""
        if not isinstance(element, dict):
            return None
        for old_key, new_key in renames:
            if element.get(old_key) is not None:
                element[new_key] = element.pop(old_key)
        return element

    return list(map(format_element, elements))
370
-
371
-
372
def from_inveniordm(elements: list) -> list:
    """Convert InvenioRDM elements: rename each dict's ``orcid`` key to ``ORCID``.

    Dicts are modified in place; non-dict entries become ``None``.
    """
    renames = (("orcid", "ORCID"),)

    def format_element(element):
        """Rename the keys of a single element."""
        if not isinstance(element, dict):
            return None
        for old_key, new_key in renames:
            if element.get(old_key) is not None:
                element[new_key] = element.pop(old_key)
        return element

    return list(map(format_element, elements))
386
-
387
-
388
def from_crossref_xml(elements: list) -> list:
    """Convert Crossref XML contributor elements.

    An element with a ``name`` is typed ``"Organization"``; otherwise it is
    typed ``"Person"`` and ``given_name``/``surname`` are copied to
    ``givenName``/``familyName``. ``contributor_role`` (default ``"author"``)
    becomes the capitalized ``contributorType``, and ``ORCID`` is normalized.
    The source keys ``given_name``, ``surname``, ``sequence`` and
    ``contributor_role`` are dropped from the result.
    """

    def format_element(element):
        """Format a single contributor element."""
        if element.get("name", None) is not None:
            element["type"] = "Organization"
        else:
            element["type"] = "Person"
            element["givenName"] = element.get("given_name", None)
            element["familyName"] = element.get("surname", None)
        # ``or "author"`` also covers an explicit ``contributor_role: None``,
        # which previously raised AttributeError on ``.capitalize()``
        role = element.get("contributor_role", None) or "author"
        element["contributorType"] = role.capitalize()
        if element.get("ORCID", None) is not None:
            orcid = parse_attributes(element.get("ORCID"))
            element["ORCID"] = normalize_orcid(orcid)
        element = py_.omit(
            element, "given_name", "surname", "sequence", "contributor_role"
        )
        return compact(element)

    return [format_element(i) for i in elements]
416
-
417
-
418
def from_kbase(elements: list) -> list:
    """Convert KBase contributor elements.

    ``contributor_id`` is converted with ``from_curie`` and stored as
    ``ORCID``; every entry of ``contributor_roles`` is mapped from its
    ``<vocabulary>:<term>`` form. Non-dict entries become ``None``.
    """

    def map_contributor_role(role):
        """Map one ``vocabulary:term`` role string to its output form."""
        parts = role.split(":")
        if len(parts) < 2:
            # no vocabulary prefix: keep the role unchanged (this used to
            # raise IndexError)
            return role
        vocabulary, term = parts[0], parts[1]
        if vocabulary == "CRediT":
            return py_.pascal_case(term)
        if vocabulary == "DataCite":
            return DATACITE_CONTRIBUTOR_TYPES.get(term, "Other")
        return term

    def format_element(element):
        """Format a single contributor element."""
        if not isinstance(element, dict):
            return None
        if element.get("contributor_id", None) is not None:
            element["ORCID"] = from_curie(element["contributor_id"])
        element["contributor_roles"] = [
            map_contributor_role(i)
            for i in wrap(element.get("contributor_roles", None))
        ]
        element = py_.omit(element, "contributor_id")
        return compact(element)

    return [format_element(i) for i in elements]
443
-
444
-
445
def from_csl(elements: list) -> list:
    """Convert CSL name elements.

    An element with ``literal`` or ``name`` is typed ``"Organization"``;
    otherwise it is typed ``"Person"`` and ``name`` is built from ``given``
    and ``family``. ``given``/``family`` are copied to
    ``givenName``/``familyName``; the CSL keys ``given``, ``family``,
    ``literal`` and ``sequence`` are dropped from the result.
    """

    def format_element(element):
        """Format a single CSL name element."""
        if element.get("literal", None) is not None:
            element["type"] = "Organization"
            element["name"] = element["literal"]
        elif element.get("name", None) is not None:
            element["type"] = "Organization"
        else:
            element["type"] = "Person"
            # join only the parts that exist: avoids " Family" / "Given "
            # and the TypeError a ``None`` part used to cause
            name_parts = [
                part
                for part in (element.get("given", None), element.get("family", None))
                if part
            ]
            element["name"] = " ".join(name_parts) or None
        element["givenName"] = element.get("given", None)
        element["familyName"] = element.get("family", None)
        element["affiliation"] = element.get("affiliation", None)
        element = py_.omit(element, "given", "family", "literal", "sequence")
        return compact(element)

    return [format_element(i) for i in elements]
468
-
469
-
470
def to_csl(elements: list) -> list:
    """Convert contributor elements to CSL-JSON name dicts.

    ``familyName``/``givenName`` map to ``family``/``given``; ``name`` is
    used as ``literal`` only when there is no ``familyName``.
    """

    def format_element(i):
        """Build the CSL name dict for one element."""
        family = i.get("familyName", None)
        literal = None if family is not None else i.get("name", None)
        return compact(
            {
                "family": family,
                "given": i.get("givenName", None),
                "literal": literal,
            }
        )

    return list(map(format_element, elements))
484
-
485
-
486
def to_ris(elements: Optional[list]) -> list:
    """Convert contributor elements to RIS author strings.

    Elements with neither ``name`` nor ``familyName`` are skipped. A person
    with both name parts is rendered as ``"Family, Given"``; otherwise
    ``name`` is used, falling back to ``familyName`` alone (this case used
    to put ``None`` into the result). ``None`` input yields ``[]``.
    """
    if elements is None:
        return []

    def format_element(i):
        """Render a single element as a RIS author string."""
        family = i.get("familyName", None)
        given = i.get("givenName", None)
        if family and given:
            return ", ".join([family, given])
        return i.get("name", None) or family

    return [
        format_element(i)
        for i in elements
        if i.get("name", None) or i.get("familyName", None)
    ]
504
-
505
-
506
def to_schema_org(element: Optional[dict]) -> Optional[dict]:
    """Rename ``type``/``id``/``title`` to ``@type``/``@id``/``name``.

    The dict is modified in place and returned; keys whose value is ``None``
    are left alone. Non-dict input yields ``None``.
    """
    if not isinstance(element, dict):
        return None
    for source_key, target_key in (("type", "@type"), ("id", "@id"), ("title", "name")):
        if element.get(source_key) is not None:
            element[target_key] = element.pop(source_key)
    return element
515
-
516
-
517
def to_schema_org_creators(elements: list) -> list:
    """Convert creator elements to Schema.org ``Person``/``Organization`` dicts.

    An element with a ``familyName`` and no ``name`` becomes a ``Person``
    whose ``name`` is built from ``givenName`` and ``familyName``; every
    other element becomes an ``Organization``. The keys ``type`` and
    ``contributorRoles`` are dropped from the result.
    """

    def format_element(element):
        """Format a single creator element."""
        # NOTE(review): an element that already has both ``name`` and
        # ``familyName`` is typed "Organization" — kept from the original,
        # confirm this is intended.
        is_person = bool(element.get("familyName", None)) and (
            element.get("name", None) is None
        )
        # ``@type`` is assigned before ``name`` to keep the original key
        # order (the original stored a throwaway ``@type`` value first)
        element["@type"] = "Person" if is_person else "Organization"
        if is_person:
            # skip a missing givenName instead of raising TypeError in join
            parts = (element.get("givenName", None), element.get("familyName"))
            element["name"] = " ".join(part for part in parts if part)
        element = py_.omit(element, "type", "contributorRoles")
        return compact(element)

    return [format_element(i) for i in elements]
534
-
535
-
536
def to_schema_org_container(element: Optional[dict], **kwargs) -> Optional[dict]:
    """Convert a container dict to a Schema.org container.

    Args:
        element: container dict read via its ``identifier`` and ``title``.
        **kwargs: ``container_title`` is the fallback name; ``type`` equal to
            ``"DataRepository"`` selects ``DataCatalog`` instead of
            ``Periodical``.

    Returns:
        The compacted ``{"@id", "@type", "name"}`` dict, or ``None`` when
        ``element`` is not a dict and no ``container_title`` is available.
    """
    container_title = kwargs.get("container_title", None)
    if element is None:
        if container_title is None:
            return None
        # The title fallback was unreachable before: a ``None`` element was
        # always rejected by the dict check below.
        element = {}
    if not isinstance(element, dict):
        return None

    return compact(
        {
            "@id": element.get("identifier", None),
            "@type": "DataCatalog"
            if kwargs.get("type", None) == "DataRepository"
            else "Periodical",
            "name": element.get("title", None) or container_title,
        }
    )
552
-
553
-
554
def to_schema_org_identifiers(elements: list) -> list:
    """Convert identifier dicts to Schema.org ``PropertyValue`` dicts.

    ``identifierType`` becomes ``propertyID`` and ``identifier`` becomes
    ``value``.
    """
    return [
        compact(
            {
                "@type": "PropertyValue",
                "propertyID": item.get("identifierType", None),
                "value": item.get("identifier", None),
            }
        )
        for item in elements
    ]
566
-
567
-
568
def to_schema_org_relations(related_items: list, relation_type=None):
    """Convert related items of one relation type to Schema.org relations.

    ``relation_type == "References"`` also selects ``"Cites"`` items. An item
    identified by ISSN with relation ``IsPartOf`` becomes a ``Periodical``;
    every other item becomes ``{"@id": <normalized relatedIdentifier>}``.
    Items without a ``relationType`` key are skipped instead of raising.
    """

    def format_element(i):
        """Format a single related item."""
        # .get(): items carrying ``relatedIdentifier`` do not necessarily
        # have the ``relatedItem*`` keys, which used to raise KeyError
        if (
            i.get("relatedItemIdentifierType", None) == "ISSN"
            and i.get("relationType", None) == "IsPartOf"
        ):
            return compact(
                {"@type": "Periodical", "issn": i.get("relatedItemIdentifier", None)}
            )
        return compact({"@id": normalize_id(i.get("relatedIdentifier", None))})

    # consolidate different relation types
    if relation_type == "References":
        relation_types = ["References", "Cites"]
    else:
        relation_types = [relation_type]

    selected = [
        ri
        for ri in wrap(related_items)
        if "relationType" in ri and ri["relationType"] in relation_types
    ]
    return [format_element(i) for i in selected]
587
-
588
-
589
def find_from_format(pid=None, string=None, ext=None, dct=None, filename=None):
    """Pick the reader name from whichever hint is given.

    Precedence: ``pid``; ``string`` together with ``ext`` (decided by the
    extension); ``dct``; ``string`` alone; ``filename``. With no hint at all
    the result is ``"datacite"``.
    """
    has_string = string is not None
    if pid is not None:
        reader = find_from_format_by_id(pid)
    elif has_string and ext is not None:
        reader = find_from_format_by_ext(ext)
    elif dct is not None:
        reader = find_from_format_by_dict(dct)
    elif has_string:
        reader = find_from_format_by_string(string)
    elif filename is not None:
        reader = find_from_format_by_filename(filename)
    else:
        reader = "datacite"
    return reader
602
-
603
-
604
def find_from_format_by_id(pid: str) -> Optional[str]:
    """Pick the reader name from an identifier.

    A DOI with a registration agency known to ``get_doi_ra`` yields the
    lowercased agency name. Otherwise the first matching URL pattern decides
    (GitHub, Rogue Scholar API, Zenodo API); any other id is ``"schema_org"``.
    """
    doi = validate_doi(pid)
    if doi and (registration_agency := get_doi_ra(doi)) is not None:
        return registration_agency.lower()

    # Checked in order; the generic GitHub pattern must come after the two
    # file-specific ones. Dots in the file names are escaped so they only
    # match a literal ".".
    url_patterns = (
        (r"\A(http|https):/(/)?github\.com/(.+)/CITATION\.cff\Z", "cff"),
        (r"\A(http|https):/(/)?github\.com/(.+)/codemeta\.json\Z", "codemeta"),
        (r"\A(http|https):/(/)?github\.com/(.+)\Z", "cff"),
        (r"\Ahttps:/(/)?api\.rogue-scholar\.org/posts/(.+)\Z", "json_feed_item"),
        (r"\Ahttps:/(/)?zenodo\.org/api/records/(.+)\Z", "inveniordm"),
    )
    for pattern, reader in url_patterns:
        if re.match(pattern, pid) is not None:
            return reader
    return "schema_org"
626
-
627
-
628
def find_from_format_by_ext(ext: str) -> Optional[str]:
    """Pick the reader name from a file extension (``.bib`` or ``.ris``)."""
    for suffix, reader in ((".bib", "bibtex"), (".ris", "ris")):
        if ext == suffix:
            return reader
    return None
635
-
636
-
637
def find_from_format_by_dict(dct: dict) -> Optional[str]:
    """Pick the reader name from characteristic keys of a metadata dict.

    Checks, in order: commonmeta ``schema_version``, schema.org and codemeta
    ``@context``, JSON Feed ``guid``, DataCite ``schemaVersion``, Crossref
    ``source``, CSL ``issued.date-parts``, InvenioRDM ``conceptdoi`` and
    KBase ``credit_metadata``. Returns ``None`` when nothing matches or
    ``dct`` is not a dict.
    """
    if not isinstance(dct, dict):
        return None

    def starts_with(key, prefix):
        # tolerate a key that is present with a None / non-string value
        value = dct.get(key, None)
        return isinstance(value, str) and value.startswith(prefix)

    if starts_with("schema_version", "https://commonmeta.org"):
        return "commonmeta"
    if dct.get("@context", None) == "http://schema.org":
        return "schema_org"
    if dct.get("@context", None) in [
        "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld"
    ]:
        return "codemeta"
    if dct.get("guid", None) is not None:
        return "json_feed_item"
    if starts_with("schemaVersion", "http://datacite.org/schema/kernel"):
        return "datacite"
    if dct.get("source", None) == "Crossref":
        return "crossref"
    if py_.get(dct, "issued.date-parts") is not None:
        return "csl"
    if py_.get(dct, "conceptdoi") is not None:
        return "inveniordm"
    if py_.get(dct, "credit_metadata") is not None:
        return "kbase"
    return None
662
-
663
-
664
def find_from_format_by_string(string: str) -> Optional[str]:
    """Pick the reader name by sniffing the content of ``string``.

    The string is tried as JSON, then XML, then HTML, then YAML, and finally
    checked for an RIS or BibTeX prefix. Returns ``None`` when no format is
    recognized.
    """
    if string is None:
        return None

    def starts_with(mapping, key, prefix):
        # tolerate a key that is present with a None / non-string value
        value = mapping.get(key, None)
        return isinstance(value, str) and value.startswith(prefix)

    try:
        data = json.loads(string)
        if not isinstance(data, dict):
            raise TypeError
        # "schema_version" is the key find_from_format_by_dict checks;
        # "schema" is kept for backward compatibility
        if starts_with(data, "schema_version", "https://commonmeta.org") or starts_with(
            data, "schema", "https://commonmeta.org"
        ):
            return "commonmeta"
        if data.get("items", None) is not None:
            items = data["items"]
            # An empty or malformed "items" is treated like any other
            # non-matching JSON instead of raising an uncaught
            # IndexError/KeyError.
            if not isinstance(items, list) or not items or not isinstance(items[0], dict):
                raise TypeError
            data = items[0]
        if data.get("@context", None) == "http://schema.org":
            return "schema_org"
        if data.get("@context", None) in [
            "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld"
        ]:
            return "codemeta"
        if data.get("guid", None) is not None:
            return "json_feed_item"
        if starts_with(data, "schemaVersion", "http://datacite.org/schema/kernel"):
            return "datacite"
        if data.get("source", None) == "Crossref":
            return "crossref"
        if py_.get(data, "issued.date-parts") is not None:
            return "csl"
        if py_.get(data, "conceptdoi") is not None:
            return "inveniordm"
        if py_.get(data, "credit_metadata") is not None:
            return "kbase"
    except (TypeError, json.JSONDecodeError):
        pass
    try:
        data = BeautifulSoup(string, "xml")
        if data.find("doi_record"):
            return "crossref_xml"
        if data.find("resource"):
            return "datacite_xml"
    except ValueError:
        pass
    try:
        data = BeautifulSoup(string, "html.parser")
        if (
            data.find("script", type="application/ld+json")
            or data.find("meta", {"name": "citation_doi"})
            or data.find("meta", {"name": "dc.identifier"})
        ):
            return "schema_org"
    except ValueError:
        pass
    try:
        data = yaml.safe_load(string)
        # AttributeError covers YAML that loads to a non-mapping value
        if data.get("cff-version", None):
            return "cff"
    except (yaml.YAMLError, AttributeError):
        pass

    # NOTE(review): RIS records normally start with "TY  - " (two spaces);
    # confirm the literal below against the real file, whitespace may have
    # been collapsed where this source was rendered.
    if string.startswith("TY - "):
        return "ris"
    if any(string.startswith(f"@{t}") for t in bibtexparser.bibdatabase.STANDARD_TYPES):
        return "bibtex"

    # no format found
    return None
730
-
731
-
732
def find_from_format_by_filename(filename):
    """Pick the reader name from a file name; only ``CITATION.cff`` is known."""
    return "cff" if filename == "CITATION.cff" else None
737
-
738
-
739
def from_schema_org(element):
    """Copy ``@type``/``@id`` of a schema.org element to ``type``/``id``.

    The input dict receives the two new keys in place; the returned dict is
    the compacted copy without ``@type`` and ``@id``. ``None`` yields ``None``.
    """
    if element is None:
        return None
    element.update(
        {
            "type": element.get("@type", None),
            "id": element.get("@id", None),
        }
    )
    without_jsonld_keys = py_.omit(element, ["@type", "@id"])
    return compact(without_jsonld_keys)
746
-
747
-
748
def from_schema_org_creators(elements: list) -> list:
    """Convert schema.org creators to commonmeta

    Each creator is either a plain string (returned as ``{"name": ...}``) or
    a schema.org dict. For dicts the ``@id``/``@type``, the given and family
    name, and the affiliation are mapped; when no family name can be
    determined a ``creatorName`` entry is emitted instead. The input dicts
    are not modified.
    """
    # academic titles that may trail a name after ", " and are stripped
    academic_titles = frozenset(
        {
            "MD",
            "PhD",
            "DVM",
            "DDS",
            "DMD",
            "JD",
            "MBA",
            "MPH",
            "MS",
            "MA",
            "MFA",
            "MSc",
            "MEd",
            "MEng",
            "MPhil",
            "MRes",
            "LLM",
            "LLB",
            "BSc",
            "BA",
            "BFA",
            "BEd",
            "BEng",
            "BPhil",
        }
    )

    def format_element(i):
        """format element"""
        if isinstance(i, str):
            return {"name": i}

        element = {}
        # a missing @id is treated as an empty URL, which has no hostname
        if urlparse(i.get("@id", None) or "").hostname == "orcid.org":
            element["id"] = i.get("@id")
            element["type"] = "Person"
        elif isinstance(i.get("@type", None), str):
            element["type"] = i.get("@type")
        elif isinstance(i.get("@type", None), list):
            element["type"] = py_.find(
                i["@type"], lambda x: x in ["Person", "Organization"]
            )

        # work on local copies so the caller's dict is left untouched;
        # a creator without "name" is handled like an empty name
        raw_name = i.get("name", None)
        name = "" if raw_name is None else str(raw_name)

        # strip text after comma if suffix is an academic title
        head, separator, suffix = name.partition(", ")
        if separator and suffix in academic_titles:
            name = head
            raw_name = head

        words = name.split(" ")
        if i.get("givenName", None):
            element["givenName"] = i.get("givenName", None)
        if i.get("familyName", None):
            element["familyName"] = i.get("familyName", None)
            element["type"] = "Person"
        # parentheses around the last word indicate an organization
        elif len(words) > 1 and not words[-1].startswith("("):
            element["givenName"] = " ".join(words[:-1])
            # the last word as a string (previously a one-element list)
            element["familyName"] = words[-1]
        if not element.get("familyName", None):
            element["creatorName"] = compact(
                {
                    "type": i.get("@type", None),
                    "#text": raw_name,
                }
            )

        if isinstance(i.get("affiliation", None), str):
            element["affiliation"] = {"type": "Organization", "name": i["affiliation"]}
        elif urlparse(py_.get(i, "affiliation.@id", "")).hostname in [
            "ror.org",
            "isni.org",
        ]:
            element["affiliation"] = {
                "id": i["affiliation"]["@id"],
                "type": "Organization",
                "name": i["affiliation"]["name"],
            }
        return compact(element)

    return [format_element(i) for i in wrap(elements)]
828
-
829
-
830
def github_from_url(url: str) -> dict:
    """Split a GitHub URL into owner, repo, release and path.

    Returns an empty dict when ``url`` is not an https URL on github.com or
    raw.githubusercontent.com. Otherwise the URL path is split on "/" and
    the segments at positions 0, 1 and 3 become owner, repo and release;
    everything from position 4 onwards is joined into path.
    """
    github_pattern = r"\Ahttps://(github|raw\.githubusercontent)\.com/(.+)(?:/)?(.+)?(?:/tree/)?(.*)\Z"
    if re.match(github_pattern, url) is None:
        return {}

    segments = urlparse(url).path.lstrip("/").split("/")

    def segment_at(index):
        """Return the path segment at index, or None when absent."""
        return segments[index] if len(segments) > index else None

    # an empty remainder means there is no path component at all
    remainder = "/".join(segments[4:]) or None

    return compact(
        {
            "owner": segment_at(0),
            "repo": segment_at(1),
            "release": segment_at(3),
            "path": remainder,
        }
    )
848
-
849
-
850
def github_repo_from_url(url: str) -> Optional[str]:
    """Return the repo part of a GitHub url, or None if there is none"""
    parts = github_from_url(url)
    return parts.get("repo")
853
-
854
-
855
def github_release_from_url(url: str) -> Optional[str]:
    """Return the release part of a GitHub url, or None if there is none"""
    parts = github_from_url(url)
    return parts.get("release")
858
-
859
-
860
def github_owner_from_url(url: str) -> Optional[str]:
    """Return the owner part of a GitHub url, or None if there is none"""
    parts = github_from_url(url)
    return parts.get("owner")
863
-
864
-
865
def github_as_owner_url(url: str) -> Optional[str]:
    """Build the https://github.com/<owner> url for a GitHub url.

    Returns None when no owner can be extracted from ``url``.
    """
    owner = github_from_url(url).get("owner")
    return None if owner is None else f"https://github.com/{owner}"
871
-
872
-
873
def github_as_repo_url(url) -> Optional[str]:
    """Build the https://github.com/<owner>/<repo> url for a GitHub url.

    Returns None when no repo can be extracted from ``url``.
    """
    parts = github_from_url(url)
    repo = parts.get("repo")
    if repo is None:
        return None
    owner = parts.get("owner")
    return f"https://github.com/{owner}/{repo}"
879
-
880
-
881
def github_as_release_url(url: str) -> Optional[str]:
    """Build the .../<owner>/<repo>/tree/<release> url for a GitHub url.

    Returns None when no release can be extracted from ``url``.
    """
    parts = github_from_url(url)
    release = parts.get("release")
    if release is None:
        return None
    owner = parts.get("owner")
    repo = parts.get("repo")
    return f"https://github.com/{owner}/{repo}/tree/{release}"
887
-
888
-
889
def github_as_codemeta_url(url: str) -> Optional[str]:
    """Build the raw.githubusercontent.com url of a codemeta.json file.

    When ``url`` already points at a path ending in ``codemeta.json`` the
    raw url for that release and path is returned. Otherwise, if an owner
    is present, the url of ``codemeta.json`` on the ``master`` branch is
    returned. Without an owner the result is None.
    """
    parts = github_from_url(url)
    owner = parts.get("owner")
    repo = parts.get("repo")
    path = parts.get("path")

    if path and path.endswith("codemeta.json"):
        release = parts.get("release")
        return f"https://raw.githubusercontent.com/{owner}/{repo}/{release}/{path}"
    if owner:
        return f"https://raw.githubusercontent.com/{owner}/{repo}/master/codemeta.json"
    return None
901
-
902
-
903
def github_as_cff_url(url: str) -> Optional[str]:
    """Build the raw.githubusercontent.com url of a CITATION.cff file.

    When ``url`` already points at a path ending in ``CITATION.cff`` the
    raw url for that release and path is returned. Otherwise, if an owner
    is present, the url of ``CITATION.cff`` on the ``main`` branch is
    returned. Without an owner the result is None.
    """
    parts = github_from_url(url)
    owner = parts.get("owner")
    repo = parts.get("repo")
    path = parts.get("path")

    if path and path.endswith("CITATION.cff"):
        release = parts.get("release")
        return f"https://raw.githubusercontent.com/{owner}/{repo}/{release}/{path}"
    if not owner:
        return None
    return f"https://raw.githubusercontent.com/{owner}/{repo}/main/CITATION.cff"
914
-
915
-
916
def pages_as_string(
    container: Optional[dict], page_range_separator="-"
) -> Optional[str]:
    """Parse pages for BibTeX

    Returns None when ``container`` is None or has no ``firstPage``, the
    first page alone when there is no ``lastPage``, and otherwise both pages
    joined by ``page_range_separator``. Page values are converted with
    ``str`` so that numeric pages neither break the join nor leak out as
    non-string results.
    """
    if container is None:
        return None

    first_page = container.get("firstPage", None)
    if first_page is None:
        return None

    last_page = container.get("lastPage", None)
    if last_page is None:
        return str(first_page)

    return page_range_separator.join([str(first_page), str(last_page)])
930
-
931
-
932
def subjects_as_string(subjects):
    """convert subject list to string, e.g. for bibtex

    Returns None when ``subjects`` is None. Otherwise the ``subject`` value
    of every entry is collected and joined with ", ". Entries without a
    ``subject`` value are skipped, because joining None would raise
    TypeError.
    """
    if subjects is None:
        return None

    keywords = (subject.get("subject", None) for subject in wrap(subjects))
    return ", ".join(keyword for keyword in keywords if keyword is not None)
941
-
942
-
943
- # def reverse():
944
- # return { 'citation': wrap(related_identifiers).select do |ri|
945
- # ri['relationType'] == 'IsReferencedBy'
946
- # end.map do |r|
947
- # { '@id': normalize_doi(r['relatedIdentifier']),
948
- # '@type': r['resourceTypeGeneral'] validate_orcid 'ScholarlyArticle',
949
- # 'identifier': r['relatedIdentifierType'] == 'DOI' ? nil : to_identifier(r) }.compact
950
- # end.unwrap,
951
- # 'isBasedOn': wrap(related_identifiers).select do |ri|
952
- # ri['relationType'] == 'IsSupplementTo'
953
- # end.map do |r|
954
- # { '@id': normalize_doi(r['relatedIdentifier']),
955
- # '@type': r['resourceTypeGeneral'] or 'ScholarlyArticle',
956
- # 'identifier': r['relatedIdentifierType'] == 'DOI' ? nil : to_identifier(r) }.compact
957
- # end.unwrap }.compact
958
-
959
-
960
def name_to_fos(name: str) -> Optional[dict]:
    """Convert name to Fields of Science (OECD) subject

    At present no lookup is performed: the result is always a dict holding
    the name, stripped of surrounding whitespace, under the ``subject`` key.
    """
    # TODO: port the original Ruby lookup. It first searched the OECD
    # Fields of Science mappings (fos-mappings.json) for a matching
    # fosLabel, then fell back to the Fields of Research mappings
    # (for-mappings.json), and on a match returned the lowercased name plus
    # a "FOS: <label>" subject with subjectScheme
    # "Fields of Science and Technology (FOS)" and schemeUri
    # http://www.oecd.org/science/inno/38235147.pdf
    subject = name.strip()
    return {"subject": subject}
1001
-
1002
-
1003
def encode_doi(prefix):
    """Build a https://doi.org/ url from a DOI prefix and a generated suffix.

    The suffix comes from ``base32.generate`` with length 10, split every 5
    characters, and a checksum.
    """
    generated_suffix = base32.generate(length=10, split_every=5, checksum=True)
    return f"https://doi.org/{prefix}/{generated_suffix}"
1007
-
1008
-
1009
def decode_doi(doi: str) -> int:
    """Decode the base32 suffix of a DOI.

    The suffix is whatever follows the last of at most five "/" splits of
    ``doi``; it is handed to ``base32.decode``.
    """
    *_, encoded_suffix = doi.split("/", maxsplit=5)
    return base32.decode(encoded_suffix)
1013
-
1014
-
1015
def from_curie(id: Optional[str]) -> Optional[str]:
    """from CURIE

    Convert a ``PREFIX:value`` compact identifier into its expanded form.
    Supported prefixes are DOI, ROR, ISNI, ORCID, URL and JDP. Returns None
    for None input, for a string without a ":" separator, and for any other
    prefix.
    """
    if id is None:
        return None

    # split on the first colon only: the value may itself contain colons,
    # e.g. "URL:https://example.org" or a DOI suffix such as "43:4<284::AID"
    _type, separator, value = id.partition(":")
    if not separator:
        # no value part at all, so this is not a CURIE
        return None

    if _type == "DOI":
        return doi_as_url(value)
    elif _type == "ROR":
        return "https://ror.org/" + value
    elif _type == "ISNI":
        return "https://isni.org/isni/" + value
    elif _type == "ORCID":
        return normalize_orcid(value)
    elif _type == "URL":
        return normalize_url(value)
    elif _type == "JDP":
        return value
    # TODO: resolvable url for other identifier types
    return None
1034
-
1035
-
1036
def issn_as_url(issn: str) -> Optional[str]:
    """Return the portal.issn.org resource url for an ISSN, or None for None"""
    return None if issn is None else f"https://portal.issn.org/resource/ISSN/{issn}"
1041
-
1042
-
1043
def get_language(lang: str) -> Optional[dict]:
    """Look up a language via ``pycountry.languages``.

    The lookup key depends on the length of ``lang``: two characters are
    matched as ``alpha_2``, three as ``alpha_3``, and anything else as
    ``name``. An empty or None ``lang`` yields None.
    """
    if not lang:
        return None

    code_length = len(lang)
    if code_length == 2:
        lookup_field = "alpha_2"
    elif code_length == 3:
        lookup_field = "alpha_3"
    else:
        lookup_field = "name"
    return pycountry.languages.get(**{lookup_field: lang})
1056
-
1057
-
1058
def start_case(content: str) -> str:
    """Capitalize first letter of each word without lowercasing the rest

    Words are separated by single spaces. Empty words, which arise from an
    empty string or from consecutive spaces, are kept unchanged so the
    spacing of the input is preserved.
    """
    # word[:1] is "" for an empty word, where word[0] would raise IndexError
    return " ".join(word[:1].upper() + word[1:] for word in content.split(" "))
1063
-
1064
-
1065
def timer_func(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function is called with the original arguments and its
    return value is passed through unchanged. After every call a message
    with the function name and the elapsed seconds is printed.
    """
    import functools

    # functools.wraps keeps the __name__ and __doc__ of the wrapped function
    @functools.wraps(func)
    def function_timer(*args, **kwargs):
        # perf_counter is monotonic, so system clock changes cannot skew it
        start = time.perf_counter()
        value = func(*args, **kwargs)
        runtime = time.perf_counter() - start
        msg = "{func} took {time} seconds to complete its execution."
        print(msg.format(func=func.__name__, time=runtime))
        return value

    return function_timer