commonmeta-py 0.23__py3-none-any.whl → 0.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- commonmeta/__init__.py +96 -0
- commonmeta/api_utils.py +77 -0
- commonmeta/author_utils.py +260 -0
- commonmeta/base_utils.py +121 -0
- commonmeta/cli.py +200 -0
- commonmeta/constants.py +587 -0
- commonmeta/crossref_utils.py +575 -0
- commonmeta/date_utils.py +193 -0
- commonmeta/doi_utils.py +273 -0
- commonmeta/metadata.py +320 -0
- commonmeta/readers/__init__.py +1 -0
- commonmeta/readers/cff_reader.py +199 -0
- commonmeta/readers/codemeta_reader.py +112 -0
- commonmeta/readers/commonmeta_reader.py +13 -0
- commonmeta/readers/crossref_reader.py +409 -0
- commonmeta/readers/crossref_xml_reader.py +505 -0
- commonmeta/readers/csl_reader.py +98 -0
- commonmeta/readers/datacite_reader.py +390 -0
- commonmeta/readers/datacite_xml_reader.py +359 -0
- commonmeta/readers/inveniordm_reader.py +218 -0
- commonmeta/readers/json_feed_reader.py +420 -0
- commonmeta/readers/kbase_reader.py +205 -0
- commonmeta/readers/ris_reader.py +103 -0
- commonmeta/readers/schema_org_reader.py +506 -0
- commonmeta/resources/cff_v1.2.0.json +1827 -0
- commonmeta/resources/commonmeta_v0.12.json +601 -0
- commonmeta/resources/commonmeta_v0.13.json +559 -0
- commonmeta/resources/commonmeta_v0.14.json +573 -0
- commonmeta/resources/crossref/AccessIndicators.xsd +47 -0
- commonmeta/resources/crossref/JATS-journalpublishing1-3d2-mathml3-elements.xsd +10130 -0
- commonmeta/resources/crossref/JATS-journalpublishing1-3d2-mathml3.xsd +48 -0
- commonmeta/resources/crossref/JATS-journalpublishing1-elements.xsd +8705 -0
- commonmeta/resources/crossref/JATS-journalpublishing1-mathml3-elements.xsd +8608 -0
- commonmeta/resources/crossref/JATS-journalpublishing1-mathml3.xsd +49 -0
- commonmeta/resources/crossref/JATS-journalpublishing1.xsd +6176 -0
- commonmeta/resources/crossref/clinicaltrials.xsd +61 -0
- commonmeta/resources/crossref/common5.3.1.xsd +1538 -0
- commonmeta/resources/crossref/crossref5.3.1.xsd +1949 -0
- commonmeta/resources/crossref/crossref_query_output3.0.xsd +1097 -0
- commonmeta/resources/crossref/fundref.xsd +49 -0
- commonmeta/resources/crossref/module-ali.xsd +39 -0
- commonmeta/resources/crossref/relations.xsd +444 -0
- commonmeta/resources/crossref-v0.2.json +60 -0
- commonmeta/resources/csl-data.json +538 -0
- commonmeta/resources/datacite-v4.5.json +829 -0
- commonmeta/resources/datacite-v4.5pr.json +608 -0
- commonmeta/resources/ietf-bcp-47.json +3025 -0
- commonmeta/resources/iso-8601.json +3182 -0
- commonmeta/resources/spdx/licenses.json +4851 -0
- commonmeta/resources/spdx-schema..json +903 -0
- commonmeta/resources/styles/apa.csl +1697 -0
- commonmeta/resources/styles/chicago-author-date.csl +684 -0
- commonmeta/resources/styles/harvard-cite-them-right.csl +321 -0
- commonmeta/resources/styles/ieee.csl +468 -0
- commonmeta/resources/styles/modern-language-association.csl +341 -0
- commonmeta/resources/styles/vancouver.csl +376 -0
- commonmeta/schema_utils.py +27 -0
- commonmeta/translators.py +47 -0
- commonmeta/utils.py +1108 -0
- commonmeta/writers/__init__.py +1 -0
- commonmeta/writers/bibtex_writer.py +149 -0
- commonmeta/writers/citation_writer.py +70 -0
- commonmeta/writers/commonmeta_writer.py +68 -0
- commonmeta/writers/crossref_xml_writer.py +17 -0
- commonmeta/writers/csl_writer.py +79 -0
- commonmeta/writers/datacite_writer.py +193 -0
- commonmeta/writers/inveniordm_writer.py +94 -0
- commonmeta/writers/ris_writer.py +58 -0
- commonmeta/writers/schema_org_writer.py +146 -0
- {commonmeta_py-0.23.dist-info → commonmeta_py-0.25.dist-info}/METADATA +56 -45
- commonmeta_py-0.25.dist-info/RECORD +75 -0
- {commonmeta_py-0.23.dist-info → commonmeta_py-0.25.dist-info}/WHEEL +1 -1
- commonmeta_py-0.25.dist-info/entry_points.txt +3 -0
- commonmeta_py-0.23.dist-info/RECORD +0 -5
- /commonmeta_py/__init__.py → /commonmeta/readers/bibtex_reader.py +0 -0
- {commonmeta_py-0.23.dist-info/licenses → commonmeta_py-0.25.dist-info}/LICENSE +0 -0
commonmeta/__init__.py
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
# commonmeta-py
|
4
|
+
|
5
|
+
"""
|
6
|
+
commonmeta-py library
|
7
|
+
~~~~~~~~~~~~~~~~~~~~~
|
8
|
+
|
9
|
+
commonmeta-py is a Python library to convert scholarly metadata
|
10
|
+
"""
|
11
|
+
|
12
|
+
__title__ = "commonmeta-py"
|
13
|
+
__version__ = "0.24"
|
14
|
+
__author__ = "Martin Fenner"
|
15
|
+
__license__ = "MIT"
|
16
|
+
|
17
|
+
# ruff: noqa: F401
|
18
|
+
from .metadata import Metadata, MetadataList
|
19
|
+
from .readers import (
|
20
|
+
cff_reader,
|
21
|
+
codemeta_reader,
|
22
|
+
crossref_reader,
|
23
|
+
crossref_xml_reader,
|
24
|
+
datacite_reader,
|
25
|
+
datacite_xml_reader,
|
26
|
+
inveniordm_reader,
|
27
|
+
json_feed_reader,
|
28
|
+
kbase_reader,
|
29
|
+
ris_reader,
|
30
|
+
schema_org_reader,
|
31
|
+
)
|
32
|
+
from .writers import (
|
33
|
+
bibtex_writer,
|
34
|
+
citation_writer,
|
35
|
+
commonmeta_writer,
|
36
|
+
csl_writer,
|
37
|
+
datacite_writer,
|
38
|
+
ris_writer,
|
39
|
+
schema_org_writer,
|
40
|
+
)
|
41
|
+
from .utils import (
|
42
|
+
dict_to_spdx,
|
43
|
+
from_csl,
|
44
|
+
from_schema_org,
|
45
|
+
get_language,
|
46
|
+
normalize_cc_url,
|
47
|
+
normalize_id,
|
48
|
+
normalize_ids,
|
49
|
+
normalize_orcid,
|
50
|
+
normalize_url,
|
51
|
+
normalize_ror,
|
52
|
+
pages_as_string,
|
53
|
+
to_csl,
|
54
|
+
validate_orcid,
|
55
|
+
validate_url,
|
56
|
+
get_language,
|
57
|
+
encode_doi,
|
58
|
+
name_to_fos,
|
59
|
+
from_json_feed,
|
60
|
+
)
|
61
|
+
from .author_utils import (
|
62
|
+
authors_as_string,
|
63
|
+
cleanup_author,
|
64
|
+
get_affiliations,
|
65
|
+
get_authors,
|
66
|
+
get_one_author,
|
67
|
+
is_personal_name,
|
68
|
+
)
|
69
|
+
from .base_utils import (
|
70
|
+
wrap,
|
71
|
+
unwrap,
|
72
|
+
compact,
|
73
|
+
presence,
|
74
|
+
parse_attributes,
|
75
|
+
sanitize,
|
76
|
+
)
|
77
|
+
from .date_utils import (
|
78
|
+
get_date_from_crossref_parts,
|
79
|
+
get_date_from_date_parts,
|
80
|
+
get_date_from_unix_timestamp,
|
81
|
+
get_date_parts,
|
82
|
+
get_iso8601_date,
|
83
|
+
strip_milliseconds,
|
84
|
+
)
|
85
|
+
from .doi_utils import (
|
86
|
+
crossref_api_url,
|
87
|
+
crossref_xml_api_url,
|
88
|
+
doi_from_url,
|
89
|
+
doi_as_url,
|
90
|
+
doi_resolver,
|
91
|
+
datacite_api_url,
|
92
|
+
get_doi_ra,
|
93
|
+
normalize_doi,
|
94
|
+
validate_doi,
|
95
|
+
validate_prefix,
|
96
|
+
)
|
commonmeta/api_utils.py
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
"""API Utils module for commonmeta-py"""
|
2
|
+
|
3
|
+
from typing import Optional
|
4
|
+
from datetime import datetime as date
|
5
|
+
import httpx
|
6
|
+
from furl import furl
|
7
|
+
import jwt
|
8
|
+
|
9
|
+
from .doi_utils import validate_doi, doi_as_url
|
10
|
+
from .readers.json_feed_reader import get_json_feed_item_uuid
|
11
|
+
|
12
|
+
|
13
|
+
def generate_ghost_token(key: str) -> str:
    """Build a short-lived JSON Web Token for the Ghost Admin API.

    The key is split on ":" into an id and a hex-encoded secret. The id is
    sent in the "kid" header, and the hex-decoded secret signs the token
    with HS256. The token expires five minutes after it was issued.

    See https://ghost.org/docs/admin-api/#token-authentication
    """
    key_id, hex_secret = key.split(":")

    # issued-at time in whole seconds; expiry is derived from it
    issued_at = int(date.now().timestamp())
    claims = {"iat": issued_at, "exp": issued_at + 5 * 60, "aud": "/admin/"}
    jwt_header = {"alg": "HS256", "typ": "JWT", "kid": key_id}

    # the secret part of the key is hex-encoded and must be decoded to bytes
    signing_key = bytes.fromhex(hex_secret)
    return jwt.encode(claims, signing_key, algorithm="HS256", headers=jwt_header)
|
28
|
+
|
29
|
+
|
30
|
+
def update_ghost_post_via_api(
    _id: str, api_key: Optional[str] = None, api_url: Optional[str] = None
) -> dict[str, str]:
    """Set the canonical_url of a Ghost post to its DOI via the Ghost Admin API.

    Args:
        _id: identifier handed to get_json_feed_item_uuid to look up the post.
        api_key: Ghost Admin API key, passed to generate_ghost_token.
        api_url: base URL that "/ghost/api/admin/..." is appended to.

    Returns:
        A dict with "message", "guid" and "updated_at" on success, otherwise
        a dict with a single "error" key describing what went wrong.
    """
    # both are required further down; fail early instead of crashing in
    # generate_ghost_token or building a "None/ghost/..." URL
    if not api_key or not api_url:
        return {"error": "API key or API URL not provided"}

    # get post doi and url from Rogue Scholar API
    # post url is needed to find post via Ghost API
    post = get_json_feed_item_uuid(_id)
    if post.get("error", None):
        return post
    doi = validate_doi(post.get("doi", None))
    doi = doi_as_url(doi)
    url = post.get("url", None)
    if not doi or not url:
        return {"error": "DOI or URL not found"}

    # get post_id and updated_at from ghost api
    token = generate_ghost_token(api_key)
    headers = {
        "Authorization": f"Ghost {token}",
        "Content-Type": "application/json",
        "Accept-Version": "v5",
    }

    # the slug is the last non-empty path segment; a URL ending in "/" has an
    # empty final segment that must be skipped
    segments = [segment for segment in furl(url).path.segments if segment]
    if not segments:
        return {"error": "Post slug not found in URL"}
    slug = segments[-1]

    ghost_url = f"{api_url}/ghost/api/admin/posts/slug/{slug}/"
    response = httpx.get(ghost_url, headers=headers, timeout=10)
    if response.status_code != 200:
        return {"error": "Error fetching post"}

    # guard against a missing or empty "posts" list in the response
    posts = response.json().get("posts") or []
    if not posts:
        return {"error": "Error fetching post"}
    ghost_post = posts[0]
    guid = ghost_post.get("id")
    updated_at = ghost_post.get("updated_at")
    if not guid or not updated_at:
        return {"error": "guid or updated_at not found"}

    # update post canonical_url using doi. This requires sending
    # the updated_at timestamp to avoid conflicts, and must use guid
    # rather than url for put requests
    ghost_url = f"{api_url}/ghost/api/admin/posts/{guid}/"

    payload = {"posts": [{"canonical_url": doi, "updated_at": updated_at}]}
    response = httpx.put(
        ghost_url,
        headers=headers,
        json=payload,
        timeout=10,
    )
    if response.status_code != 200:
        return {"error": "Error updating post"}
    return {"message": f"DOI {doi} added", "guid": guid, "updated_at": updated_at}
|
@@ -0,0 +1,260 @@
|
|
1
|
+
"""Author utils module for commonmeta-py"""
|
2
|
+
import re
|
3
|
+
from typing import List
|
4
|
+
from nameparser import HumanName
|
5
|
+
from pydash import py_
|
6
|
+
from furl import furl
|
7
|
+
|
8
|
+
from .utils import (
|
9
|
+
normalize_orcid,
|
10
|
+
normalize_id,
|
11
|
+
normalize_ror,
|
12
|
+
normalize_isni,
|
13
|
+
format_name_identifier,
|
14
|
+
validate_ror,
|
15
|
+
validate_orcid,
|
16
|
+
)
|
17
|
+
from .base_utils import parse_attributes, wrap, presence, compact
|
18
|
+
|
19
|
+
from .constants import (
|
20
|
+
COMMONMETA_CONTRIBUTOR_ROLES,
|
21
|
+
)
|
22
|
+
|
23
|
+
|
24
|
+
def get_one_author(author, **kwargs):
    """Parse one author (string or dict) into commonmeta format.

    Args:
        author: a name string, or a dict using any of the supported keys
            (creatorName/contributorName/name, givenName/given,
            familyName/family, id/ORCID/url, nameIdentifiers/identifiers,
            contributorType/contributor_roles, affiliation/affiliations).
        **kwargs: "via" is the only key read here; when it equals "crossref",
            a name without given/family name is treated as an organization.

    Returns:
        A dict with the non-None values of id, type, contributorRoles, name,
        givenName, familyName and affiliations, or None if no name was found
        or creatorName is a list.
    """
    # a plain string is treated as the author name
    if isinstance(author, str):
        author = {"creatorName": author}

    # malformed XML
    if isinstance(author.get("creatorName", None), list):
        return None

    name = (
        parse_attributes(author.get("creatorName", None))
        or parse_attributes(author.get("contributorName", None))
        or parse_attributes(author.get("name", None))
    )
    given_name = parse_attributes(author.get("givenName", None)) or parse_attributes(
        author.get("given", None)
    )
    family_name = parse_attributes(author.get("familyName", None)) or parse_attributes(
        author.get("family", None)
    )

    name = cleanup_author(name)

    # make sure we have a name
    if not name and not given_name and not family_name:
        return None

    # parse contributor roles, checking for roles supported by commonmeta
    contributor_roles = wrap(
        parse_attributes(author.get("contributorType", None))
    ) or wrap(parse_attributes(author.get("contributor_roles", None)))
    contributor_roles = [
        i for i in contributor_roles if i in COMMONMETA_CONTRIBUTOR_ROLES
    ] or ["Author"]

    # parse author type, i.e. "Person", "Organization" or not specified
    _type = parse_attributes(
        author.get("creatorName", None), content="type", first=True
    ) or parse_attributes(
        author.get("contributorName", None), content="type", first=True
    )

    # also handle Crossref, JSON Feed, or DataCite metadata.
    # Fall back to "identifiers" whenever "nameIdentifiers" is missing or empty.
    name_identifiers = author.get("nameIdentifiers", None) or author.get(
        "identifiers", None
    )
    _id = (
        author.get("id", None)
        or author.get("ORCID", None)
        or author.get("url", None)
        or next(
            (format_name_identifier(i) for i in wrap(name_identifiers)),
            None,
        )
    )
    _id = normalize_orcid(_id) or normalize_ror(_id) or normalize_isni(_id) or _id

    # DataCite metadata: strip the trailing "al" so that "Personal" becomes
    # "Person" and "Organizational" becomes "Organization"
    if isinstance(_type, str) and _type.endswith("al"):
        _type = _type[:-2]

    # infer the type from identifier or name if it was not given explicitly
    if not _type and isinstance(_id, str) and validate_ror(_id) is not None:
        _type = "Organization"
    elif not _type and isinstance(_id, str) and validate_orcid(_id) is not None:
        _type = "Person"
    elif not _type and (given_name or family_name):
        _type = "Person"
    elif not _type and name and kwargs.get("via", None) == "crossref":
        _type = "Organization"
    elif not _type and is_personal_name(name):
        _type = "Person"
    elif not _type and name:
        _type = "Organization"

    # split name for type Person into given/family name if not already provided
    if _type == "Person" and name and not given_name and not family_name:
        names = HumanName(name)

        if names:
            given_name = (
                " ".join([names.first, names.middle]).strip() if names.first else None
            )
            family_name = names.last if names.last else None
        else:
            given_name = None
            family_name = None

    # support various keys for affiliations
    affiliations = author.get("affiliation", None) or author.get("affiliations", None)

    # return author in commonmeta format, using name vs. given/family name
    # depending on type
    return compact(
        {
            "id": _id,
            "type": _type,
            "contributorRoles": contributor_roles,
            "name": name if _type == "Organization" else None,
            "givenName": given_name if _type == "Person" else None,
            "familyName": family_name if _type == "Person" else None,
            "affiliations": presence(get_affiliations(wrap(affiliations))),
        }
    )
|
128
|
+
|
129
|
+
|
130
|
+
def is_personal_name(name):
    """Guess whether a name string refers to a person.

    Returns False for names containing a semicolon, single-word names
    without a comma, and names containing a word typical for organizations.
    Returns True for names ending in a known degree suffix, or names that
    HumanName can split into a first or last name.
    """
    organization_words = (
        "University",
        "College",
        "Institute",
        "School",
        "Center",
        "Department",
        "Laboratory",
        "Library",
        "Museum",
        "Foundation",
        "Society",
        "Association",
        "Company",
        "Corporation",
        "Collaboration",
        "Consortium",
        "Incorporated",
        "Inc.",
        "Institut",
        "Research",
        "Science",
        "Team",
        "Ministry",
        "Government",
    )
    degree_suffixes = ("MD", "PhD", "BS")

    # semicolons are not allowed in personal names
    if ";" in name:
        return False

    # a single word without a comma, e.g. "FamousOrganization"
    if "," not in name and len(name.split(" ")) == 1:
        return False

    # substring match against words commonly found in organization names
    for word in organization_words:
        if word in name:
            return False

    # trailing degree after a comma, e.g. "John Smith, MD"
    if name.split(", ")[-1] in degree_suffixes:
        return True

    # last resort: see whether the name parses into given/family name
    parsed = HumanName(name)
    return bool(parsed and (parsed.first or parsed.last))
|
182
|
+
|
183
|
+
|
184
|
+
def cleanup_author(author):
    """Clean up an author name string.

    Returns None for None input and for names starting with a comma.
    Otherwise inserts a comma before trailing initials ("Smith J." becomes
    "Smith, J."), removes spaces around hyphens, and replaces every
    whitespace character with a regular space.
    """
    if author is None:
        return None

    # names starting with a comma are rejected
    if author.startswith(","):
        return None

    # detect pattern "Smith J.", but not "Smith, John K.": one or two
    # initials at the end of the string, preceded by whitespace.
    # The replacement must be a raw string so \1 and \2 are backreferences.
    if "," not in author:
        author = re.sub(r"\s([A-Z]\.)?(-?[A-Z]\.)$", r", \1\2", author)

    # remove spaces around hyphens
    author = author.replace(" - ", "-")

    # replace non-standard space characters (tab, newline, non-breaking
    # space, ...) with a regular space
    author = re.sub(r"\s", " ", author)
    return author
|
202
|
+
|
203
|
+
|
204
|
+
def get_authors(authors, **kwargs):
    """Convert a list of authors into commonmeta format.

    Each entry is parsed with get_one_author (kwargs are passed through);
    the result is then run through pydash compact and uniq.
    """
    parsed = [get_one_author(entry, **kwargs) for entry in authors]
    without_empty = py_.compact(parsed)
    return py_.uniq(without_empty)
|
207
|
+
|
208
|
+
|
209
|
+
def authors_as_string(authors: List[dict]) -> str:
    """Convert authors list to a single string joined by " and ", e.g. for bibtex.

    Persons are written as "familyName, givenName" (or just the family name),
    everything else by its "name". Authors that yield no printable name are
    skipped, so the join never sees None.
    """

    def format_author(author):
        family_name = author.get("familyName", None)
        given_name = author.get("givenName", None)
        if family_name and given_name:
            return f"{family_name}, {given_name}"
        if family_name:
            return family_name
        return author.get("name", None)

    formatted = (format_author(i) for i in wrap(authors) if i is not None)
    # an author with neither familyName nor name formats to None; joining
    # that would raise TypeError, so drop empty results
    return " and ".join(name for name in formatted if name)
|
220
|
+
|
221
|
+
|
222
|
+
def get_affiliations(affiliations: List[dict]) -> List[dict]:
    """Parse a list of affiliations (strings or dicts) into commonmeta format.

    Each element becomes a dict with the non-None values of "id" and "name".
    The resulting list is run through pydash compact and uniq.
    """

    def format_element(i):
        """format single affiliation element"""
        affiliation_identifier = None
        # initialised up front so it is defined even when the element has an
        # affiliationIdentifier but no schemeURI
        scheme_uri = None
        if isinstance(i, str):
            name = i
        else:
            if i.get("affiliationIdentifier", None) is not None:
                affiliation_identifier = i["affiliationIdentifier"]
                if i.get("schemeURI", None) is not None:
                    # make sure the scheme URI ends with a slash before it is
                    # prepended to the identifier
                    scheme_uri = i["schemeURI"]
                    if not scheme_uri.endswith("/"):
                        scheme_uri = f"{scheme_uri}/"
                affiliation_identifier = (
                    normalize_id(scheme_uri + affiliation_identifier)
                    if (
                        not affiliation_identifier.startswith("https://")
                        and scheme_uri is not None
                    )
                    else normalize_id(affiliation_identifier)
                )
            elif i.get("id", None) is not None:
                # only http(s) URLs are accepted as identifier
                f = furl(i.get("id"))
                if f.scheme in ["http", "https"]:
                    affiliation_identifier = i.get("id")
            name = i.get("name", None) or i.get("#text", None)
        return compact(
            {
                "id": affiliation_identifier,
                "name": name,
            }
        )

    return py_.uniq(py_.compact([format_element(i) for i in affiliations]))
|
commonmeta/base_utils.py
ADDED
@@ -0,0 +1,121 @@
|
|
1
|
+
"""Base utilities for commonmeta-py"""
|
2
|
+
import html
|
3
|
+
from os import path
|
4
|
+
import re
|
5
|
+
import xmltodict
|
6
|
+
from typing import Optional, Union
|
7
|
+
import nh3
|
8
|
+
|
9
|
+
|
10
|
+
def wrap(item) -> list:
    """Return item as a list: lists pass through, None becomes [], anything else [item]"""
    if isinstance(item, list):
        return item
    return [] if item is None else [item]
|
17
|
+
|
18
|
+
|
19
|
+
def unwrap(lst: list) -> Optional[Union[dict, list]]:
    """Collapse a list: empty gives None, one element gives that element, otherwise the list"""
    size = len(lst)
    if size > 1:
        return lst
    return lst[0] if size == 1 else None
|
26
|
+
|
27
|
+
|
28
|
+
def presence(
    item: Optional[Union[dict, list, str]],
) -> Optional[Union[dict, list, str]]:
    """Return None for None, for anything of length zero and for [{}]; otherwise the item"""
    if item is None:
        return None
    if len(item) == 0 or item == [{}]:
        return None
    return item
|
33
|
+
|
34
|
+
|
35
|
+
def compact(dict_or_list: Union[dict, list]) -> Optional[Union[dict, list]]:
    """Remove None from dict or list.

    For a dict, returns a new dict without the keys whose value is None.
    For a list, nested dicts and lists are compacted recursively, other
    elements are kept unchanged, and None entries are dropped; an empty
    result is returned as None. Any other input returns None.
    """
    if isinstance(dict_or_list, dict):
        return {k: v for k, v in dict_or_list.items() if v is not None}
    if isinstance(dict_or_list, list):
        cleaned = []
        for element in dict_or_list:
            # only containers are compacted further; scalars such as strings
            # must be kept as they are instead of being turned into None
            if isinstance(element, (dict, list)):
                element = compact(element)
            if element is not None:
                cleaned.append(element)
        return cleaned if len(cleaned) > 0 else None

    return None
|
44
|
+
|
45
|
+
|
46
|
+
def parse_attributes(
    element: Union[str, dict, list], **kwargs
) -> Optional[Union[str, list]]:
    """Extract a value from a string, dict or list.

    Keyword arguments read here:
        content: dict key to look up (defaults to "#text").
        first: if truthy, return only the first value found in a list.

    A string is returned HTML-unescaped, but only when no content key was
    requested. A dict returns the value stored under the content key. A list
    is processed element by element (falsy elements are skipped) and then
    reduced with unwrap, or to its first value. Anything else returns None.
    """
    key = kwargs.get("content", "#text")

    def lookup(mapping):
        # the key, not the value, is HTML-unescaped before the lookup
        return mapping.get(html.unescape(key), None)

    if isinstance(element, str) and kwargs.get("content", None) is None:
        return html.unescape(element)

    if isinstance(element, dict):
        return lookup(element)

    if isinstance(element, list):
        values = []
        for entry in element:
            if not entry:
                continue
            if isinstance(entry, dict):
                values.append(lookup(entry))
            else:
                values.append(html.unescape(entry))
        if len(values) > 0 and kwargs.get("first"):
            return values[0]
        return unwrap(values)

    return None
|
65
|
+
|
66
|
+
|
67
|
+
def parse_xml(string: Optional[str], **kwargs) -> Optional[Union[dict, list]]:
    """Parse XML into a dict with xmltodict.

    The input is either the XML itself or the path of an existing file,
    which is then read as UTF-8. With dialect="crossref", namespace
    processing is switched on, the Crossref and JATS namespaces are mapped
    to None, and a fixed set of elements is always parsed as a list. All
    other keyword arguments are handed to xmltodict.parse, with attr_prefix
    set to "" and dict_constructor set to dict. Returns None for None input.
    """
    if string is None:
        return None

    # treat the argument as a file name if such a file exists
    if path.exists(string):
        with open(string, encoding="utf-8") as handle:
            string = handle.read()

    # "dialect" is our own option and must not reach xmltodict
    options = {k: v for k, v in kwargs.items() if k != "dialect"}

    if kwargs.get("dialect", None) == "crossref":
        # remove namespaces from xml
        crossref_namespaces = {
            "http://www.crossref.org/schema/5.3.1": None,
            "http://www.crossref.org/qrschema/3.0": None,
            "http://www.crossref.org/xschema/1.0": None,
            "http://www.crossref.org/xschema/1.1": None,
            "http://www.crossref.org/AccessIndicators.xsd": None,
            "http://www.crossref.org/relations.xsd": None,
            "http://www.crossref.org/fundref.xsd": None,
            "http://www.ncbi.nlm.nih.gov/JATS1": None,
        }
        always_list = {
            "person_name",
            "organization",
            "titles",
            "item",
            "citation",
            "program",
        }
        options.update(
            process_namespaces=True,
            namespaces=crossref_namespaces,
            force_list=always_list,
        )

    options["attr_prefix"] = ""
    options["dict_constructor"] = dict
    return xmltodict.parse(string, **options)
|
103
|
+
|
104
|
+
|
105
|
+
def sanitize(text: str, **kwargs) -> str:
    """Clean HTML in text with nh3 and collapse runs of whitespace.

    Keyword arguments read here:
        tags: set of allowed HTML tags; a small default set of inline
            formatting tags is used when missing or empty.
        attributes: passed to nh3.clean unchanged (None if not given).
    """
    # default whitelisted HTML tags
    default_tags = {
        "b",
        "br",
        "code",
        "em",
        "i",
        "sub",
        "sup",
        "strong",
    }
    allowed_tags = kwargs.get("tags", None) or default_tags
    allowed_attributes = kwargs.get("attributes", None)

    cleaned = nh3.clean(
        text, tags=allowed_tags, attributes=allowed_attributes, link_rel=None
    )

    # replace every run of whitespace with a single space
    return re.sub(r"\s+", " ", cleaned, flags=re.UNICODE)
|