commonmeta-py 0.22__py3-none-any.whl → 0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. commonmeta/__init__.py +96 -0
  2. commonmeta/api_utils.py +77 -0
  3. commonmeta/author_utils.py +260 -0
  4. commonmeta/base_utils.py +121 -0
  5. commonmeta/cli.py +200 -0
  6. commonmeta/constants.py +587 -0
  7. commonmeta/crossref_utils.py +575 -0
  8. commonmeta/date_utils.py +193 -0
  9. commonmeta/doi_utils.py +273 -0
  10. commonmeta/metadata.py +320 -0
  11. commonmeta/readers/__init__.py +1 -0
  12. commonmeta/readers/bibtex_reader.py +0 -0
  13. commonmeta/readers/cff_reader.py +199 -0
  14. commonmeta/readers/codemeta_reader.py +112 -0
  15. commonmeta/readers/commonmeta_reader.py +13 -0
  16. commonmeta/readers/crossref_reader.py +409 -0
  17. commonmeta/readers/crossref_xml_reader.py +505 -0
  18. commonmeta/readers/csl_reader.py +98 -0
  19. commonmeta/readers/datacite_reader.py +390 -0
  20. commonmeta/readers/datacite_xml_reader.py +359 -0
  21. commonmeta/readers/inveniordm_reader.py +218 -0
  22. commonmeta/readers/json_feed_reader.py +420 -0
  23. commonmeta/readers/kbase_reader.py +205 -0
  24. commonmeta/readers/ris_reader.py +103 -0
  25. commonmeta/readers/schema_org_reader.py +506 -0
  26. commonmeta/resources/cff_v1.2.0.json +1827 -0
  27. commonmeta/resources/commonmeta_v0.12.json +601 -0
  28. commonmeta/resources/commonmeta_v0.13.json +559 -0
  29. commonmeta/resources/commonmeta_v0.14.json +573 -0
  30. commonmeta/resources/crossref/AccessIndicators.xsd +47 -0
  31. commonmeta/resources/crossref/JATS-journalpublishing1-3d2-mathml3-elements.xsd +10130 -0
  32. commonmeta/resources/crossref/JATS-journalpublishing1-3d2-mathml3.xsd +48 -0
  33. commonmeta/resources/crossref/JATS-journalpublishing1-elements.xsd +8705 -0
  34. commonmeta/resources/crossref/JATS-journalpublishing1-mathml3-elements.xsd +8608 -0
  35. commonmeta/resources/crossref/JATS-journalpublishing1-mathml3.xsd +49 -0
  36. commonmeta/resources/crossref/JATS-journalpublishing1.xsd +6176 -0
  37. commonmeta/resources/crossref/clinicaltrials.xsd +61 -0
  38. commonmeta/resources/crossref/common5.3.1.xsd +1538 -0
  39. commonmeta/resources/crossref/crossref5.3.1.xsd +1949 -0
  40. commonmeta/resources/crossref/crossref_query_output3.0.xsd +1097 -0
  41. commonmeta/resources/crossref/fundref.xsd +49 -0
  42. commonmeta/resources/crossref/module-ali.xsd +39 -0
  43. commonmeta/resources/crossref/relations.xsd +444 -0
  44. commonmeta/resources/crossref-v0.2.json +60 -0
  45. commonmeta/resources/csl-data.json +538 -0
  46. commonmeta/resources/datacite-v4.5.json +829 -0
  47. commonmeta/resources/datacite-v4.5pr.json +608 -0
  48. commonmeta/resources/ietf-bcp-47.json +3025 -0
  49. commonmeta/resources/iso-8601.json +3182 -0
  50. commonmeta/resources/spdx/licenses.json +4851 -0
  51. commonmeta/resources/spdx-schema..json +903 -0
  52. commonmeta/resources/styles/apa.csl +1697 -0
  53. commonmeta/resources/styles/chicago-author-date.csl +684 -0
  54. commonmeta/resources/styles/harvard-cite-them-right.csl +321 -0
  55. commonmeta/resources/styles/ieee.csl +468 -0
  56. commonmeta/resources/styles/modern-language-association.csl +341 -0
  57. commonmeta/resources/styles/vancouver.csl +376 -0
  58. commonmeta/schema_utils.py +27 -0
  59. commonmeta/translators.py +47 -0
  60. commonmeta/utils.py +1108 -0
  61. commonmeta/writers/__init__.py +1 -0
  62. commonmeta/writers/bibtex_writer.py +149 -0
  63. commonmeta/writers/citation_writer.py +70 -0
  64. commonmeta/writers/commonmeta_writer.py +68 -0
  65. commonmeta/writers/crossref_xml_writer.py +17 -0
  66. commonmeta/writers/csl_writer.py +79 -0
  67. commonmeta/writers/datacite_writer.py +193 -0
  68. commonmeta/writers/inveniordm_writer.py +94 -0
  69. commonmeta/writers/ris_writer.py +58 -0
  70. commonmeta/writers/schema_org_writer.py +146 -0
  71. {commonmeta_py-0.22.dist-info → commonmeta_py-0.24.dist-info}/METADATA +56 -45
  72. commonmeta_py-0.24.dist-info/RECORD +75 -0
  73. {commonmeta_py-0.22.dist-info → commonmeta_py-0.24.dist-info}/WHEEL +1 -1
  74. commonmeta_py-0.24.dist-info/entry_points.txt +3 -0
  75. commonmeta_py/__init__.py +0 -2
  76. commonmeta_py-0.22.dist-info/RECORD +0 -5
  77. {commonmeta_py-0.22.dist-info/licenses → commonmeta_py-0.24.dist-info}/LICENSE +0 -0
commonmeta/__init__.py ADDED
@@ -0,0 +1,96 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # commonmeta-py
4
+
5
+ """
6
+ commonmeta-py library
7
+ ~~~~~~~~~~~~~~~~~~~~~
8
+
9
+ commonmeta-py is a Python library to convert scholarly metadata
10
+ """
11
+
12
+ __title__ = "commonmeta-py"
13
+ __version__ = "0.24"
14
+ __author__ = "Martin Fenner"
15
+ __license__ = "MIT"
16
+
17
+ # ruff: noqa: F401
18
+ from .metadata import Metadata, MetadataList
19
+ from .readers import (
20
+ cff_reader,
21
+ codemeta_reader,
22
+ crossref_reader,
23
+ crossref_xml_reader,
24
+ datacite_reader,
25
+ datacite_xml_reader,
26
+ inveniordm_reader,
27
+ json_feed_reader,
28
+ kbase_reader,
29
+ ris_reader,
30
+ schema_org_reader,
31
+ )
32
+ from .writers import (
33
+ bibtex_writer,
34
+ citation_writer,
35
+ commonmeta_writer,
36
+ csl_writer,
37
+ datacite_writer,
38
+ ris_writer,
39
+ schema_org_writer,
40
+ )
41
+ from .utils import (
42
+ dict_to_spdx,
43
+ from_csl,
44
+ from_schema_org,
45
+ get_language,
46
+ normalize_cc_url,
47
+ normalize_id,
48
+ normalize_ids,
49
+ normalize_orcid,
50
+ normalize_url,
51
+ normalize_ror,
52
+ pages_as_string,
53
+ to_csl,
54
+ validate_orcid,
55
+ validate_url,
56
+ get_language,
57
+ encode_doi,
58
+ name_to_fos,
59
+ from_json_feed,
60
+ )
61
+ from .author_utils import (
62
+ authors_as_string,
63
+ cleanup_author,
64
+ get_affiliations,
65
+ get_authors,
66
+ get_one_author,
67
+ is_personal_name,
68
+ )
69
+ from .base_utils import (
70
+ wrap,
71
+ unwrap,
72
+ compact,
73
+ presence,
74
+ parse_attributes,
75
+ sanitize,
76
+ )
77
+ from .date_utils import (
78
+ get_date_from_crossref_parts,
79
+ get_date_from_date_parts,
80
+ get_date_from_unix_timestamp,
81
+ get_date_parts,
82
+ get_iso8601_date,
83
+ strip_milliseconds,
84
+ )
85
+ from .doi_utils import (
86
+ crossref_api_url,
87
+ crossref_xml_api_url,
88
+ doi_from_url,
89
+ doi_as_url,
90
+ doi_resolver,
91
+ datacite_api_url,
92
+ get_doi_ra,
93
+ normalize_doi,
94
+ validate_doi,
95
+ validate_prefix,
96
+ )
@@ -0,0 +1,77 @@
1
+ """API Utils module for commonmeta-py"""
2
+
3
+ from typing import Optional
4
+ from datetime import datetime as date
5
+ import httpx
6
+ from furl import furl
7
+ import jwt
8
+
9
+ from .doi_utils import validate_doi, doi_as_url
10
+ from .readers.json_feed_reader import get_json_feed_item_uuid
11
+
12
+
13
def generate_ghost_token(key: str) -> str:
    """Build a five-minute JWT for the Ghost Admin API.

    `key` is split on ":" into a key id and a hex-encoded secret.
    See https://ghost.org/docs/admin-api/#token-authentication
    """
    key_id, hex_secret = key.split(":")

    # issued-at time in whole seconds; the token expires 5 minutes later
    issued_at = int(date.now().timestamp())
    jwt_header = {"alg": "HS256", "typ": "JWT", "kid": key_id}
    claims = {"iat": issued_at, "exp": issued_at + 5 * 60, "aud": "/admin/"}

    # the secret is hex-encoded and has to be decoded to bytes before signing
    signing_key = bytes.fromhex(hex_secret)
    return jwt.encode(claims, signing_key, algorithm="HS256", headers=jwt_header)
28
+
29
+
30
def update_ghost_post_via_api(
    _id: str, api_key: Optional[str] = None, api_url: Optional[str] = None
) -> dict[str, str]:
    """Set the canonical_url of a Ghost post to its DOI via the Ghost Admin API.

    Args:
        _id: identifier passed to get_json_feed_item_uuid to look up the post.
        api_key: Ghost Admin API key, passed to generate_ghost_token.
        api_url: base URL of the Ghost site.

    Returns:
        A dict with "message", "guid" and "updated_at" on success, or a dict
        with an "error" key describing what went wrong.
    """
    # without credentials no request can succeed, so fail early
    if not api_key or not api_url:
        return {"error": "API key or API URL not provided"}

    # get post doi and url from Rogue Scholar API
    # post url is needed to find post via Ghost API
    post = get_json_feed_item_uuid(_id)
    if post.get("error", None):
        return post
    doi = validate_doi(post.get("doi", None))
    doi = doi_as_url(doi)
    url = post.get("url", None)
    if not doi or not url:
        return {"error": "DOI or URL not found"}

    # the slug is the last non-empty path segment; a trailing slash in the
    # post url yields an empty final segment that must be skipped
    segments = [segment for segment in furl(url).path.segments if segment]
    if not segments:
        return {"error": "Post slug not found in URL"}
    slug = segments[-1]
    base_url = api_url.rstrip("/")

    # get post_id and updated_at from ghost api
    token = generate_ghost_token(api_key)
    headers = {
        "Authorization": f"Ghost {token}",
        "Content-Type": "application/json",
        "Accept-Version": "v5",
    }
    ghost_url = f"{base_url}/ghost/api/admin/posts/slug/{slug}/"
    response = httpx.get(ghost_url, headers=headers, timeout=10)
    if response.status_code != 200:
        return {"error": "Error fetching post"}
    posts = response.json().get("posts") or []
    if not posts:
        return {"error": "Error fetching post"}
    ghost_post = posts[0]
    guid = ghost_post.get("id")
    updated_at = ghost_post.get("updated_at")
    if not guid or not updated_at:
        return {"error": "guid or updated_at not found"}

    # update post canonical_url using doi. This requires sending
    # the updated_at timestamp to avoid conflicts, and must use guid
    # rather than url for put requests
    ghost_url = f"{base_url}/ghost/api/admin/posts/{guid}/"
    json = {"posts": [{"canonical_url": doi, "updated_at": updated_at}]}
    response = httpx.put(ghost_url, headers=headers, json=json, timeout=10)
    if response.status_code != 200:
        return {"error": "Error updating post"}
    return {"message": f"DOI {doi} added", "guid": guid, "updated_at": updated_at}
@@ -0,0 +1,260 @@
1
+ """Author utils module for commonmeta-py"""
2
+ import re
3
+ from typing import List
4
+ from nameparser import HumanName
5
+ from pydash import py_
6
+ from furl import furl
7
+
8
+ from .utils import (
9
+ normalize_orcid,
10
+ normalize_id,
11
+ normalize_ror,
12
+ normalize_isni,
13
+ format_name_identifier,
14
+ validate_ror,
15
+ validate_orcid,
16
+ )
17
+ from .base_utils import parse_attributes, wrap, presence, compact
18
+
19
+ from .constants import (
20
+ COMMONMETA_CONTRIBUTOR_ROLES,
21
+ )
22
+
23
+
24
def get_one_author(author, **kwargs):
    """Parse one author (string or dict) into commonmeta format.

    Args:
        author: an author name, or a dict using DataCite, Crossref, CSL or
            JSON Feed style keys (creatorName, contributorName, name,
            givenName/given, familyName/family, ...).
        **kwargs: `via="crossref"` makes a bare name without other hints
            count as an organization.

    Returns:
        A dict with the non-None values of id, type, contributorRoles, name,
        givenName, familyName and affiliations, or None if no name was found.
    """
    if author is None:
        return None

    # if author is a string
    if isinstance(author, str):
        author = {"creatorName": author}

    # malformed XML
    if isinstance(author.get("creatorName", None), list):
        return None

    name = (
        parse_attributes(author.get("creatorName", None))
        or parse_attributes(author.get("contributorName", None))
        or parse_attributes(author.get("name", None))
    )
    given_name = parse_attributes(author.get("givenName", None)) or parse_attributes(
        author.get("given", None)
    )
    family_name = parse_attributes(author.get("familyName", None)) or parse_attributes(
        author.get("family", None)
    )

    name = cleanup_author(name)

    # make sure we have a name
    if not name and not given_name and not family_name:
        return None

    # parse contributor roles, checking for roles supported by commonmeta
    contributor_roles = wrap(
        parse_attributes(author.get("contributorType", None))
    ) or wrap(parse_attributes(author.get("contributor_roles", None)))
    contributor_roles = [
        i for i in contributor_roles if i in COMMONMETA_CONTRIBUTOR_ROLES
    ] or ["Author"]

    # parse author type, i.e. "Person", "Organization" or not specified
    _type = parse_attributes(
        author.get("creatorName", None), content="type", first=True
    ) or parse_attributes(
        author.get("contributorName", None), content="type", first=True
    )

    # also handle Crossref, JSON Feed, or DataCite metadata
    name_identifiers = author.get("nameIdentifiers", None) or author.get(
        "identifiers", None
    )
    _id = (
        author.get("id", None)
        or author.get("ORCID", None)
        or author.get("url", None)
        or next((format_name_identifier(i) for i in wrap(name_identifiers)), None)
    )
    _id = normalize_orcid(_id) or normalize_ror(_id) or normalize_isni(_id) or _id

    # DataCite metadata: "Personal" -> "Person", "Organizational" -> "Organization"
    if isinstance(_type, str) and _type.endswith("al"):
        _type = _type[:-2]

    # infer the type from the identifier or the name parts if not given
    if not _type and isinstance(_id, str) and validate_ror(_id) is not None:
        _type = "Organization"
    elif not _type and isinstance(_id, str) and validate_orcid(_id) is not None:
        _type = "Person"
    elif not _type and (given_name or family_name):
        _type = "Person"
    elif not _type and name and kwargs.get("via", None) == "crossref":
        _type = "Organization"
    elif not _type and is_personal_name(name):
        _type = "Person"
    elif not _type and name:
        _type = "Organization"

    # split name for type Person into given/family name if not already provided
    if _type == "Person" and name and not given_name and not family_name:
        names = HumanName(name)

        if names:
            given_name = (
                " ".join([names.first, names.middle]).strip() if names.first else None
            )
            family_name = names.last if names.last else None
        else:
            given_name = None
            family_name = None

    # support various keys for affiliations
    affiliations = author.get("affiliation", None) or author.get("affiliations", None)

    # return author in commonmeta format, using name vs. given/family name
    # depending on type
    return compact(
        {
            "id": _id,
            "type": _type,
            "contributorRoles": contributor_roles,
            "name": name if _type == "Organization" else None,
            "givenName": given_name if _type == "Person" else None,
            "familyName": family_name if _type == "Person" else None,
            "affiliations": presence(get_affiliations(wrap(affiliations))),
        }
    )
128
+
129
+
130
def is_personal_name(name):
    """Guess whether a name string belongs to a person rather than an organization.

    Returns False for None, non-string or empty input instead of raising.
    """
    if not isinstance(name, str) or not name:
        return False

    # personal names are not allowed to contain semicolons
    if ";" in name:
        return False

    # check if a name has only one word, e.g. "FamousOrganization", not including commas
    if len(name.split(" ")) == 1 and "," not in name:
        return False

    # check if name contains words known to be used in organization names
    organization_words = (
        "University",
        "College",
        "Institute",
        "School",
        "Center",
        "Department",
        "Laboratory",
        "Library",
        "Museum",
        "Foundation",
        "Society",
        "Association",
        "Company",
        "Corporation",
        "Collaboration",
        "Consortium",
        "Incorporated",
        "Inc.",
        "Institut",
        "Research",
        "Science",
        "Team",
        "Ministry",
        "Government",
    )
    if any(word in name for word in organization_words):
        return False

    # check for suffixes, e.g. "John Smith, MD"
    if name.split(", ")[-1] in ("MD", "PhD", "BS"):
        return True

    # check if name can be parsed into given/family name
    names = HumanName(name)
    if names and (names.first or names.last):
        return True

    return False
182
+
183
+
184
def cleanup_author(author):
    """Clean up an author name string.

    Returns None for None input or for a name starting with a comma.
    Otherwise inserts a comma before trailing initials ("Smith J." becomes
    "Smith, J."), removes spaces around hyphens, and replaces every
    whitespace character with a plain space.
    """
    if author is None:
        return None

    if author.startswith(","):
        return None

    # detect pattern "Smith J.", but not "Smith, John K."
    # (raw strings are required so that \1 and \2 are group references)
    if "," not in author:
        author = re.sub(r"\s([A-Z]\.)?(-?[A-Z]\.)$", r", \1\2", author)

    # remove spaces around hyphens
    author = author.replace(" - ", "-")

    # remove non-standard space characters
    author = re.sub(r"\s", " ", author)
    return author
202
+
203
+
204
def get_authors(authors, **kwargs):
    """Transform an iterable of authors into a de-duplicated commonmeta list.

    None yields an empty list, and a single author given as a string or dict
    is treated as a one-element list. Keyword arguments are passed on to
    get_one_author; authors it cannot parse are dropped.
    """
    if authors is None:
        return []
    # a bare str/dict would otherwise be iterated character by character
    # or key by key
    if isinstance(authors, (str, dict)):
        authors = [authors]
    return py_.uniq(py_.compact([get_one_author(i, **kwargs) for i in authors]))
207
+
208
+
209
def authors_as_string(authors: List[dict]) -> str:
    """Convert authors list to an " and "-separated string, e.g. for bibtex.

    Persons are written as "familyName, givenName" (or just the family name),
    everything else by its "name". Authors without a usable name are skipped
    rather than breaking the join.
    """

    def format_author(author):
        if author.get("familyName", None) and author.get("givenName", None):
            return f"{author['familyName']}, {author['givenName']}"
        elif author.get("familyName", None):
            return author["familyName"]
        return author.get("name", None)

    formatted = (format_author(i) for i in wrap(authors) if i is not None)
    # format_author returns None when no name is available; str.join
    # would raise a TypeError on such entries
    return " and ".join(name for name in formatted if name)
220
+
221
+
222
def get_affiliations(affiliations: List[dict]) -> List[dict]:
    """Parse a list of affiliation strings or dicts into commonmeta format.

    Each element becomes a dict with the non-None values of "id" and "name";
    empty and duplicate results are removed.
    """

    def format_element(i):
        """format single affiliation element"""
        affiliation_identifier = None
        # always defined, so an identifier without schemeURI does not fail
        scheme_uri = None
        if isinstance(i, str):
            name = i
        else:
            if i.get("affiliationIdentifier", None) is not None:
                affiliation_identifier = i["affiliationIdentifier"]
                if i.get("schemeURI", None) is not None:
                    # make sure the scheme URI ends with a slash before it
                    # is prepended to the identifier
                    scheme_uri = (
                        i["schemeURI"]
                        if i["schemeURI"].endswith("/")
                        else f"{i['schemeURI']}/"
                    )
                affiliation_identifier = (
                    normalize_id(scheme_uri + affiliation_identifier)
                    if (
                        scheme_uri is not None
                        and not affiliation_identifier.startswith("https://")
                    )
                    else normalize_id(affiliation_identifier)
                )
            elif i.get("id", None) is not None:
                # only keep ids that are http(s) URLs
                f = furl(i.get("id"))
                if f.scheme in ["http", "https"]:
                    affiliation_identifier = i.get("id")
            name = i.get("name", None) or i.get("#text", None)
        return compact(
            {
                "id": affiliation_identifier,
                "name": name,
            }
        )

    return py_.uniq(py_.compact([format_element(i) for i in affiliations]))
@@ -0,0 +1,121 @@
1
+ """Base utilities for commonmeta-py"""
2
+ import html
3
+ from os import path
4
+ import re
5
+ import xmltodict
6
+ from typing import Optional, Union
7
+ import nh3
8
+
9
+
10
def wrap(item) -> list:
    """Coerce a value into a list.

    A list is returned unchanged, None becomes an empty list, and any other
    value becomes a one-element list.
    """
    if isinstance(item, list):
        return item
    return [] if item is None else [item]
17
+
18
+
19
def unwrap(lst: list) -> Optional[Union[dict, list]]:
    """Collapse a list: empty -> None, one element -> that element, else the list"""
    size = len(lst)
    if size > 1:
        return lst
    return lst[0] if size == 1 else None
26
+
27
+
28
def presence(
    item: Optional[Union[dict, list, str]],
) -> Optional[Union[dict, list, str]]:
    """Return None for None, an empty list/dict/str or [{}], else the item itself"""
    if item is None:
        return None
    if len(item) == 0 or item == [{}]:
        return None
    return item
33
+
34
+
35
def compact(dict_or_list: Union[dict, list]) -> Optional[Union[dict, list]]:
    """Remove None from dict or list.

    For a dict, keys whose value is None are dropped (not recursively).
    For a list, nested dicts and lists are compacted, other elements are
    kept as they are, and None entries are dropped; a list with nothing
    left becomes None. Any other input returns None.
    """
    if isinstance(dict_or_list, dict):
        return {k: v for k, v in dict_or_list.items() if v is not None}
    if isinstance(dict_or_list, list):
        cleaned = []
        for element in dict_or_list:
            # only containers are compacted; scalars such as strings must
            # be kept instead of being turned into None
            if isinstance(element, (dict, list)):
                element = compact(element)
            if element is not None:
                cleaned.append(element)
        return cleaned if cleaned else None

    return None
44
+
45
+
46
def parse_attributes(
    element: Union[str, dict, list], **kwargs
) -> Optional[Union[str, list]]:
    """Pull an attribute value out of a string, dict or list.

    The `content` keyword names the dict key to read (default "#text").
    A string is returned HTML-unescaped, but only when no `content` was
    requested. For a list, falsy items are skipped; with `first` set the
    first parsed value is returned, otherwise the unwrapped list of values.
    Anything else yields None.
    """
    attribute = kwargs.get("content", "#text")

    def lookup(mapping):
        return mapping.get(html.unescape(attribute), None)

    if isinstance(element, dict):
        return lookup(element)

    if isinstance(element, str):
        if kwargs.get("content", None) is None:
            return html.unescape(element)
        return None

    if isinstance(element, list):
        values = [
            lookup(entry) if isinstance(entry, dict) else html.unescape(entry)
            for entry in element
            if entry
        ]
        if values and kwargs.get("first"):
            return values[0]
        return unwrap(values)

    return None
65
+
66
+
67
def parse_xml(string: Optional[str], **kwargs) -> Optional[Union[dict, list]]:
    """Parse XML into a dict using xmltodict.

    `string` may be XML text or the path of an existing file, which is then
    read as UTF-8. With `dialect="crossref"` the Crossref-related namespaces
    are stripped and a fixed set of elements is always parsed as lists.
    Remaining keyword arguments are passed on to xmltodict.parse.
    """
    if string is None:
        return None
    if path.exists(string):
        with open(string, encoding="utf-8") as file:
            string = file.read()

    # "dialect" is our own option and must not reach xmltodict
    dialect = kwargs.pop("dialect", None)
    if dialect == "crossref":
        # mapping a namespace to None removes it from the element names
        crossref_namespaces = (
            "http://www.crossref.org/schema/5.3.1",
            "http://www.crossref.org/qrschema/3.0",
            "http://www.crossref.org/xschema/1.0",
            "http://www.crossref.org/xschema/1.1",
            "http://www.crossref.org/AccessIndicators.xsd",
            "http://www.crossref.org/relations.xsd",
            "http://www.crossref.org/fundref.xsd",
            "http://www.ncbi.nlm.nih.gov/JATS1",
        )
        kwargs.update(
            process_namespaces=True,
            namespaces=dict.fromkeys(crossref_namespaces),
            force_list={
                "person_name",
                "organization",
                "titles",
                "item",
                "citation",
                "program",
            },
        )

    kwargs.update(attr_prefix="", dict_constructor=dict)
    return xmltodict.parse(string, **kwargs)
103
+
104
+
105
def sanitize(text: str, **kwargs) -> str:
    """Strip disallowed HTML from text and collapse runs of whitespace.

    The `tags` keyword overrides the default set of allowed tags, and
    `attributes` is passed to nh3.clean unchanged.
    """
    default_tags = {"b", "br", "code", "em", "i", "sub", "sup", "strong"}
    allowed_tags = kwargs.get("tags", None) or default_tags
    allowed_attributes = kwargs.get("attributes", None)

    cleaned = nh3.clean(
        text, tags=allowed_tags, attributes=allowed_attributes, link_rel=None
    )
    # remove excessive internal whitespace
    return re.sub(r"\s+", " ", cleaned, flags=re.UNICODE)