commonmeta-py 0.22__py3-none-any.whl → 0.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- commonmeta/__init__.py +96 -0
- commonmeta/api_utils.py +77 -0
- commonmeta/author_utils.py +260 -0
- commonmeta/base_utils.py +121 -0
- commonmeta/cli.py +200 -0
- commonmeta/constants.py +587 -0
- commonmeta/crossref_utils.py +575 -0
- commonmeta/date_utils.py +193 -0
- commonmeta/doi_utils.py +273 -0
- commonmeta/metadata.py +320 -0
- commonmeta/readers/__init__.py +1 -0
- commonmeta/readers/bibtex_reader.py +0 -0
- commonmeta/readers/cff_reader.py +199 -0
- commonmeta/readers/codemeta_reader.py +112 -0
- commonmeta/readers/commonmeta_reader.py +13 -0
- commonmeta/readers/crossref_reader.py +409 -0
- commonmeta/readers/crossref_xml_reader.py +505 -0
- commonmeta/readers/csl_reader.py +98 -0
- commonmeta/readers/datacite_reader.py +390 -0
- commonmeta/readers/datacite_xml_reader.py +359 -0
- commonmeta/readers/inveniordm_reader.py +218 -0
- commonmeta/readers/json_feed_reader.py +420 -0
- commonmeta/readers/kbase_reader.py +205 -0
- commonmeta/readers/ris_reader.py +103 -0
- commonmeta/readers/schema_org_reader.py +506 -0
- commonmeta/resources/cff_v1.2.0.json +1827 -0
- commonmeta/resources/commonmeta_v0.12.json +601 -0
- commonmeta/resources/commonmeta_v0.13.json +559 -0
- commonmeta/resources/commonmeta_v0.14.json +573 -0
- commonmeta/resources/crossref/AccessIndicators.xsd +47 -0
- commonmeta/resources/crossref/JATS-journalpublishing1-3d2-mathml3-elements.xsd +10130 -0
- commonmeta/resources/crossref/JATS-journalpublishing1-3d2-mathml3.xsd +48 -0
- commonmeta/resources/crossref/JATS-journalpublishing1-elements.xsd +8705 -0
- commonmeta/resources/crossref/JATS-journalpublishing1-mathml3-elements.xsd +8608 -0
- commonmeta/resources/crossref/JATS-journalpublishing1-mathml3.xsd +49 -0
- commonmeta/resources/crossref/JATS-journalpublishing1.xsd +6176 -0
- commonmeta/resources/crossref/clinicaltrials.xsd +61 -0
- commonmeta/resources/crossref/common5.3.1.xsd +1538 -0
- commonmeta/resources/crossref/crossref5.3.1.xsd +1949 -0
- commonmeta/resources/crossref/crossref_query_output3.0.xsd +1097 -0
- commonmeta/resources/crossref/fundref.xsd +49 -0
- commonmeta/resources/crossref/module-ali.xsd +39 -0
- commonmeta/resources/crossref/relations.xsd +444 -0
- commonmeta/resources/crossref-v0.2.json +60 -0
- commonmeta/resources/csl-data.json +538 -0
- commonmeta/resources/datacite-v4.5.json +829 -0
- commonmeta/resources/datacite-v4.5pr.json +608 -0
- commonmeta/resources/ietf-bcp-47.json +3025 -0
- commonmeta/resources/iso-8601.json +3182 -0
- commonmeta/resources/spdx/licenses.json +4851 -0
- commonmeta/resources/spdx-schema..json +903 -0
- commonmeta/resources/styles/apa.csl +1697 -0
- commonmeta/resources/styles/chicago-author-date.csl +684 -0
- commonmeta/resources/styles/harvard-cite-them-right.csl +321 -0
- commonmeta/resources/styles/ieee.csl +468 -0
- commonmeta/resources/styles/modern-language-association.csl +341 -0
- commonmeta/resources/styles/vancouver.csl +376 -0
- commonmeta/schema_utils.py +27 -0
- commonmeta/translators.py +47 -0
- commonmeta/utils.py +1108 -0
- commonmeta/writers/__init__.py +1 -0
- commonmeta/writers/bibtex_writer.py +149 -0
- commonmeta/writers/citation_writer.py +70 -0
- commonmeta/writers/commonmeta_writer.py +68 -0
- commonmeta/writers/crossref_xml_writer.py +17 -0
- commonmeta/writers/csl_writer.py +79 -0
- commonmeta/writers/datacite_writer.py +193 -0
- commonmeta/writers/inveniordm_writer.py +94 -0
- commonmeta/writers/ris_writer.py +58 -0
- commonmeta/writers/schema_org_writer.py +146 -0
- {commonmeta_py-0.22.dist-info → commonmeta_py-0.24.dist-info}/METADATA +56 -45
- commonmeta_py-0.24.dist-info/RECORD +75 -0
- {commonmeta_py-0.22.dist-info → commonmeta_py-0.24.dist-info}/WHEEL +1 -1
- commonmeta_py-0.24.dist-info/entry_points.txt +3 -0
- commonmeta_py/__init__.py +0 -2
- commonmeta_py-0.22.dist-info/RECORD +0 -5
- {commonmeta_py-0.22.dist-info/licenses → commonmeta_py-0.24.dist-info}/LICENSE +0 -0
@@ -0,0 +1,103 @@
|
|
1
|
+
"""RIS reader for commonmeta-py"""
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
from ..utils import compact, normalize_url, wrap
|
5
|
+
from ..base_utils import presence
|
6
|
+
from ..author_utils import get_authors
|
7
|
+
from ..date_utils import get_date_from_parts
|
8
|
+
from ..doi_utils import normalize_doi, doi_from_url
|
9
|
+
from ..constants import RIS_TO_CM_TRANSLATIONS, Commonmeta
|
10
|
+
|
11
|
+
|
12
|
+
def read_ris(data: Optional[str], **kwargs) -> Commonmeta:
    """Parse RIS-formatted text into a commonmeta dict.

    Tag semantics follow the RIS convention (TY=type, DO=DOI, AU=author,
    PY/Y1=dates, T2=container title, AB=abstract, PB=publisher, KW=keywords).
    Keyword arguments are merged into the result and take precedence.
    """
    meta = ris_meta(data=data)
    read_options = kwargs or {}

    if not isinstance(meta, dict):
        return {"state": "not_found"}

    _id = read_options.get("doi", None) or normalize_doi(meta.get("DO", None))
    _type = RIS_TO_CM_TRANSLATIONS.get(meta.get("TY", None), "Other")

    # RIS authors are plain name strings; wrap them for get_authors
    contributors = get_authors(
        [{"creatorName": name} for name in wrap(meta.get("AU", None))]
    )

    date = {}
    published = meta.get("PY", None)
    if published is not None:
        # dates are slash-separated parts, e.g. "2023/01/15"
        date["published"] = get_date_from_parts(*str(published).split("/"))
    created = meta.get("Y1", None)
    if created is not None:
        date["created"] = get_date_from_parts(*str(created).split("/"))

    abstract = meta.get("AB", None)
    descriptions = (
        [{"description": abstract, "type": "Abstract"}]
        if abstract is not None
        else None
    )

    container = None
    if meta.get("T2", None) is not None:
        container = compact(
            {
                "type": "Journal" if _type == "JournalArticle" else None,
                "title": meta.get("T2", None),
                "volume": meta.get("VL", None),
                "issue": meta.get("IS", None),
                "firstPage": meta.get("SP", None),
                "lastPage": meta.get("EP", None),
            }
        )

    publisher_name = meta.get("PB", None)
    publisher = {"name": publisher_name} if publisher_name is not None else None

    state = "findable" if meta.get("DO", None) or read_options else "not_found"

    return {
        "id": _id,
        "type": _type,
        "doi": doi_from_url(_id),
        "url": normalize_url(meta.get("UR", None)),
        "titles": [{"title": meta.get("T1", None)}],
        "descriptions": descriptions,
        "contributors": presence(contributors),
        "publisher": presence(publisher),
        "container": container,
        "date": date,
        "subjects": wrap(meta.get("KW", None)),
        "language": meta.get("LA", None),
        "state": state,
    } | read_options
|
84
|
+
|
85
|
+
|
86
|
+
def ris_meta(data):
    """Parse RIS tag/value lines into a dict.

    Lines look like ``TY  - JOUR``; lines without a ``-`` separator are
    skipped. A tag that occurs more than once is collected into a list.

    :param data: raw RIS text, or None
    :return: dict mapping RIS tags to a string or a list of strings
    """
    meta = {}
    if data is None:
        return meta
    for line in data.split("\n"):
        # Split only on the FIRST "-" so values that themselves contain
        # hyphens (e.g. DOIs such as 10.1234/abc-def) are not truncated.
        values = line.split("-", 1)
        if len(values) == 1:
            # no tag/value separator on this line
            continue
        key = values[0].strip()
        value = values[1].strip()
        if meta.get(key, None) is None:
            meta[key] = value
        elif isinstance(meta[key], str):
            # second occurrence of a tag: promote to a list AND keep the
            # new value (previously the new value was silently dropped)
            meta[key] = [meta[key], value]
        elif isinstance(meta[key], list):
            meta[key].append(value)
    return meta
|
@@ -0,0 +1,506 @@
|
|
1
|
+
"""schema_org reader for commonmeta-py"""
|
2
|
+
|
3
|
+
from typing import Optional
|
4
|
+
import io
|
5
|
+
import orjson as json
|
6
|
+
from datetime import datetime
|
7
|
+
from collections import defaultdict
|
8
|
+
import httpx
|
9
|
+
from pydash import py_
|
10
|
+
from bs4 import BeautifulSoup
|
11
|
+
import pikepdf
|
12
|
+
|
13
|
+
from ..utils import (
|
14
|
+
dict_to_spdx,
|
15
|
+
normalize_cc_url,
|
16
|
+
from_schema_org,
|
17
|
+
from_schema_org_creators,
|
18
|
+
normalize_id,
|
19
|
+
normalize_ids,
|
20
|
+
normalize_url,
|
21
|
+
name_to_fos,
|
22
|
+
get_language,
|
23
|
+
)
|
24
|
+
from ..readers.crossref_reader import get_crossref
|
25
|
+
from ..readers.datacite_reader import get_datacite
|
26
|
+
from ..base_utils import wrap, compact, presence, parse_attributes, sanitize
|
27
|
+
from ..author_utils import get_authors
|
28
|
+
from ..date_utils import (
|
29
|
+
get_iso8601_date,
|
30
|
+
strip_milliseconds,
|
31
|
+
get_datetime_from_pdf_time,
|
32
|
+
)
|
33
|
+
from ..doi_utils import doi_from_url, get_doi_ra, validate_doi
|
34
|
+
from ..translators import web_translator
|
35
|
+
from ..constants import (
|
36
|
+
SO_TO_CM_TRANSLATIONS,
|
37
|
+
SO_TO_DC_RELATION_TYPES,
|
38
|
+
SO_TO_DC_REVERSE_RELATION_TYPES,
|
39
|
+
Commonmeta,
|
40
|
+
)
|
41
|
+
|
42
|
+
|
43
|
+
def get_schema_org(pid: str, **kwargs) -> dict:
    """Fetch metadata for a pid, preferring embedded schema.org JSON-LD.

    Resolution order: DOIs are delegated to Crossref/DataCite; PDF
    responses fall back to the PDF document info dictionary; otherwise
    HTML meta tags, site-specific translators and JSON-LD blocks are
    merged (later sources win). Returns a dict with at least ``@id``,
    ``@type``, ``state`` and ``via`` keys.
    """
    if pid is None:
        return {"state": "not_found"}
    url = pid

    # if pid represents a DOI, get metadata from Crossref or DataCite
    if doi_from_url(pid):
        return get_doi_meta(doi_from_url(pid))
    try:
        response = httpx.get(url, timeout=10, follow_redirects=True, **kwargs)
    except httpx.ConnectError as error:
        return {
            "@id": url,
            "@type": "WebPage",
            "state": "not_found",
            "via": "schema_org",
            "errors": [str(error)],
        }

    if response.status_code >= 400:
        # map HTTP errors to commonmeta states
        if response.status_code in [404, 410]:
            state = "not_found"
        elif response.status_code in [401, 403]:
            state = "forbidden"
        else:
            state = "bad_request"
        return {"@id": url, "@type": "WebPage", "state": state, "via": "schema_org"}
    elif response.headers.get("content-type") == "application/pdf":
        try:
            pdf = pikepdf.Pdf.open(io.BytesIO(response.content))
            meta = pdf.docinfo if pdf.docinfo else {}
            # a /doi entry in the PDF info takes precedence
            if meta.get("/doi", None) is not None:
                return get_doi_meta(meta.get("/doi"))
            date_modified = (
                get_datetime_from_pdf_time(meta.get("/ModDate"))
                if meta.get("/ModDate", None)
                else None
            )
            name = meta.get("/Title", None)
            return compact(
                {
                    "@id": url,
                    "@type": "DigitalDocument",
                    "via": "schema_org",
                    "name": str(name),
                    "datePublished": date_modified,
                    "dateAccessed": datetime.now().isoformat("T", "seconds")
                    if date_modified is None
                    else None,
                }
            )
        except Exception as error:
            # best-effort PDF parsing: any pikepdf failure degrades to bad_request
            print(error)
            return {
                "@id": url,
                "@type": "WebPage",
                "state": "bad_request",
                "via": "schema_org",
            }

    soup = BeautifulSoup(response.text, "html.parser")

    # load html meta tags
    data = get_html_meta(soup)

    # load site-specific metadata
    data |= web_translator(soup, url)

    # load schema.org metadata. If there are multiple schema.org blocks, load
    # them all, and pick the first one with a supported type
    # (renamed from `list`, which shadowed the builtin)
    json_ld_blocks = [
        json.loads(x.text) for x in soup.find_all("script", type="application/ld+json")
    ]
    json_ld = next(
        (i for i in json_ld_blocks if i.get("@type", None) in SO_TO_CM_TRANSLATIONS),
        None,
    )
    if json_ld is not None:
        data |= json_ld

    # if @id is a DOI, get metadata from Crossref or DataCite
    if validate_doi(data.get("@id", None)):
        return get_doi_meta(data.get("@id", None))

    # if @id is None, use url
    if data.get("@id", None) is None:
        data["@id"] = url

    # if @type is None, use WebSite. Checked independently of @id (the
    # previous elif chain skipped this default whenever @id was also None).
    if data.get("@type", None) is None:
        data["@type"] = "WebSite"

    # author and creator are synonyms
    if data.get("author", None) is None and data.get("creator", None) is not None:
        data["author"] = data["creator"]

    return data | {"via": "schema_org", "state": "findable"}
|
141
|
+
|
142
|
+
|
143
|
+
def read_schema_org(data: Optional[dict], **kwargs) -> Commonmeta:
    """Convert schema.org metadata (as fetched by get_schema_org) to commonmeta.

    Error states from the fetch step are passed through via from_schema_org.
    Keyword arguments are merged into the result and take precedence.
    """
    # `and` binds tighter than `or`: pass through when data is None, or when
    # it is a dict carrying one of the fetch-error states.
    if (
        data is None
        or isinstance(data, dict)
        and data.get("state", None) in ["not_found", "forbidden", "bad_request"]
    ):
        return from_schema_org(data)
    meta = data

    read_options = kwargs or {}

    # prefer @id, fall back to a plain identifier, then normalize
    _id = meta.get("@id", None)
    if _id is None:
        _id = meta.get("identifier", None)
    _id = normalize_id(_id)
    _type = SO_TO_CM_TRANSLATIONS.get(meta.get("@type", None), "WebPage")
    additional_type = meta.get("additionalType", None)
    url = normalize_url(meta.get("url", None)) or _id

    # Authors should be list of objects or strings
    authors = wrap(meta.get("author", None))
    contributors = get_authors(from_schema_org_creators(authors))
    # editors are appended to the contributor list when present
    contrib = presence(
        get_authors(from_schema_org_creators(wrap(meta.get("editor", None))))
    )
    if contrib:
        contributors = contributors + contrib

    # title: schema.org `name` wins over `headline`
    if meta.get("name", None) is not None:
        titles = [{"title": meta.get("name")}]
    elif meta.get("headline", None) is not None:
        titles = [{"title": meta.get("headline")}]
    else:
        titles = None

    date: dict = defaultdict(list)
    date["published"] = strip_milliseconds(meta.get("datePublished", None))
    date["updated"] = strip_milliseconds(meta.get("dateModified", None))
    # if no date is found, use today's date
    # (defaultdict compares equal to a plain dict with the same items)
    if date == {"published": None, "updated": None}:
        date["accessed"] = read_options.get(
            "dateAccessed", None
        ) or datetime.now().isoformat("T", "seconds")

    publisher = meta.get("publisher", None)
    if publisher is not None:
        # keep only the publisher fields commonmeta cares about
        publisher = py_.omit(
            publisher, ["@type", "logo", "url", "disambiguatingDescription"]
        )

    license_ = meta.get("license", None)
    if license_ is not None:
        # normalize Creative Commons URLs, then map to an SPDX license dict
        license_ = normalize_cc_url(license_)
        license_ = dict_to_spdx({"url": license_}) if license_ else None

    # container depends on the resource type
    if _type == "Dataset":
        container_url = parse_attributes(
            from_schema_org(meta.get("includedInDataCatalog", None)),
            content="url",
            first=True,
        )
        container = compact(
            {
                "type": "DataRepository",
                "title": parse_attributes(
                    from_schema_org(meta.get("includedInDataCatalog", None)),
                    content="name",
                    first=True,
                ),
                "identifier": container_url,
                "identifierType": "URL" if container_url is not None else None,
                "volume": meta.get("volumeNumber", None),
                "issue": meta.get("issueNumber", None),
                "firstPage": meta.get("pageStart", None),
                "lastPage": meta.get("pageEnd", None),
            }
        )
    elif _type == "Article":
        issn = py_.get(meta, "isPartOf.issn")
        container_url = py_.get(meta, "publisher.url")
        # identifier prefers ISSN over the publisher URL
        container = compact(
            {
                "type": "Periodical",
                "title": py_.get(meta, "isPartOf.name"),
                "identifier": issn
                if issn is not None
                else container_url
                if container_url is not None
                else None,
                "identifierType": "ISSN"
                if issn is not None
                else "URL"
                if container_url is not None
                else None,
            }
        )
    else:
        container = {}

    references = wrap(schema_org_references(meta))
    funding_references = [
        get_funding_reference(i) for i in wrap(meta.get("funder", None))
    ]

    descriptions = [
        {
            "description": sanitize(i),
            "type": "Abstract",
        }
        for i in wrap(meta.get("description"))
    ]

    # convert keywords as comma-separated string into list
    subj = meta.get("keywords", None)
    if isinstance(subj, str):
        subj = subj.lower().split(", ")
    subjects = [name_to_fos(i) for i in wrap(subj)]

    # inLanguage may be a string, a list of strings, or an object
    if isinstance(meta.get("inLanguage"), str):
        language = meta.get("inLanguage")
    elif isinstance(meta.get("inLanguage"), list):
        language = py_.get(meta, "inLanguage.0")
    elif isinstance(meta.get("inLanguage"), dict):
        language = py_.get(meta, "inLanguage.alternateName") or py_.get(
            meta, "inLanguage.name"
        )
    else:
        language = None

    geo_locations = [
        schema_org_geolocation(i) for i in wrap(meta.get("spatialCoverage", None))
    ]
    # no identifier extraction implemented for schema.org input yet
    identifiers = None
    # DOI registration agency for DOIs, otherwise the provider name from metadata
    provider = (
        get_doi_ra(_id)
        if doi_from_url(_id)
        else parse_attributes(meta.get("provider", None), content="name", first=True)
    )
    state = "findable"

    return {
        # required attributes
        "id": _id,
        "type": _type,
        "url": url,
        "contributors": presence(contributors),
        "titles": titles,
        "publisher": publisher,
        "date": compact(date),
        # recommended and optional attributes
        "additional_type": additional_type,
        "subjects": presence(subjects),
        "language": get_language(language),
        "identifiers": identifiers,
        "sizes": None,
        "formats": None,
        "version": meta.get("version", None),
        "license": license_,
        "descriptions": presence(descriptions),
        "geo_locations": presence(geo_locations),
        "funding_references": presence(funding_references),
        "references": presence(references),
        # optional attributes
        "container": container,
        "provider": provider,
        "state": state,
    } | read_options
|
311
|
+
|
312
|
+
|
313
|
+
def get_doi_meta(doi: str) -> Optional[dict]:
    """Fetch DOI metadata from its registration agency (Crossref or DataCite)."""
    handlers = {"Crossref": get_crossref, "DataCite": get_datacite}
    handler = handlers.get(get_doi_ra(doi))
    return handler(doi) if handler is not None else None
|
321
|
+
|
322
|
+
|
323
|
+
def schema_org_related_item(meta, relation_type=None):
    """Related items for the given schema.org property.

    Returns the normalized identifiers for ``meta[relation_type]``.
    Fixed: the normalize_ids result was previously computed and discarded
    (missing ``return``), so every caller received None.
    """
    return normalize_ids(
        ids=wrap(meta.get(relation_type, None)),
        relation_type=SO_TO_DC_RELATION_TYPES.get(relation_type),
    )
|
329
|
+
|
330
|
+
|
331
|
+
def schema_org_reverse_related_item(meta, relation_type=None):
    """Reverse related items, read from the JSON-LD ``@reverse`` map.

    Fixed: the normalize_ids result was previously computed and discarded
    (missing ``return``), so every caller received None.
    """
    return normalize_ids(
        ids=wrap(py_.get(meta, f"@reverse.{relation_type}")),
        relation_type=SO_TO_DC_REVERSE_RELATION_TYPES.get(relation_type),
    )
|
337
|
+
|
338
|
+
|
339
|
+
def schema_org_is_identical_to(meta):
    """isIdenticalTo is a special case because it can be a string or an object."""
    # Fixed: missing return made this always yield None.
    return schema_org_related_item(meta, relation_type="sameAs")
|
342
|
+
|
343
|
+
|
344
|
+
def schema_org_is_part_of(meta):
    """isPartOf is a special case because it can be a string or an object."""
    # Fixed: missing return made this always yield None.
    return schema_org_related_item(meta, relation_type="isPartOf")
|
347
|
+
|
348
|
+
|
349
|
+
def schema_org_has_part(meta):
    """hasPart is a special case because it can be a string or an object."""
    # Fixed: missing return made this always yield None.
    return schema_org_related_item(meta, relation_type="hasPart")
|
352
|
+
|
353
|
+
|
354
|
+
def schema_org_is_previous_version_of(meta):
    """isPreviousVersionOf is a special case because it can be a string or an object."""
    # Fixed: missing return made this always yield None.
    # NOTE(review): "PredecessorOf" does not look like a schema.org property
    # name (cf. "sameAs", "isPartOf" used by the siblings) — confirm the key.
    return schema_org_related_item(meta, relation_type="PredecessorOf")
|
357
|
+
|
358
|
+
|
359
|
+
def schema_org_is_new_version_of(meta):
    """isNewVersionOf is a special case because it can be a string or an object."""
    # Fixed: missing return made this always yield None.
    # NOTE(review): "SuccessorOf" does not look like a schema.org property
    # name (cf. "sameAs", "isPartOf" used by the siblings) — confirm the key.
    return schema_org_related_item(meta, relation_type="SuccessorOf")
|
362
|
+
|
363
|
+
|
364
|
+
def schema_org_references(meta):
    """references is a special case because it can be a string or an object."""
    # Fixed: missing return made this always yield None, so read_schema_org's
    # `wrap(schema_org_references(meta))` was always empty.
    return schema_org_related_item(meta, relation_type="citation")
|
367
|
+
|
368
|
+
|
369
|
+
def schema_org_is_referenced_by(meta):
    """isReferencedBy is a special case because it can be a string or an object."""
    # Fixed: missing return made this always yield None.
    return schema_org_reverse_related_item(meta, relation_type="citation")
|
372
|
+
|
373
|
+
|
374
|
+
def schema_org_is_supplement_to(meta):
    """isSupplementTo is a special case because it can be a string or an object."""
    # Fixed: missing return made this always yield None.
    return schema_org_reverse_related_item(meta, relation_type="isBasedOn")
|
377
|
+
|
378
|
+
|
379
|
+
def schema_org_is_supplemented_by(meta):
    """isSupplementedBy is a special case because it can be a string or an object."""
    # Fixed: missing return made this always yield None.
    return schema_org_related_item(meta, relation_type="isBasedOn")
|
382
|
+
|
383
|
+
|
384
|
+
def schema_org_geolocation(geo_location: Optional[dict]) -> Optional[dict]:
    """Convert a schema.org spatialCoverage entry to a geoLocationPoint dict.

    Only GeoCoordinates entries are supported; anything else yields None.
    """
    if not isinstance(geo_location, dict):
        return None

    geo = geo_location.get("geo", None)
    if not isinstance(geo, dict) or geo.get("@type", None) != "GeoCoordinates":
        return None

    return {
        "geoLocationPoint": {
            "pointLongitude": geo.get("longitude", None),
            "pointLatitude": geo.get("latitude", None),
        }
    }
|
398
|
+
|
399
|
+
|
400
|
+
def get_html_meta(soup):
    """Get metadata from HTML meta tags.

    Reads citation_*, dc.*, OpenGraph and Twitter meta tags from a
    BeautifulSoup document and returns a schema.org-like dict.
    """
    data = {}
    pid = (
        soup.select_one("meta[name='citation_doi']")
        or soup.select_one("meta[name='dc.identifier']")
        or soup.select_one("meta[name='DC.identifier']")
        or soup.select_one("meta[name='bepress_citation_doi']")
        or soup.select_one('[rel="canonical"]')
    )
    if pid is not None:
        # meta tags carry the value in content=, <link rel=canonical> in href=
        pid = pid.get("content", None) or pid.get("href", None)
        data["@id"] = normalize_id(pid)

    _type = (
        soup.select_one("meta[property='og:type']")
        or soup.select_one("meta[name='dc.type']")
        or soup.select_one("meta[name='DC.type']")
    )
    data["@type"] = _type["content"].capitalize() if _type else None

    url = soup.select_one("meta[property='og:url']") or soup.select_one(
        "meta[name='twitter:url']"
    )
    data["url"] = url["content"] if url else None
    if pid is None and url is not None:
        data["@id"] = url["content"]

    title = (
        soup.select_one("meta[name='citation_title']")
        or soup.select_one("meta[name='dc.title']")
        or soup.select_one("meta[name='DC.title']")
        or soup.select_one("meta[property='og:title']")
        or soup.select_one("meta[name='twitter:title']")
    )
    data["name"] = title["content"] if title else None

    author = soup.select("meta[name='citation_author']")
    data["author"] = [i["content"] for i in author] if author else None

    # Fixed: the og:description/twitter:description fallbacks were nested
    # inside the select_one() string argument ("..." or ...), so they were
    # never evaluated; only the first two selectors ever ran.
    description = (
        soup.select_one("meta[name='citation_abstract']")
        or soup.select_one("meta[name='dc.description']")
        or soup.select_one("meta[property='og:description']")
        or soup.select_one("meta[name='twitter:description']")
    )
    data["description"] = description["content"] if description else None

    keywords = soup.select_one("meta[name='citation_keywords']")
    data["keywords"] = (
        str(keywords["content"]).replace(";", ",").rstrip(", ") if keywords else None
    )

    date_published = (
        soup.select_one("meta[name='citation_publication_date']")
        or soup.select_one("meta[name='dc.date']")
        or soup.select_one("meta[property='article:published_time']")
    )
    data["datePublished"] = (
        get_iso8601_date(date_published["content"]) if date_published else None
    )
    # Fixed: article:modified_time was nested inside the select_one()
    # string argument and therefore never consulted as a fallback.
    date_modified = soup.select_one(
        "meta[property='og:updated_time']"
    ) or soup.select_one("meta[property='article:modified_time']")
    data["dateModified"] = (
        get_iso8601_date(date_modified["content"]) if date_modified else None
    )
    license_ = soup.select_one("meta[name='dc.rights']")
    data["license"] = license_["content"] if license_ else None

    lang = soup.select_one("meta[name='dc.language']") or soup.select_one(
        "meta[name='citation_language']"
    )
    if lang is not None:
        data["inLanguage"] = lang["content"]
    else:
        # fall back to the lang attribute of the <html> element
        html = soup.select_one("html")
        if html is not None:
            lang = html.get("lang", None)
            if lang is not None:
                data["inLanguage"] = lang

    publisher = soup.select_one("meta[property='og:site_name']")
    data["publisher"] = {"name": publisher["content"]} if publisher else None

    name = soup.select_one("meta[property='og:site_name']")
    issn = soup.select_one("meta[name='citation_issn']")
    data["isPartOf"] = compact(
        {
            "name": name["content"] if name else None,
            "issn": issn["content"] if issn else None,
        }
    )
    return data
|
494
|
+
|
495
|
+
|
496
|
+
def get_funding_reference(dct):
    """Map a schema.org funder object to a commonmeta funding reference."""
    funder_id = dct.get("@id", None)
    reference = {
        "funderName": dct.get("name", None),
        "funderIdentifier": funder_id,
        "funderIdentifierType": "Crossref Funder ID" if funder_id else None,
    }
    return compact(reference)
|