commonmeta-py 0.23__py3-none-any.whl → 0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. commonmeta/__init__.py +96 -0
  2. commonmeta/api_utils.py +77 -0
  3. commonmeta/author_utils.py +260 -0
  4. commonmeta/base_utils.py +121 -0
  5. commonmeta/cli.py +200 -0
  6. commonmeta/constants.py +587 -0
  7. commonmeta/crossref_utils.py +575 -0
  8. commonmeta/date_utils.py +193 -0
  9. commonmeta/doi_utils.py +273 -0
  10. commonmeta/metadata.py +320 -0
  11. commonmeta/readers/__init__.py +1 -0
  12. commonmeta/readers/cff_reader.py +199 -0
  13. commonmeta/readers/codemeta_reader.py +112 -0
  14. commonmeta/readers/commonmeta_reader.py +13 -0
  15. commonmeta/readers/crossref_reader.py +409 -0
  16. commonmeta/readers/crossref_xml_reader.py +505 -0
  17. commonmeta/readers/csl_reader.py +98 -0
  18. commonmeta/readers/datacite_reader.py +390 -0
  19. commonmeta/readers/datacite_xml_reader.py +359 -0
  20. commonmeta/readers/inveniordm_reader.py +218 -0
  21. commonmeta/readers/json_feed_reader.py +420 -0
  22. commonmeta/readers/kbase_reader.py +205 -0
  23. commonmeta/readers/ris_reader.py +103 -0
  24. commonmeta/readers/schema_org_reader.py +506 -0
  25. commonmeta/resources/cff_v1.2.0.json +1827 -0
  26. commonmeta/resources/commonmeta_v0.12.json +601 -0
  27. commonmeta/resources/commonmeta_v0.13.json +559 -0
  28. commonmeta/resources/commonmeta_v0.14.json +573 -0
  29. commonmeta/resources/crossref/AccessIndicators.xsd +47 -0
  30. commonmeta/resources/crossref/JATS-journalpublishing1-3d2-mathml3-elements.xsd +10130 -0
  31. commonmeta/resources/crossref/JATS-journalpublishing1-3d2-mathml3.xsd +48 -0
  32. commonmeta/resources/crossref/JATS-journalpublishing1-elements.xsd +8705 -0
  33. commonmeta/resources/crossref/JATS-journalpublishing1-mathml3-elements.xsd +8608 -0
  34. commonmeta/resources/crossref/JATS-journalpublishing1-mathml3.xsd +49 -0
  35. commonmeta/resources/crossref/JATS-journalpublishing1.xsd +6176 -0
  36. commonmeta/resources/crossref/clinicaltrials.xsd +61 -0
  37. commonmeta/resources/crossref/common5.3.1.xsd +1538 -0
  38. commonmeta/resources/crossref/crossref5.3.1.xsd +1949 -0
  39. commonmeta/resources/crossref/crossref_query_output3.0.xsd +1097 -0
  40. commonmeta/resources/crossref/fundref.xsd +49 -0
  41. commonmeta/resources/crossref/module-ali.xsd +39 -0
  42. commonmeta/resources/crossref/relations.xsd +444 -0
  43. commonmeta/resources/crossref-v0.2.json +60 -0
  44. commonmeta/resources/csl-data.json +538 -0
  45. commonmeta/resources/datacite-v4.5.json +829 -0
  46. commonmeta/resources/datacite-v4.5pr.json +608 -0
  47. commonmeta/resources/ietf-bcp-47.json +3025 -0
  48. commonmeta/resources/iso-8601.json +3182 -0
  49. commonmeta/resources/spdx/licenses.json +4851 -0
  50. commonmeta/resources/spdx-schema..json +903 -0
  51. commonmeta/resources/styles/apa.csl +1697 -0
  52. commonmeta/resources/styles/chicago-author-date.csl +684 -0
  53. commonmeta/resources/styles/harvard-cite-them-right.csl +321 -0
  54. commonmeta/resources/styles/ieee.csl +468 -0
  55. commonmeta/resources/styles/modern-language-association.csl +341 -0
  56. commonmeta/resources/styles/vancouver.csl +376 -0
  57. commonmeta/schema_utils.py +27 -0
  58. commonmeta/translators.py +47 -0
  59. commonmeta/utils.py +1108 -0
  60. commonmeta/writers/__init__.py +1 -0
  61. commonmeta/writers/bibtex_writer.py +149 -0
  62. commonmeta/writers/citation_writer.py +70 -0
  63. commonmeta/writers/commonmeta_writer.py +68 -0
  64. commonmeta/writers/crossref_xml_writer.py +17 -0
  65. commonmeta/writers/csl_writer.py +79 -0
  66. commonmeta/writers/datacite_writer.py +193 -0
  67. commonmeta/writers/inveniordm_writer.py +94 -0
  68. commonmeta/writers/ris_writer.py +58 -0
  69. commonmeta/writers/schema_org_writer.py +146 -0
  70. {commonmeta_py-0.23.dist-info → commonmeta_py-0.24.dist-info}/METADATA +56 -45
  71. commonmeta_py-0.24.dist-info/RECORD +75 -0
  72. {commonmeta_py-0.23.dist-info → commonmeta_py-0.24.dist-info}/WHEEL +1 -1
  73. commonmeta_py-0.24.dist-info/entry_points.txt +3 -0
  74. commonmeta_py-0.23.dist-info/RECORD +0 -5
  75. /commonmeta_py/__init__.py → /commonmeta/readers/bibtex_reader.py +0 -0
  76. {commonmeta_py-0.23.dist-info/licenses → commonmeta_py-0.24.dist-info}/LICENSE +0 -0
@@ -0,0 +1,103 @@
1
+ """RIS reader for commonmeta-py"""
2
+ from typing import Optional
3
+
4
+ from ..utils import compact, normalize_url, wrap
5
+ from ..base_utils import presence
6
+ from ..author_utils import get_authors
7
+ from ..date_utils import get_date_from_parts
8
+ from ..doi_utils import normalize_doi, doi_from_url
9
+ from ..constants import RIS_TO_CM_TRANSLATIONS, Commonmeta
10
+
11
+
12
def read_ris(data: Optional[str], **kwargs) -> Commonmeta:
    """Parse a RIS-formatted string into a Commonmeta dict.

    Any keyword arguments are treated as read options and are merged into
    the returned dict, overriding extracted attributes (e.g. ``doi``).
    """
    meta = ris_meta(data=data)
    read_options = kwargs or {}

    # ris_meta returns a dict on success; anything else means unreadable input
    if not isinstance(meta, dict):
        return {"state": "not_found"}

    _id = read_options.get("doi", None) or normalize_doi(meta.get("DO", None))
    _type = RIS_TO_CM_TRANSLATIONS.get(meta.get("TY", None), "Other")
    container_type = "Journal" if _type == "JournalArticle" else None

    # RIS AU tags carry plain name strings; wrap them for get_authors
    contributors = get_authors(
        [{"creatorName": author} for author in wrap(meta.get("AU", None))]
    )

    # PY -> published, Y1 -> created; RIS dates use "/" separated parts
    date = {}
    for ris_tag, date_key in (("PY", "published"), ("Y1", "created")):
        value = meta.get(ris_tag, None)
        if value is not None:
            date[date_key] = get_date_from_parts(*str(value).split("/"))

    abstract = meta.get("AB", None)
    if abstract is not None:
        descriptions = [{"description": abstract, "type": "Abstract"}]
    else:
        descriptions = None

    # T2 (secondary title) marks membership in a container (e.g. a journal)
    if meta.get("T2", None) is not None:
        container = compact(
            {
                "type": container_type,
                "title": meta.get("T2", None),
                "volume": meta.get("VL", None),
                "issue": meta.get("IS", None),
                "firstPage": meta.get("SP", None),
                "lastPage": meta.get("EP", None),
            }
        )
    else:
        container = None

    if meta.get("PB", None) is not None:
        publisher = {"name": meta.get("PB")}
    else:
        publisher = None

    subjects = wrap(meta.get("KW", None))
    state = "findable" if meta.get("DO", None) or read_options else "not_found"

    return {
        "id": _id,
        "type": _type,
        "doi": doi_from_url(_id),
        "url": normalize_url(meta.get("UR", None)),
        "titles": [{"title": meta.get("T1", None)}],
        "descriptions": descriptions,
        "contributors": presence(contributors),
        "publisher": presence(publisher),
        "container": container,
        "date": date,
        "subjects": subjects,
        "language": meta.get("LA", None),
        "state": state,
    } | read_options
84
+
85
+
86
def ris_meta(data):
    """Parse RIS tag lines ("XX  - value") into a dict.

    Repeated tags (e.g. multiple AU lines) accumulate into a list; single
    tags map to their string value. Lines without a "-" separator are
    skipped. Returns {} for None input.

    Fixes two bugs in the previous version: a maxsplit of 2 truncated any
    value containing a hyphen (e.g. "Smith-Jones" became "Smith"), and the
    second occurrence of a repeated tag was silently dropped during the
    str-to-list promotion.
    """
    meta = {}
    if data is None:
        return meta
    for line in data.split("\n"):
        # split only on the first "-" so hyphens inside values survive
        values = line.split("-", 1)
        if len(values) == 1:
            continue
        key = values[0].strip()
        value = values[1].strip()
        if meta.get(key, None) is None:
            meta[key] = value
        elif isinstance(meta[key], str):
            # promote to a list, keeping the new value as well
            meta[key] = [meta[key], value]
        elif isinstance(meta[key], list):
            meta[key].append(value)

    return meta
@@ -0,0 +1,506 @@
1
+ """schema_org reader for commonmeta-py"""
2
+
3
+ from typing import Optional
4
+ import io
5
+ import orjson as json
6
+ from datetime import datetime
7
+ from collections import defaultdict
8
+ import httpx
9
+ from pydash import py_
10
+ from bs4 import BeautifulSoup
11
+ import pikepdf
12
+
13
+ from ..utils import (
14
+ dict_to_spdx,
15
+ normalize_cc_url,
16
+ from_schema_org,
17
+ from_schema_org_creators,
18
+ normalize_id,
19
+ normalize_ids,
20
+ normalize_url,
21
+ name_to_fos,
22
+ get_language,
23
+ )
24
+ from ..readers.crossref_reader import get_crossref
25
+ from ..readers.datacite_reader import get_datacite
26
+ from ..base_utils import wrap, compact, presence, parse_attributes, sanitize
27
+ from ..author_utils import get_authors
28
+ from ..date_utils import (
29
+ get_iso8601_date,
30
+ strip_milliseconds,
31
+ get_datetime_from_pdf_time,
32
+ )
33
+ from ..doi_utils import doi_from_url, get_doi_ra, validate_doi
34
+ from ..translators import web_translator
35
+ from ..constants import (
36
+ SO_TO_CM_TRANSLATIONS,
37
+ SO_TO_DC_RELATION_TYPES,
38
+ SO_TO_DC_REVERSE_RELATION_TYPES,
39
+ Commonmeta,
40
+ )
41
+
42
+
43
def get_schema_org(pid: str, **kwargs) -> dict:
    """Fetch metadata for a pid (DOI or URL) as a schema.org-flavored dict.

    DOIs are delegated to Crossref/DataCite via get_doi_meta. PDF responses
    are inspected for document-info metadata. HTML responses are scraped for
    meta tags, site-specific metadata and embedded JSON-LD blocks.
    Extra kwargs are passed through to httpx.get.
    """
    if pid is None:
        return {"state": "not_found"}
    url = pid

    # if pid represents a DOI, get metadata from Crossref or DataCite
    if doi_from_url(pid):
        return get_doi_meta(doi_from_url(pid))
    try:
        response = httpx.get(url, timeout=10, follow_redirects=True, **kwargs)
    except (httpx.ConnectError, httpx.TimeoutException) as error:
        # network failure or timeout: report as not_found instead of raising
        # (previously only ConnectError was caught, so timeouts propagated)
        return {
            "@id": url,
            "@type": "WebPage",
            "state": "not_found",
            "via": "schema_org",
            "errors": [str(error)],
        }

    if response.status_code >= 400:
        if response.status_code in [404, 410]:
            state = "not_found"
        elif response.status_code in [401, 403]:
            state = "forbidden"
        else:
            state = "bad_request"
        return {"@id": url, "@type": "WebPage", "state": state, "via": "schema_org"}
    elif response.headers.get("content-type") == "application/pdf":
        try:
            pdf = pikepdf.Pdf.open(io.BytesIO(response.content))
            meta = pdf.docinfo if pdf.docinfo else {}
            if meta.get("/doi", None) is not None:
                return get_doi_meta(meta.get("/doi"))
            date_modified = (
                get_datetime_from_pdf_time(meta.get("/ModDate"))
                if meta.get("/ModDate", None)
                else None
            )
            name = meta.get("/Title", None)
            return compact(
                {
                    "@id": url,
                    "@type": "DigitalDocument",
                    "via": "schema_org",
                    # guard against str(None) producing the literal "None"
                    "name": str(name) if name is not None else None,
                    # NOTE(review): uses the PDF modification date as
                    # datePublished — confirm this is intended
                    "datePublished": date_modified,
                    "dateAccessed": datetime.now().isoformat("T", "seconds")
                    if date_modified is None
                    else None,
                }
            )
        except Exception as error:
            print(error)
            return {
                "@id": url,
                "@type": "WebPage",
                "state": "bad_request",
                "via": "schema_org",
            }

    soup = BeautifulSoup(response.text, "html.parser")

    # load html meta tags
    data = get_html_meta(soup)

    # load site-specific metadata
    data |= web_translator(soup, url)

    # load schema.org metadata. If there are multiple schema.org blocks, load
    # them all and pick the first one with a supported type. Malformed JSON-LD
    # is skipped rather than failing the whole page.
    json_ld_blocks = []
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            json_ld_blocks.append(json.loads(script.text))
        except ValueError:  # orjson.JSONDecodeError subclasses ValueError
            continue
    json_ld = next(
        (
            block
            for block in json_ld_blocks
            if isinstance(block, dict)
            and block.get("@type", None) in SO_TO_CM_TRANSLATIONS
        ),
        None,
    )
    if json_ld is not None:
        data |= json_ld

    # if @id is a DOI, get metadata from Crossref or DataCite
    if validate_doi(data.get("@id", None)):
        return get_doi_meta(data.get("@id", None))

    # if @id is None, use url
    if data.get("@id", None) is None:
        data["@id"] = url

    # if @type is None, use WebSite. Checked independently of @id: the
    # previous elif chain skipped this default whenever @id was also missing.
    if data.get("@type", None) is None:
        data["@type"] = "WebSite"

    # author and creator are synonyms
    if data.get("author", None) is None and data.get("creator", None) is not None:
        data["author"] = data["creator"]

    return data | {"via": "schema_org", "state": "findable"}
141
+
142
+
143
def read_schema_org(data: Optional[dict], **kwargs) -> Commonmeta:
    """Convert schema.org metadata (as produced by get_schema_org) to Commonmeta.

    Error payloads (state not_found/forbidden/bad_request) and None are
    passed straight to from_schema_org. Keyword arguments are read options
    and are merged into the returned dict, overriding extracted attributes.
    """
    # error / empty payloads are handed back unchanged
    if (
        data is None
        or isinstance(data, dict)
        and data.get("state", None) in ["not_found", "forbidden", "bad_request"]
    ):
        return from_schema_org(data)
    meta = data

    read_options = kwargs or {}

    # prefer @id, fall back to identifier, then normalize to DOI/URL form
    _id = meta.get("@id", None)
    if _id is None:
        _id = meta.get("identifier", None)
    _id = normalize_id(_id)
    _type = SO_TO_CM_TRANSLATIONS.get(meta.get("@type", None), "WebPage")
    additional_type = meta.get("additionalType", None)
    url = normalize_url(meta.get("url", None)) or _id

    # Authors should be list of objects or strings
    authors = wrap(meta.get("author", None))
    contributors = get_authors(from_schema_org_creators(authors))
    # editors are appended to the contributor list after authors
    contrib = presence(
        get_authors(from_schema_org_creators(wrap(meta.get("editor", None))))
    )
    if contrib:
        contributors = contributors + contrib

    # "name" wins over "headline" for the title
    if meta.get("name", None) is not None:
        titles = [{"title": meta.get("name")}]
    elif meta.get("headline", None) is not None:
        titles = [{"title": meta.get("headline")}]
    else:
        titles = None

    date: dict = defaultdict(list)
    date["published"] = strip_milliseconds(meta.get("datePublished", None))
    date["updated"] = strip_milliseconds(meta.get("dateModified", None))
    # if no date is found, use today's date
    # (defaultdict compares equal to a plain dict with the same items)
    if date == {"published": None, "updated": None}:
        date["accessed"] = read_options.get(
            "dateAccessed", None
        ) or datetime.now().isoformat("T", "seconds")

    # keep only the publisher fields commonmeta cares about
    publisher = meta.get("publisher", None)
    if publisher is not None:
        publisher = py_.omit(
            publisher, ["@type", "logo", "url", "disambiguatingDescription"]
        )

    # normalize Creative Commons URLs, then map to an SPDX license dict
    license_ = meta.get("license", None)
    if license_ is not None:
        license_ = normalize_cc_url(license_)
        license_ = dict_to_spdx({"url": license_}) if license_ else None

    if _type == "Dataset":
        # datasets live in a data catalog (repository) container
        container_url = parse_attributes(
            from_schema_org(meta.get("includedInDataCatalog", None)),
            content="url",
            first=True,
        )
        container = compact(
            {
                "type": "DataRepository",
                "title": parse_attributes(
                    from_schema_org(meta.get("includedInDataCatalog", None)),
                    content="name",
                    first=True,
                ),
                "identifier": container_url,
                "identifierType": "URL" if container_url is not None else None,
                "volume": meta.get("volumeNumber", None),
                "issue": meta.get("issueNumber", None),
                "firstPage": meta.get("pageStart", None),
                "lastPage": meta.get("pageEnd", None),
            }
        )
    elif _type == "Article":
        # articles live in a periodical; identify it by ISSN, else by URL
        issn = py_.get(meta, "isPartOf.issn")
        container_url = py_.get(meta, "publisher.url")
        container = compact(
            {
                "type": "Periodical",
                "title": py_.get(meta, "isPartOf.name"),
                "identifier": issn
                if issn is not None
                else container_url
                if container_url is not None
                else None,
                "identifierType": "ISSN"
                if issn is not None
                else "URL"
                if container_url is not None
                else None,
            }
        )
    else:
        container = {}

    references = wrap(schema_org_references(meta))
    funding_references = [
        get_funding_reference(i) for i in wrap(meta.get("funder", None))
    ]

    descriptions = [
        {
            "description": sanitize(i),
            "type": "Abstract",
        }
        for i in wrap(meta.get("description"))
    ]

    # convert keywords as comma-separated string into list
    subj = meta.get("keywords", None)
    if isinstance(subj, str):
        subj = subj.lower().split(", ")
    subjects = [name_to_fos(i) for i in wrap(subj)]

    # inLanguage may be a string, a list of strings, or an object
    if isinstance(meta.get("inLanguage"), str):
        language = meta.get("inLanguage")
    elif isinstance(meta.get("inLanguage"), list):
        language = py_.get(meta, "inLanguage.0")
    elif isinstance(meta.get("inLanguage"), dict):
        language = py_.get(meta, "inLanguage.alternateName") or py_.get(
            meta, "inLanguage.name"
        )
    else:
        language = None

    geo_locations = [
        schema_org_geolocation(i) for i in wrap(meta.get("spatialCoverage", None))
    ]
    identifiers = None
    # provider: the DOI registration agency for DOIs, else the stated provider
    provider = (
        get_doi_ra(_id)
        if doi_from_url(_id)
        else parse_attributes(meta.get("provider", None), content="name", first=True)
    )
    state = "findable"

    return {
        # required attributes
        "id": _id,
        "type": _type,
        "url": url,
        "contributors": presence(contributors),
        "titles": titles,
        "publisher": publisher,
        "date": compact(date),
        # recommended and optional attributes
        "additional_type": additional_type,
        "subjects": presence(subjects),
        "language": get_language(language),
        "identifiers": identifiers,
        "sizes": None,
        "formats": None,
        "version": meta.get("version", None),
        "license": license_,
        "descriptions": presence(descriptions),
        "geo_locations": presence(geo_locations),
        "funding_references": presence(funding_references),
        "references": presence(references),
        # optional attributes
        "container": container,
        "provider": provider,
        "state": state,
    } | read_options
311
+
312
+
313
def get_doi_meta(doi: str) -> Optional[dict]:
    """Fetch DOI metadata from its registration agency (Crossref or DataCite).

    Returns None when the agency is unknown or unsupported.
    """
    fetchers = {"Crossref": get_crossref, "DataCite": get_datacite}
    fetcher = fetchers.get(get_doi_ra(doi))
    return fetcher(doi) if fetcher is not None else None
321
+
322
+
323
def schema_org_related_item(meta, relation_type=None):
    """Return related identifiers for a forward schema.org relation.

    Fixed: the computed value was previously discarded (missing return),
    so every related-item helper always returned None and callers such as
    read_schema_org saw no references.
    """
    return normalize_ids(
        ids=wrap(meta.get(relation_type, None)),
        relation_type=SO_TO_DC_RELATION_TYPES.get(relation_type),
    )


def schema_org_reverse_related_item(meta, relation_type=None):
    """Return related identifiers for a reverse (@reverse) schema.org relation."""
    return normalize_ids(
        ids=wrap(py_.get(meta, f"@reverse.{relation_type}")),
        relation_type=SO_TO_DC_REVERSE_RELATION_TYPES.get(relation_type),
    )


def schema_org_is_identical_to(meta):
    """isIdenticalTo is a special case because it can be a string or an object."""
    return schema_org_related_item(meta, relation_type="sameAs")


def schema_org_is_part_of(meta):
    """isPartOf is a special case because it can be a string or an object."""
    return schema_org_related_item(meta, relation_type="isPartOf")


def schema_org_has_part(meta):
    """hasPart is a special case because it can be a string or an object."""
    return schema_org_related_item(meta, relation_type="hasPart")


def schema_org_is_previous_version_of(meta):
    """isPreviousVersionOf is a special case because it can be a string or an object."""
    # NOTE(review): schema.org spells this "predecessorOf" (lowercase p) —
    # confirm the key casing against real payloads
    return schema_org_related_item(meta, relation_type="PredecessorOf")


def schema_org_is_new_version_of(meta):
    """isNewVersionOf is a special case because it can be a string or an object."""
    # NOTE(review): schema.org spells this "successorOf" (lowercase s) —
    # confirm the key casing against real payloads
    return schema_org_related_item(meta, relation_type="SuccessorOf")


def schema_org_references(meta):
    """references is a special case because it can be a string or an object."""
    return schema_org_related_item(meta, relation_type="citation")


def schema_org_is_referenced_by(meta):
    """isReferencedBy is a special case because it can be a string or an object."""
    return schema_org_reverse_related_item(meta, relation_type="citation")


def schema_org_is_supplement_to(meta):
    """isSupplementTo is a special case because it can be a string or an object."""
    return schema_org_reverse_related_item(meta, relation_type="isBasedOn")


def schema_org_is_supplemented_by(meta):
    """isSupplementedBy is a special case because it can be a string or an object."""
    return schema_org_related_item(meta, relation_type="isBasedOn")
382
+
383
+
384
def schema_org_geolocation(geo_location: Optional[dict]) -> Optional[dict]:
    """Convert a schema.org spatialCoverage entry into a geoLocationPoint dict.

    Returns None for anything other than a dict whose "geo" member is a
    GeoCoordinates object. Uses plain dict access instead of pydash for this
    trivial path lookup (same behavior, one fewer dependency).
    """
    if not isinstance(geo_location, dict):
        return None

    geo = geo_location.get("geo", None)
    # non-dict "geo" values (missing, string, list) yield None, matching the
    # previous py_.get("geo.@type") behavior
    if not isinstance(geo, dict) or geo.get("@type", None) != "GeoCoordinates":
        return None

    return {
        "geoLocationPoint": {
            "pointLongitude": geo.get("longitude", None),
            "pointLatitude": geo.get("latitude", None),
        }
    }
398
+
399
+
400
def get_html_meta(soup):
    """Extract metadata from HTML meta tags (citation_*, dc.*, og:*, twitter:*).

    Returns a schema.org-style dict with keys such as @id, @type, name,
    author, description, keywords, dates, license, inLanguage, publisher
    and isPartOf.
    """
    data = {}
    # persistent identifier: prefer explicit DOI tags, fall back to canonical
    pid = (
        soup.select_one("meta[name='citation_doi']")
        or soup.select_one("meta[name='dc.identifier']")
        or soup.select_one("meta[name='DC.identifier']")
        or soup.select_one("meta[name='bepress_citation_doi']")
        or soup.select_one('[rel="canonical"]')
    )
    if pid is not None:
        pid = pid.get("content", None) or pid.get("href", None)
        data["@id"] = normalize_id(pid)

    _type = (
        soup.select_one("meta[property='og:type']")
        or soup.select_one("meta[name='dc.type']")
        or soup.select_one("meta[name='DC.type']")
    )
    data["@type"] = _type["content"].capitalize() if _type else None

    url = soup.select_one("meta[property='og:url']") or soup.select_one(
        "meta[name='twitter:url']"
    )
    data["url"] = url["content"] if url else None
    if pid is None and url is not None:
        data["@id"] = url["content"]

    title = (
        soup.select_one("meta[name='citation_title']")
        or soup.select_one("meta[name='dc.title']")
        or soup.select_one("meta[name='DC.title']")
        or soup.select_one("meta[property='og:title']")
        or soup.select_one("meta[name='twitter:title']")
    )
    data["name"] = title["content"] if title else None

    author = soup.select("meta[name='citation_author']")
    data["author"] = [i["content"] for i in author] if author else None

    # fixed: the og:/twitter: fallbacks were accidentally nested inside the
    # dc.description selector-string argument and were never consulted
    description = (
        soup.select_one("meta[name='citation_abstract']")
        or soup.select_one("meta[name='dc.description']")
        or soup.select_one("meta[property='og:description']")
        or soup.select_one("meta[name='twitter:description']")
    )
    data["description"] = description["content"] if description else None

    keywords = soup.select_one("meta[name='citation_keywords']")
    data["keywords"] = (
        str(keywords["content"]).replace(";", ",").rstrip(", ") if keywords else None
    )

    date_published = (
        soup.select_one("meta[name='citation_publication_date']")
        or soup.select_one("meta[name='dc.date']")
        or soup.select_one("meta[property='article:published_time']")
    )
    data["datePublished"] = (
        get_iso8601_date(date_published["content"]) if date_published else None
    )
    # fixed: article:modified_time was nested inside the og:updated_time
    # selector-string argument and was never consulted
    date_modified = soup.select_one(
        "meta[property='og:updated_time']"
    ) or soup.select_one("meta[property='article:modified_time']")
    data["dateModified"] = (
        get_iso8601_date(date_modified["content"]) if date_modified else None
    )
    license_ = soup.select_one("meta[name='dc.rights']")
    data["license"] = license_["content"] if license_ else None

    lang = soup.select_one("meta[name='dc.language']") or soup.select_one(
        "meta[name='citation_language']"
    )
    if lang is not None:
        data["inLanguage"] = lang["content"]
    else:
        # fall back to the lang attribute on the <html> element
        html = soup.select_one("html")
        if html is not None:
            lang = html.get("lang", None)
            if lang is not None:
                data["inLanguage"] = lang

    publisher = soup.select_one("meta[property='og:site_name']")
    data["publisher"] = {"name": publisher["content"]} if publisher else None

    name = soup.select_one("meta[property='og:site_name']")
    issn = soup.select_one("meta[name='citation_issn']")
    data["isPartOf"] = compact(
        {
            "name": name["content"] if name else None,
            "issn": issn["content"] if issn else None,
        }
    )
    return data
494
+
495
+
496
def get_funding_reference(dct):
    """Map a schema.org funder object to a commonmeta funding reference."""
    funder_id = dct.get("@id", None)
    return compact(
        {
            "funderName": dct.get("name", None),
            "funderIdentifier": funder_id,
            "funderIdentifierType": "Crossref Funder ID" if funder_id else None,
        }
    )