commonmeta-py 0.23__py3-none-any.whl → 0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. commonmeta/__init__.py +96 -0
  2. commonmeta/api_utils.py +77 -0
  3. commonmeta/author_utils.py +260 -0
  4. commonmeta/base_utils.py +121 -0
  5. commonmeta/cli.py +200 -0
  6. commonmeta/constants.py +587 -0
  7. commonmeta/crossref_utils.py +575 -0
  8. commonmeta/date_utils.py +193 -0
  9. commonmeta/doi_utils.py +273 -0
  10. commonmeta/metadata.py +320 -0
  11. commonmeta/readers/__init__.py +1 -0
  12. commonmeta/readers/cff_reader.py +199 -0
  13. commonmeta/readers/codemeta_reader.py +112 -0
  14. commonmeta/readers/commonmeta_reader.py +13 -0
  15. commonmeta/readers/crossref_reader.py +409 -0
  16. commonmeta/readers/crossref_xml_reader.py +505 -0
  17. commonmeta/readers/csl_reader.py +98 -0
  18. commonmeta/readers/datacite_reader.py +390 -0
  19. commonmeta/readers/datacite_xml_reader.py +359 -0
  20. commonmeta/readers/inveniordm_reader.py +218 -0
  21. commonmeta/readers/json_feed_reader.py +420 -0
  22. commonmeta/readers/kbase_reader.py +205 -0
  23. commonmeta/readers/ris_reader.py +103 -0
  24. commonmeta/readers/schema_org_reader.py +506 -0
  25. commonmeta/resources/cff_v1.2.0.json +1827 -0
  26. commonmeta/resources/commonmeta_v0.12.json +601 -0
  27. commonmeta/resources/commonmeta_v0.13.json +559 -0
  28. commonmeta/resources/commonmeta_v0.14.json +573 -0
  29. commonmeta/resources/crossref/AccessIndicators.xsd +47 -0
  30. commonmeta/resources/crossref/JATS-journalpublishing1-3d2-mathml3-elements.xsd +10130 -0
  31. commonmeta/resources/crossref/JATS-journalpublishing1-3d2-mathml3.xsd +48 -0
  32. commonmeta/resources/crossref/JATS-journalpublishing1-elements.xsd +8705 -0
  33. commonmeta/resources/crossref/JATS-journalpublishing1-mathml3-elements.xsd +8608 -0
  34. commonmeta/resources/crossref/JATS-journalpublishing1-mathml3.xsd +49 -0
  35. commonmeta/resources/crossref/JATS-journalpublishing1.xsd +6176 -0
  36. commonmeta/resources/crossref/clinicaltrials.xsd +61 -0
  37. commonmeta/resources/crossref/common5.3.1.xsd +1538 -0
  38. commonmeta/resources/crossref/crossref5.3.1.xsd +1949 -0
  39. commonmeta/resources/crossref/crossref_query_output3.0.xsd +1097 -0
  40. commonmeta/resources/crossref/fundref.xsd +49 -0
  41. commonmeta/resources/crossref/module-ali.xsd +39 -0
  42. commonmeta/resources/crossref/relations.xsd +444 -0
  43. commonmeta/resources/crossref-v0.2.json +60 -0
  44. commonmeta/resources/csl-data.json +538 -0
  45. commonmeta/resources/datacite-v4.5.json +829 -0
  46. commonmeta/resources/datacite-v4.5pr.json +608 -0
  47. commonmeta/resources/ietf-bcp-47.json +3025 -0
  48. commonmeta/resources/iso-8601.json +3182 -0
  49. commonmeta/resources/spdx/licenses.json +4851 -0
  50. commonmeta/resources/spdx-schema..json +903 -0
  51. commonmeta/resources/styles/apa.csl +1697 -0
  52. commonmeta/resources/styles/chicago-author-date.csl +684 -0
  53. commonmeta/resources/styles/harvard-cite-them-right.csl +321 -0
  54. commonmeta/resources/styles/ieee.csl +468 -0
  55. commonmeta/resources/styles/modern-language-association.csl +341 -0
  56. commonmeta/resources/styles/vancouver.csl +376 -0
  57. commonmeta/schema_utils.py +27 -0
  58. commonmeta/translators.py +47 -0
  59. commonmeta/utils.py +1108 -0
  60. commonmeta/writers/__init__.py +1 -0
  61. commonmeta/writers/bibtex_writer.py +149 -0
  62. commonmeta/writers/citation_writer.py +70 -0
  63. commonmeta/writers/commonmeta_writer.py +68 -0
  64. commonmeta/writers/crossref_xml_writer.py +17 -0
  65. commonmeta/writers/csl_writer.py +79 -0
  66. commonmeta/writers/datacite_writer.py +193 -0
  67. commonmeta/writers/inveniordm_writer.py +94 -0
  68. commonmeta/writers/ris_writer.py +58 -0
  69. commonmeta/writers/schema_org_writer.py +146 -0
  70. {commonmeta_py-0.23.dist-info → commonmeta_py-0.24.dist-info}/METADATA +56 -45
  71. commonmeta_py-0.24.dist-info/RECORD +75 -0
  72. {commonmeta_py-0.23.dist-info → commonmeta_py-0.24.dist-info}/WHEEL +1 -1
  73. commonmeta_py-0.24.dist-info/entry_points.txt +3 -0
  74. commonmeta_py-0.23.dist-info/RECORD +0 -5
  75. /commonmeta_py/__init__.py → /commonmeta/readers/bibtex_reader.py +0 -0
  76. {commonmeta_py-0.23.dist-info/licenses → commonmeta_py-0.24.dist-info}/LICENSE +0 -0
@@ -0,0 +1,193 @@
1
+ """Date utils for commonmeta-py"""
2
+ import datetime
3
+ from datetime import datetime as dt
4
+ from typing import Optional, Union
5
+ import dateparser
6
+ import pydash as py_
7
+
8
+ from .base_utils import compact
9
+
10
# Lower-case three-letter month abbreviations, January first.
MONTH_SHORT_NAMES = [
    "jan",
    "feb",
    "mar",
    "apr",
    "may",
    "jun",
    "jul",
    "aug",
    "sep",
    "oct",
    "nov",
    "dec",
]

# Zero-padded month number ("01" .. "12") mapped to its abbreviation.
MONTH_NAMES = {
    f"{number:02d}": name for number, name in enumerate(MONTH_SHORT_NAMES, start=1)
}

# strftime pattern for a full calendar date without a time component.
ISO8601_DATE_FORMAT = "%Y-%m-%d"
41
+
42
+
43
def get_iso8601_date(
    date: Optional[Union[datetime.datetime, datetime.date, str, int]],
) -> str:
    """Get ISO 8601 date without time.

    Args:
        date: A datetime/date object, a date string, or an integer Unix
            timestamp. ``None`` is accepted.

    Returns:
        ``YYYY-MM-DD`` for date objects, timestamps and most strings. A string
        of exactly 7 characters yields ``YYYY-MM`` and a string of exactly
        4 characters yields ``YYYY``. An empty string is returned for ``None``,
        for unsupported types, and for strings that dateparser cannot parse.
    """
    if date is None:
        return ""
    if isinstance(date, (datetime.datetime, datetime.date)):
        return date.strftime(ISO8601_DATE_FORMAT)
    if isinstance(date, str):
        parsed = dateparser.parse(date)
        # dateparser.parse returns None (it does not raise) for unparseable input
        if parsed is None:
            return ""
        # keep the precision of the input: year-month or year only
        if len(date) == 7:
            return parsed.strftime("%Y-%m")
        if len(date) == 4:
            return parsed.strftime("%Y")
        return parsed.strftime(ISO8601_DATE_FORMAT)
    if isinstance(date, int):
        # NOTE: fromtimestamp() interprets the timestamp in the local timezone
        return datetime.datetime.fromtimestamp(date).strftime(ISO8601_DATE_FORMAT)
    return ""
60
+
61
+
62
def get_date_parts(iso8601_time: Optional[str]) -> dict:
    """Split an ISO 8601 date string into a ``date-parts`` dict.

    Characters 0-3, 5-6 and 8-9 are read as year, month and day. Inputs
    shorter than 10 characters are right-padded with ``"0"`` first, and any
    component equal to zero is left out of the result.

    Args:
        iso8601_time: Date or datetime string, or ``None``.

    Returns:
        ``{"date-parts": [[year, month, day]]}`` with zero components removed;
        ``{"date-parts": [[]]}`` when the input is ``None``.

    Raises:
        ValueError: If one of the sliced components is not an integer.
    """
    if iso8601_time is None:
        return {"date-parts": [[]]}

    # incomplete dates ("2020", "2020-05") are padded so the slices below exist
    padded = iso8601_time if len(iso8601_time) >= 10 else iso8601_time.ljust(10, "0")

    components = (int(padded[0:4]), int(padded[5:7]), int(padded[8:10]))
    return {"date-parts": [[part for part in components if part != 0]]}
77
+
78
+
79
def get_date_from_unix_timestamp(timestamp: Optional[int]) -> Optional[str]:
    """Convert a Unix timestamp into an ISO 8601 datetime string.

    The timestamp is converted with ``datetime.fromtimestamp`` (local time,
    no timezone suffix) and microseconds are dropped.

    Args:
        timestamp: Seconds since the epoch, or ``None``.

    Returns:
        ``YYYY-MM-DDTHH:MM:SS``, or ``None`` when no timestamp is given.
    """
    if timestamp is None:
        return None
    moment = datetime.datetime.fromtimestamp(timestamp)
    return moment.replace(microsecond=0).isoformat()
84
+
85
+
86
def get_date_from_date_parts(date_as_parts: Optional[dict]) -> Optional[str]:
    """Get date from date parts.

    Args:
        date_as_parts: Dict of the form
            ``{"date-parts": [[year, month, day]]}``; month and day are
            optional. ``None`` is accepted.

    Returns:
        The date string built by ``get_date_from_parts``, or ``None`` when the
        input is ``None``, has no ``date-parts``, has an empty first entry
        (``{"date-parts": [[]]}``), or has ``None`` as the year.
    """
    if date_as_parts is None:
        return None
    date_parts = date_as_parts.get("date-parts") or []
    if len(date_parts) == 0:
        return None
    first = date_parts[0]
    # an empty inner list carries no year; indexing it would raise IndexError
    if not first or first[0] is None:
        return None
    year = first[0]
    month = first[1] if len(first) > 1 else 0
    day = first[2] if len(first) > 2 else 0
    return get_date_from_parts(year, month, day)
100
+
101
+
102
def get_date_from_crossref_parts(date_parts: Union[dict, list]) -> Optional[str]:
    """Get date from Crossref XML date parts.

    Args:
        date_parts: Dict with ``year`` and optional ``month`` / ``day`` keys,
            or a list of such dicts, in which case only the first is used.

    Returns:
        The date string built by ``get_date_from_parts``, or ``None`` when the
        input is empty or has no ``year``.
    """
    if isinstance(date_parts, list):
        # an empty list has no first entry to fall back on
        if not date_parts:
            return None
        date_parts = date_parts[0]
    if not date_parts:
        return None
    year = date_parts.get("year", None)
    if year is None:
        return None
    month = date_parts.get("month", 0)
    day = date_parts.get("day", 0)
    return get_date_from_parts(year, month, day)
112
+
113
+
114
def get_date_from_parts(year=0, month=0, day=0) -> Optional[str]:
    """Join year, month and day into a hyphen-separated date string.

    Each part is converted with ``str`` and left-padded with zeros (4 digits
    for the year, 2 for month and day). Parts that end up as ``"0000"`` or
    ``"00"`` are dropped before joining.

    Returns:
        For example ``"2020-05-17"``, ``"2020-05"`` or ``"2020"``; ``None``
        when every part was dropped.
    """
    padded = (
        str(year).rjust(4, "0"),
        str(month).rjust(2, "0"),
        str(day).rjust(2, "0"),
    )
    kept = [part for part in padded if part != "00" and part != "0000"]
    if not kept:
        return None
    return "-".join(kept)
119
+
120
+
121
def get_month_from_date(
    date: Optional[Union[str, int, datetime.datetime, datetime.date]],
) -> Optional[str]:
    """Get month from date.

    Args:
        date: A date string, an integer Unix timestamp, or a datetime/date
            object. ``None`` is accepted.

    Returns:
        The lower-case three-letter month abbreviation looked up in
        ``MONTH_NAMES``, or ``None`` when the input is ``None``, of an
        unsupported type, or a string that dateparser cannot parse.
    """
    if date is None:
        return None
    # if date type is not recognized
    if not isinstance(date, (str, int, datetime.datetime, datetime.date)):
        return None
    if isinstance(date, str):
        parsed = dateparser.parse(date)
        # dateparser.parse returns None (it does not raise) for unparseable input
        if parsed is None:
            return None
        date = parsed
    elif isinstance(date, int):
        # NOTE: fromtimestamp() interprets the timestamp in the local timezone
        date = datetime.datetime.fromtimestamp(date)
    return MONTH_NAMES.get(date.strftime("%m"), None)
138
+
139
+
140
def strip_milliseconds(iso8601_time: Optional[str]) -> Optional[str]:
    """Simplify an ISO 8601 datetime string by removing fractional seconds.

    Rules, applied in order:
      * ``None`` or an empty string gives ``None``.
      * A value containing ``T00:00:00`` is cut down to the part before ``T``.
      * A value containing ``.`` is cut at the first ``.`` and ``Z`` is appended.
      * A value containing ``+00:00`` is cut at the first ``+`` and ``Z`` is
        appended.
      * Anything else is returned unchanged.
    """
    if iso8601_time is None or len(iso8601_time) == 0:
        return None
    if "T00:00:00" in iso8601_time:
        date_only, _, _ = iso8601_time.partition("T")
        return date_only
    # (marker to look for, separator to cut at); the order matters
    for marker, separator in ((".", "."), ("+00:00", "+")):
        if marker in iso8601_time:
            return iso8601_time.partition(separator)[0] + "Z"
    return iso8601_time
151
+
152
+
153
def get_datetime_from_time(time: str) -> Optional[str]:
    """Convert a compact ``YYYYMMDDHHMMSS`` timestamp, as used by Crossref.

    Returns:
        The timestamp as ``YYYY-MM-DDTHH:MM:SSZ``, or ``None`` when the input
        does not match the compact format.
    """
    try:
        parsed = dt.strptime(time, "%Y%m%d%H%M%S")
        return parsed.strftime("%Y-%m-%dT%H:%M:%SZ")
    except ValueError:
        return None
159
+
160
+
161
def get_datetime_from_pdf_time(time: str) -> Optional[str]:
    """iso8601 datetime in slightly different format, used in PDF metadata.

    The ``D:`` prefix and any apostrophes are removed before parsing, so
    ``D:20180427082257+02'00'`` is parsed as ``20180427082257+0200``.

    Args:
        time: PDF date string; it is converted with ``str`` first.

    Returns:
        The timestamp as ``YYYY-MM-DDTHH:MM:SSZ``, or ``None`` when the value
        cannot be parsed. Parse failures are reported through ``logging``
        rather than printed to stdout.
    """
    import logging

    cleaned = str(time).replace("D:", "").replace("'", "")
    try:
        # NOTE(review): the UTC offset is parsed but not applied; the wall-clock
        # time is emitted with a literal "Z". Confirm whether callers expect UTC.
        return dt.strptime(cleaned, "%Y%m%d%H%M%S%z").strftime("%Y-%m-%dT%H:%M:%SZ")
    except ValueError as error:
        logging.getLogger(__name__).warning(
            "Could not parse PDF date %r: %s", time, error
        )
        return None
169
+
170
+
171
def normalize_date_dict(data: dict) -> dict:
    """Map capitalised date keys onto the commonmeta date vocabulary.

    Supported date types in commonmeta:
    - created
    - submitted
    - accepted
    - published (read from the ``Issued`` key)
    - available
    - updated
    - withdrawn

    The assembled dict is passed through ``compact`` from ``base_utils``
    before being returned (presumably dropping entries without a value;
    confirm against ``compact``).
    """
    # (commonmeta key, key looked up in ``data``), in output order
    key_pairs = (
        ("created", "Created"),
        ("submitted", "Submitted"),
        ("accepted", "Accepted"),
        ("published", "Issued"),
        ("available", "Available"),
        ("updated", "Updated"),
        ("withdrawn", "Withdrawn"),
    )
    return compact({target: data.get(source, None) for target, source in key_pairs})
@@ -0,0 +1,273 @@
1
+ """Doi utils for commonmeta-py"""
2
+
3
+ import re
4
+ from typing import Optional
5
+ import httpx
6
+ from furl import furl
7
+
8
+ from .base_utils import compact
9
+
10
+
11
def validate_doi(doi: Optional[str]) -> Optional[str]:
    """Extract the bare DOI from a DOI string or DOI resolver URL.

    Accepted forms are a plain DOI, a ``doi:`` prefixed DOI, and http(s) URLs
    on ``doi.org``, ``dx.doi.org`` or the DataCite stage/test handle servers.
    The prefix must be ``10.`` followed by 4 or 5 digits.

    Returns:
        The DOI (prefix, slash and suffix) exactly as written in the input,
        or ``None`` when the input is ``None`` or does not match.
    """
    if doi is None:
        return None
    pattern = r"\A(?:(http|https):/(/)?(dx\.)?(doi\.org|handle\.stage\.datacite\.org|handle\.test\.datacite\.org)/)?(doi:)?(10\.\d{4,5}/.+)\Z"  # noqa: E501
    found = re.search(pattern, doi)
    return found.group(6) if found is not None else None
22
+
23
+
24
def validate_prefix(doi: Optional[str]) -> Optional[str]:
    """Extract the DOI prefix from a DOI string or DOI resolver URL.

    Accepts the same input forms as a full DOI (plain, ``doi:`` prefixed, or
    an http(s) URL on a supported resolver); a suffix is not required.

    Returns:
        The prefix, i.e. ``10.`` followed by 4 or 5 digits, or ``None`` when
        the input is ``None`` or does not match.
    """
    if doi is None:
        return None
    pattern = r"\A(?:(http|https):/(/)?(dx\.)?(doi\.org|handle\.stage\.datacite\.org|handle\.test\.datacite\.org)/)?(doi:)?(10\.\d{4,5}).*\Z"  # noqa: E501
    found = re.search(pattern, doi)
    return found.group(6) if found is not None else None
35
+
36
+
37
def validate_suffix(doi: Optional[str]) -> Optional[str]:
    """Extract the DOI suffix from a DOI string or DOI resolver URL.

    Accepts a plain DOI, a ``doi:`` prefixed DOI, or an http(s) URL on a
    supported resolver.

    Returns:
        Everything after the slash that follows the prefix, or ``None`` when
        the input is ``None`` or does not match.
    """
    if doi is None:
        return None
    pattern = r"\A(?:(http|https):/(/)?(dx\.)?(doi\.org|handle\.stage\.datacite\.org|handle\.test\.datacite\.org)/)?(doi:)?(10\.\d{4,5})/(.+)\Z"  # noqa: E501
    found = re.search(pattern, doi)
    return found.group(7) if found is not None else None
48
+
49
+
50
def doi_from_url(url: Optional[str]) -> Optional[str]:
    """Return a DOI from a URL.

    Args:
        url: A URL, or a bare string containing a DOI. ``None`` is accepted.

    Returns:
        The lower-cased DOI found at the end of the URL path, or ``None`` when
        the input is ``None``, has a host but a scheme other than http, https
        or ftp, or contains no DOI. For a ``doi.org`` URL whose first path
        segment does not start with ``10.`` the result of ``short_doi_as_doi``
        is returned instead.
    """
    if url is None:
        return None

    f = furl(url)
    # check for allowed scheme if string is a URL
    if f.host is not None and f.scheme not in ["http", "https", "ftp"]:
        return None

    # url is for a short DOI; a URL without path segments cannot be one,
    # and indexing its empty segment list would raise IndexError
    if (
        f.host == "doi.org"
        and f.path.segments
        and not f.path.segments[0].startswith("10.")
    ):
        return short_doi_as_doi(url)

    # special rules for specific hosts
    if f.host == "onlinelibrary.wiley.com":
        if f.path.segments and f.path.segments[-1] in ["epdf"]:
            f.path.segments.pop()
    elif f.host == "www.plosone.org":
        if (
            f.path.segments
            and f.path.segments[-1] in ["fetchobject.action"]
            and f.args.get("uri", None) is not None
        ):
            # the DOI is carried in the ``uri`` query argument
            f.path = f.args.get("uri")
    path = str(f.path)
    match = re.search(
        r"(10\.\d{4,5}/.+)\Z",
        path,
    )
    if match is None:
        return None
    return match.group(0).lower()
82
+
83
+
84
def short_doi_as_doi(doi: Optional[str]) -> Optional[str]:
    """Resolve a short DOI via an HTTP HEAD request.

    The input is turned into a URL with ``doi_as_url`` and requested with a
    10 second timeout.

    Returns:
        The ``Location`` header when the response status is 301, otherwise the
        URL that was requested; ``None`` when the input is ``None``.
    """
    if doi is None:
        return None
    resolver_url = doi_as_url(doi)
    response = httpx.head(resolver_url, timeout=10)
    if response.status_code == 301:
        return response.headers.get("Location")
    return resolver_url
92
+
93
+
94
def doi_as_url(doi: Optional[str]) -> Optional[str]:
    """Turn a DOI into a lower-cased ``https://doi.org/`` URL.

    Input whose host is already ``doi.org`` is only lower-cased; anything
    else gets ``https://doi.org/`` put in front. ``None`` gives ``None``.
    """
    if doi is None:
        return None
    already_doi_url = furl(doi).host == "doi.org"
    lowered = doi.lower()
    return lowered if already_doi_url else "https://doi.org/" + lowered
101
+
102
+
103
def normalize_doi(doi: Optional[str], **kwargs) -> Optional[str]:
    """Return the DOI as a lower-cased resolver URL.

    The input is checked with ``validate_doi``; the resolver base URL comes
    from ``doi_resolver``, which receives the keyword arguments (for example
    ``sandbox``).

    Returns:
        Resolver URL followed by the lower-cased DOI, or ``None`` when the
        input is not a valid DOI.
    """
    doi_str = validate_doi(doi)
    if doi_str:
        return doi_resolver(doi, **kwargs) + doi_str.lower()
    return None
109
+
110
+
111
def doi_resolver(doi, **kwargs):
    """Pick the resolver base URL for a DOI.

    Args:
        doi: DOI or DOI URL; ``None`` is accepted.
        **kwargs: ``sandbox`` (default ``False``) forces the stage resolver.

    Returns:
        ``https://handle.stage.datacite.org/`` when the input is an http(s)
        URL on ``handle.stage.datacite.org`` (case-insensitive) or ``sandbox``
        is truthy, otherwise ``https://doi.org/``; ``None`` for ``None``.
    """
    if doi is None:
        return None
    on_stage_host = (
        re.match(
            r"\A(http|https):/(/)?handle\.stage\.datacite\.org", doi, re.IGNORECASE
        )
        is not None
    )
    use_stage = on_stage_host or kwargs.get("sandbox", False)
    return "https://handle.stage.datacite.org/" if use_stage else "https://doi.org/"
121
+
122
+
123
def get_doi_ra(doi) -> Optional[str]:
    """Look up the registration agency of a DOI.

    The prefix from ``validate_prefix`` is sent to ``https://doi.org/ra/``
    with a 10 second timeout.

    Returns:
        The ``RA`` value of the first record in the JSON response, or ``None``
        when the DOI has no valid prefix or the status code is not 200.
    """
    prefix = validate_prefix(doi)
    if prefix is None:
        return None
    response = httpx.get("https://doi.org/ra/" + prefix, timeout=10)
    if response.status_code == 200:
        first_record = response.json()[0]
        return first_record.get("RA", None)
    return None
132
+
133
+
134
def get_crossref_member(member_id) -> Optional[dict]:
    """Return the Crossref member for a given member_id.

    Args:
        member_id: Crossref member id; strings and integers are both accepted
            because the id is interpolated into the URL.

    Returns:
        ``{"id": <member API URL>, "name": <primary-name>}``, or ``None`` when
        the status code is not 200 or the response carries no ``message``
        object.
    """
    member_url = f"https://api.crossref.org/members/{member_id}"
    response = httpx.get(member_url, timeout=10)
    if response.status_code != 200:
        return None
    data = response.json().get("message", None)
    # without a message object there is nothing to read the name from
    if not isinstance(data, dict):
        return None
    return {"id": member_url, "name": data.get("primary-name", None)}
142
+
143
+
144
def crossref_api_url(doi: str) -> str:
    """Build the Crossref REST API works URL for a DOI.

    The DOI is appended unchanged to ``https://api.crossref.org/works/``.
    """
    return "".join(("https://api.crossref.org/works/", doi))
147
+
148
+
149
def crossref_xml_api_url(doi: str) -> str:
    """Build the Crossref API URL that returns UNIXSD XML for a DOI.

    The DOI is inserted unchanged into the works ``transform`` endpoint for
    the ``application/vnd.crossref.unixsd+xml`` content type.
    """
    template = "https://api.crossref.org/works/{}/transform/application/vnd.crossref.unixsd+xml"
    return template.format(doi)
152
+
153
+
154
def crossref_api_query_url(query: dict) -> str:
    """Build a Crossref works search URL from a dict of options.

    Recognised keys:
      * ``rows``: number of results, default 20, capped at 1000.
      * ``query``: free-text query, used as-is.
      * ``query.bibliographic``, ``query.author``, ``query.title``,
        ``query.container-title``: added to the query as ``key:value``.
      * ``prefix``, ``member``, ``type``, ``has-full-text``,
        ``has-references``, ``has-orcid``, ``has-funder``, ``has-license``:
        added to the filter as ``key:value``.

    Query parts and filter parts are each joined with commas. The arguments
    go through ``compact`` from ``base_utils`` before being set on the URL
    (presumably dropping the ones that are ``None``; confirm against
    ``compact``).
    """
    field_query_keys = (
        "query.bibliographic",
        "query.author",
        "query.title",
        "query.container-title",
    )
    filter_keys = (
        "prefix",
        "member",
        "type",
        "has-full-text",
        "has-references",
        "has-orcid",
        "has-funder",
        "has-license",
    )

    f = furl("https://api.crossref.org/works")
    rows = min(int(query.get("rows", 20)), 1000)

    # the free-text query comes first, field queries follow in dict order
    queries = [] if query.get("query", None) is None else [query.get("query")]
    queries.extend(
        f"{key}:{value}" for key, value in query.items() if key in field_query_keys
    )
    _query = ",".join(queries) if queries else None

    filters = [f"{key}:{value}" for key, value in query.items() if key in filter_keys]
    _filter = ",".join(filters) if filters else None

    f.args = compact({"rows": rows, "query": _query, "filter": _filter})
    return f.url
189
+
190
+
191
def crossref_api_sample_url(number: int = 1, **kwargs) -> str:
    """Return the Crossref API URL for a sample of works.

    Args:
        number: Sample size, placed in the ``sample`` query parameter.
        **kwargs:
            prefix: DOI prefix to filter by; ignored unless
                ``validate_prefix`` accepts it.
            _type: Crossref work type to filter by; ignored unless it is one
                of the known types listed below.

    Returns:
        The works URL. When both filters apply they are combined into one
        comma-separated ``filter`` parameter, the same way
        ``crossref_api_query_url`` joins its filters, instead of being emitted
        as two separate ``filter`` parameters.
    """
    # known Crossref work types (duplicates from the earlier list removed)
    types = (
        "book-section",
        "monograph",
        "report-component",
        "report",
        "peer-review",
        "book-track",
        "journal-article",
        "book-part",
        "other",
        "book",
        "journal-volume",
        "book-set",
        "reference-entry",
        "proceedings-article",
        "journal",
        "component",
        "book-chapter",
        "proceedings-series",
        "report-series",
        "proceedings",
        "database",
        "standard",
        "reference-book",
        "posted-content",
        "journal-issue",
        "dissertation",
        "grant",
        "dataset",
        "book-series",
        "edited-book",
        "journal-section",
        "monograph-series",
        "journal-meta",
        "book-series-meta",
        "component-list",
        "journal-issue-meta",
        "book-part-meta",
        "book-meta",
        "proceedings-meta",
    )
    url = f"https://api.crossref.org/works?sample={number}"

    filters = []
    prefix = kwargs.get("prefix", None)
    if prefix and validate_prefix(prefix):
        filters.append(f"prefix:{prefix}")
    _type = kwargs.get("_type", None)
    if _type and _type in types:
        filters.append(f"type:{_type}")

    if filters:
        url += "&filter=" + ",".join(filters)
    return url
243
+
244
+
245
def datacite_api_url(doi: str, **kwargs) -> str:
    """Build the DataCite REST API URL for a DOI.

    The stage API (``api.stage.datacite.org``) is used when the input is an
    http(s) URL on ``handle.stage.datacite.org`` (case-insensitive) or when
    ``sandbox`` is truthy; otherwise the production API is used. The DOI in
    the URL comes from ``doi_from_url`` and ``include=media,client`` is always
    requested.
    """
    on_stage_host = (
        re.match(
            r"\A(http|https):/(/)?handle\.stage\.datacite\.org", doi, re.IGNORECASE
        )
        is not None
    )
    if not (on_stage_host or kwargs.get("sandbox", False)):
        return f"https://api.datacite.org/dois/{doi_from_url(doi)}?include=media,client"
    return f"https://api.stage.datacite.org/dois/{doi_from_url(doi)}?include=media,client"
253
+
254
+
255
def datacite_api_sample_url(number: int = 1, **kwargs) -> str:
    """Build the DataCite API URL for a random sample of DOIs.

    Args:
        number: Sample size, placed in the ``page[size]`` parameter.
        **kwargs: ``sandbox`` (default ``False``) selects the stage API.
    """
    if not kwargs.get("sandbox", False):
        return f"https://api.datacite.org/dois?random=true&page[size]={number}"
    return f"https://api.stage.datacite.org/dois?random=true&page[size]={number}"
260
+
261
+
262
def is_rogue_scholar_doi(doi: str) -> bool:
    """Tell whether a DOI uses one of the Rogue Scholar prefixes.

    The prefix is extracted with ``validate_prefix`` and compared against the
    fixed list below.
    """
    rogue_scholar_prefixes = [
        "10.34732",
        "10.53731",
        "10.54900",
        "10.57689",
        "10.59348",
        "10.59349",
        "10.59350",
    ]
    return validate_prefix(doi) in rogue_scholar_prefixes