commonmeta-py 0.100-py3-none-any.whl → 0.103-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. commonmeta/__init__.py +51 -50
  2. commonmeta/author_utils.py +7 -1
  3. commonmeta/base_utils.py +1 -0
  4. commonmeta/constants.py +35 -1
  5. commonmeta/crossref_utils.py +11 -8
  6. commonmeta/date_utils.py +1 -0
  7. commonmeta/doi_utils.py +42 -14
  8. commonmeta/metadata.py +209 -100
  9. commonmeta/readers/cff_reader.py +1 -0
  10. commonmeta/readers/codemeta_reader.py +1 -0
  11. commonmeta/readers/commonmeta_reader.py +1 -0
  12. commonmeta/readers/crossref_reader.py +19 -18
  13. commonmeta/readers/csl_reader.py +4 -1
  14. commonmeta/readers/inveniordm_reader.py +14 -9
  15. commonmeta/readers/json_feed_reader.py +9 -3
  16. commonmeta/readers/kbase_reader.py +1 -0
  17. commonmeta/readers/openalex_reader.py +380 -0
  18. commonmeta/readers/ris_reader.py +1 -0
  19. commonmeta/readers/schema_org_reader.py +2 -3
  20. commonmeta/schema_utils.py +1 -0
  21. commonmeta/utils.py +126 -63
  22. commonmeta/writers/bibtex_writer.py +1 -0
  23. commonmeta/writers/citation_writer.py +1 -0
  24. commonmeta/writers/crossref_xml_writer.py +1 -0
  25. commonmeta/writers/csl_writer.py +1 -0
  26. commonmeta/writers/datacite_writer.py +1 -0
  27. commonmeta/writers/ris_writer.py +1 -0
  28. commonmeta/writers/schema_org_writer.py +1 -0
  29. {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/METADATA +5 -8
  30. {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/RECORD +33 -32
  31. {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/licenses/LICENSE +1 -1
  32. {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/WHEEL +0 -0
  33. {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/entry_points.txt +0 -0
commonmeta/readers/openalex_reader.py ADDED
@@ -0,0 +1,380 @@
+ """OpenAlex reader for commonmeta-py"""
+
+ from typing import Optional
+
+ import httpx
+ from pydash import py_
+
+ from ..author_utils import get_authors
+ from ..base_utils import compact, presence, sanitize, wrap
+ from ..constants import (
+     CR_TO_CM_TRANSLATIONS,
+     OA_TO_CM_CONTAINER_TRANLATIONS,
+     OA_TO_CM_TRANSLATIONS,
+     Commonmeta,
+ )
+ from ..doi_utils import (
+     normalize_doi,
+     openalex_api_sample_url,
+     openalex_api_url,
+ )
+ from ..utils import (
+     dict_to_spdx,
+     normalize_url,
+     validate_openalex,
+ )
+
+ # Map OpenAlex license strings to SPDX licenseId. May not be the correct license version.
+ OA_LICENSES = {"cc-by": "CC-BY-4.0", "cc0": "CC0-1.0"}
+ OA_IDENTIFIER_TYPES = {
+     "openalex": "OpenAlex",
+     "doi": "DOI",
+     "mag": "MAG",
+     "pmid": "PMID",
+     "pmcid": "PMCID",
+ }
+
+
+ def get_openalex(pid: str, **kwargs) -> dict:
+     """get_openalex"""
+     doi = normalize_doi(pid)
+     if doi is None:
+         return {"state": "not_found"}
+     url = openalex_api_url(doi)
+     response = httpx.get(url, timeout=10, **kwargs)
+     if response.status_code != 200:
+         return {"state": "not_found"}
+     return response.json() | {"via": "openalex"}
+
+
+ def read_openalex(data: Optional[dict], **kwargs) -> Commonmeta:
+     """read_openalex"""
+     if data is None:
+         return {"state": "not_found"}
+     meta = data
+     read_options = kwargs or {}
+
+     doi = meta.get("doi", None)
+     _id = normalize_doi(doi)
+     _type = CR_TO_CM_TRANSLATIONS.get(meta.get("type_crossref", None)) or "Other"
+     additional_type = OA_TO_CM_TRANSLATIONS.get(meta.get("type", None))
+     if additional_type == _type:
+         additional_type = None
+
+     archive_locations = []
+     contributors = get_contributors(wrap(meta.get("authorships")))
+     contributors = get_authors(contributors)
+
+     url = normalize_url(
+         py_.get(meta, "primary_location.landing_page_url") or py_.get(meta, "id")
+     )
+     title = meta.get("title", None)
+     if title is not None:
+         titles = [{"title": sanitize(title)}]
+     else:
+         titles = None
+     publisher = compact(
+         {"name": py_.get(meta, "primary_location.source.host_organization_name")}
+     )
+     date = compact(
+         {
+             "published": py_.get(meta, "publication_date")
+             or py_.get(meta, "created_date")
+         }
+     )
+     identifiers = [
+         {
+             "identifier": uid,
+             "identifierType": OA_IDENTIFIER_TYPES[uidType],
+         }
+         for uidType, uid in (meta.get("ids", {})).items()
+     ]
+
+     license_ = py_.get(meta, "best_oa_location.license")
+     if license_ is not None:
+         license_ = OA_LICENSES.get(license_, license_)
+         license_ = dict_to_spdx({"id": license_})
+     container = get_container(meta)
+     relations = []
+     references = [
+         get_related(i) for i in get_references(meta.get("referenced_works", []))
+     ]
+     funding_references = from_openalex_funding(wrap(meta.get("grants", None)))
+
+     description = get_abstract(meta)
+     if description is not None:
+         descriptions = [{"description": sanitize(description), "type": "Abstract"}]
+     else:
+         descriptions = None
+
+     subjects = py_.uniq(
+         [
+             {"subject": py_.get(i, "subfield.display_name")}
+             for i in wrap(meta.get("topics", None))
+         ]
+     )
+     files = get_files(meta)
+
+     return {
+         # required properties
+         "id": _id,
+         "type": _type,
+         # recommended and optional properties
+         "additionalType": additional_type,
+         "archiveLocations": presence(archive_locations),
+         "container": presence(container),
+         "contributors": presence(contributors),
+         "date": presence(date),
+         "descriptions": presence(descriptions),
+         "files": presence(files),
+         "fundingReferences": presence(funding_references),
+         "geoLocations": None,
+         "identifiers": identifiers,
+         "language": meta.get("language", None),
+         "license": license_,
+         "provider": "OpenAlex",
+         "publisher": presence(publisher),
+         "references": presence(references),
+         "relations": presence(relations),
+         "subjects": presence(subjects),
+         "titles": presence(titles),
+         "url": url,
+         "version": meta.get("version", None),
+     } | read_options
+
+
+ def get_abstract(meta):
+     """Parse abstract from OpenAlex abstract_inverted_index"""
+     abstract_inverted_index = py_.get(meta, "abstract_inverted_index")
+
+     if abstract_inverted_index:
+         # Determine the length of the abstract
+         max_pos = max(
+             p for positions in abstract_inverted_index.values() for p in positions
+         )
+         abstract_words = [""] * (max_pos + 1)
+
+         for word, positions in abstract_inverted_index.items():
+             for p in positions:
+                 abstract_words[p] = word
+
+         abstract = " ".join(abstract_words)
+     else:
+         abstract = None
+     return abstract
+
+
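Note: OpenAlex does not return abstracts as plain text, only as an inverted index mapping each word to its positions; get_abstract above flattens that back into a string. A minimal illustration with made-up data:

    # illustrative abstract_inverted_index, not taken from the diff
    index = {"Open": [0], "science": [1, 3], "is": [2]}
    max_pos = max(p for positions in index.values() for p in positions)  # 3
    words = [""] * (max_pos + 1)
    for word, positions in index.items():
        for p in positions:
            words[p] = word
    " ".join(words)  # "Open science is science"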
+ def get_contributors(contributors: list) -> list:
+     """Parse contributor"""
+
+     def parse_contributor(c):
+         affiliations = []
+         for affiliation in c.get("institutions", []):
+             affiliations.append(
+                 compact(
+                     {
+                         "id": affiliation.get("ror", None),
+                         "name": affiliation.get("display_name", None),
+                     }
+                 )
+             )
+
+         return compact(
+             {
+                 "id": py_.get(c, "author.orcid"),
+                 "name": py_.get(c, "author.display_name"),
+                 "affiliations": affiliations,
+             }
+         )
+
+     return [parse_contributor(i) for i in contributors]
+
+
+ def get_references(pids: list, **kwargs) -> list:
+     """Get related articles from OpenAlex using their pid.
+     Used to retrieve metadata for citations and references, which are not included in the OpenAlex record.
+     """
+     references = get_openalex_works(pids)
+     return references
+
+
+ def get_citations(citation_url: str, **kwargs) -> list:
+     response = httpx.get(citation_url, timeout=10, **kwargs)
+     if response.status_code != 200:
+         return {"state": "not_found"}
+     response = response.json()
+     return response.get("results", [])
+
+
+ def get_related(related: Optional[dict]) -> Optional[dict]:
+     """Get reference from OpenAlex reference"""
+     if related is None or not isinstance(related, dict):
+         return None
+     doi = related.get("doi", None)
+     metadata = {
+         "id": normalize_doi(doi) if doi else None,
+         "contributor": related.get("author", None),
+         "title": related.get("display_name", None),
+         "publisher": py_.get(
+             related, "primary_location.source.host_organization_name"
+         ),
+         "publicationYear": related.get("publication_year", None),
+         "volume": py_.get(related, "biblio.volume"),
+         "issue": py_.get(related, "biblio.issue"),
+         "firstPage": py_.get(related, "biblio.first_page"),
+         "lastPage": py_.get(related, "biblio.last_page"),
+         "containerTitle": py_.get(related, "primary_location.source.display_name"),
+     }
+     return compact(metadata)
+
+
+ def get_openalex_works(pids: list, **kwargs) -> list:
+     """Get OpenAlex works, use batches of 49 to honor the API limit."""
+     pid_batches = [pids[i : i + 49] for i in range(0, len(pids), 49)]
+     works = []
+     for pid_batch in pid_batches:
+         ids = "|".join(pid_batch)
+         url = f"https://api.openalex.org/works?filter=ids.openalex:{ids}"
+         response = httpx.get(url, timeout=10, **kwargs)
+         if response.status_code != 200:
+             return {"state": "not_found"}
+         response = response.json()
+         if py_.get(response, "count") == 0:
+             return {"state": "not_found"}
+
+         works.extend(response.get("results"))
+
+     return works
+
+
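Note: the works filter above joins up to 49 OpenAlex IDs with "|" (logical OR), so each batch is a single request. A sketch with illustrative IDs:

    ids = "|".join(["W2741809807", "W2092792970"])
    url = f"https://api.openalex.org/works?filter=ids.openalex:{ids}"
    # https://api.openalex.org/works?filter=ids.openalex:W2741809807|W2092792970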
+ def get_openalex_funders(pids: list, **kwargs) -> list:
+     """Get ROR id and name from OpenAlex funders.
+     Use batches of 49 to honor the API limit."""
+     pid_batches = [pids[i : i + 49] for i in range(0, len(pids), 49)]
+     funders = []
+     for pid_batch in pid_batches:
+         ids = "|".join(pid_batch)
+         url = f"https://api.openalex.org/funders?filter=ids.openalex:{ids}"
+         response = httpx.get(url, timeout=10, **kwargs)
+         if response.status_code != 200:
+             return {"state": "not_found"}
+         response = response.json()
+         if py_.get(response, "count") == 0:
+             return {"state": "not_found"}
+
+         def format_funder(funder):
+             return compact(
+                 {
+                     "id": py_.get(funder, "id"),
+                     "ror": py_.get(funder, "ids.ror"),
+                     "name": py_.get(funder, "display_name"),
+                 }
+             )
+
+         f = [format_funder(i) for i in response.get("results")]
+         funders.extend(f)
+
+     return funders
+
+
+ def get_openalex_source(str: Optional[str], **kwargs) -> Optional[dict]:
+     """Get issn, name, homepage_url and type from OpenAlex source."""
+     id = validate_openalex(str)
+     if not id:
+         return None
+
+     url = f"https://api.openalex.org/sources/{id}"
+     response = httpx.get(url, timeout=10, **kwargs)
+     if response.status_code != 200:
+         return {"state": "not_found"}
+     response = response.json()
+     if py_.get(response, "count") == 0:
+         return {"state": "not_found"}
+
+     return compact(
+         {
+             "id": py_.get(response, "id"),
+             "url": py_.get(response, "homepage_url"),
+             "issn": py_.get(response, "issn_l"),
+             "title": py_.get(response, "display_name"),
+             "type": py_.get(response, "type"),
+         }
+     )
+
+
+ def get_files(meta) -> Optional[list]:
+     """get file links"""
+     pdf_url = py_.get(meta, "best_oa_location.pdf_url")
+     if pdf_url is None:
+         return None
+     return [
+         {"mimeType": "application/pdf", "url": pdf_url},
+     ]
+
+
+ def get_container(meta: dict) -> dict:
+     """Get container from OpenAlex"""
+     source = get_openalex_source(py_.get(meta, "primary_location.source.id"))
+     container_type = py_.get(source, "type")
+     if container_type:
+         container_type = OA_TO_CM_CONTAINER_TRANLATIONS.get(
+             container_type, container_type
+         )
+     issn = py_.get(source, "issn")
+     container_title = py_.get(source, "title")
+     url_ = py_.get(source, "url")
+
+     return compact(
+         {
+             "type": container_type,
+             "identifier": issn or url_,
+             "identifierType": "ISSN" if issn else "URL" if url_ else None,
+             "title": container_title,
+             "volume": py_.get(meta, "biblio.volume"),
+             "issue": py_.get(meta, "biblio.issue"),
+             "firstPage": py_.get(meta, "biblio.first_page"),
+             "lastPage": py_.get(meta, "biblio.last_page"),
+         }
+     )
+
+
+ def from_openalex_funding(funding_references: list) -> list:
+     """Get funding references from OpenAlex"""
+     funder_ids = [
+         validate_openalex(funding.get("funder"))
+         for funding in funding_references
+         if "funder" in funding
+     ]
+     funders = get_openalex_funders(funder_ids)
+     formatted_funding_references = []
+     for funding in funding_references:
+         funder = next(
+             (item for item in funders if item["id"] == funding.get("funder", None)),
+             {},
+         )
+         f = compact(
+             {
+                 "funderName": funder.get("name", None),
+                 "funderIdentifier": funder.get("ror", None),
+                 "funderIdentifierType": "ROR" if funder.get("ror", None) else None,
+                 "awardNumber": funding.get("award_id", None),
+             }
+         )
+         formatted_funding_references.append(f)
+     return py_.uniq(formatted_funding_references)
+
+
+ def get_random_id_from_openalex(number: int = 1, **kwargs) -> list:
+     """Get random ID from OpenAlex"""
+     number = min(number, 20)
+     url = openalex_api_sample_url(number, **kwargs)
+     try:
+         response = httpx.get(url, timeout=10)
+         if response.status_code != 200:
+             return []
+
+         items = py_.get(response.json(), "results")
+         return [i.get("id") for i in items]
+     except (httpx.ReadTimeout, httpx.ConnectError):
+         return []
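A minimal usage sketch for the new reader, assuming network access (the DOI is illustrative):

    data = get_openalex("https://doi.org/10.7554/elife.01567")
    meta = read_openalex(data)
    meta["id"]        # "https://doi.org/10.7554/elife.01567"
    meta["provider"]  # "OpenAlex"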
commonmeta/readers/ris_reader.py CHANGED
@@ -1,4 +1,5 @@
  """RIS reader for commonmeta-py"""
+
  from typing import Optional
 
  from ..utils import compact, normalize_url, wrap
commonmeta/readers/schema_org_reader.py CHANGED
@@ -410,9 +410,8 @@ def get_html_meta(soup):
      pid = pid.get("content", None) or pid.get("href", None)
      data["@id"] = normalize_id(pid)
 
-     _type = (
-         soup.select_one("meta[name='dc.type']")
-         or soup.select_one("meta[name='DC.type']")
+     _type = soup.select_one("meta[name='dc.type']") or soup.select_one(
+         "meta[name='DC.type']"
      )
      data["@type"] = _type["content"].capitalize() if _type else None
      if _type is None:
commonmeta/schema_utils.py CHANGED
@@ -1,4 +1,5 @@
  """Schema utils for commonmeta-py"""
+
  from os import path
  import orjson as json
  from jsonschema import Draft202012Validator, ValidationError
commonmeta/utils.py CHANGED
@@ -1,22 +1,22 @@
  """Utils module for commonmeta-py"""
 
  import os
- import orjson as json
  import re
  import time
  from typing import Optional
  from urllib.parse import urlparse
- import yaml
- from furl import furl
+
  import bibtexparser
+ import orjson as json
+ import pycountry
+ import yaml
  from bs4 import BeautifulSoup
+ from furl import furl
  from pydash import py_
- import pycountry
 
- from .base_utils import wrap, compact, parse_attributes
- from .doi_utils import normalize_doi, doi_from_url, get_doi_ra, validate_doi, doi_as_url
+ from .base_utils import compact, parse_attributes, wrap
  from .constants import DATACITE_CONTRIBUTOR_TYPES
-
+ from .doi_utils import doi_as_url, doi_from_url, get_doi_ra, normalize_doi, validate_doi
 
  NORMALIZED_LICENSES = {
      "https://creativecommons.org/licenses/by/1.0": "https://creativecommons.org/licenses/by/1.0/legalcode",
@@ -144,17 +144,13 @@ def normalize_id(pid: Optional[str], **kwargs) -> Optional[str]:
          return doi
 
      # check for valid HTTP uri and ensure https
-     uri = urlparse(pid)
-     if not uri.netloc or uri.scheme not in ["http", "https"]:
+     f = furl(pid)
+     if not f.host or f.scheme not in ["http", "https"]:
          return None
-     if uri.scheme == "http":
-         pid = pid.replace(HTTP_SCHEME, HTTPS_SCHEME)
+     if f.scheme == "http":
+         f.scheme = "https"
 
-     # remove trailing slash
-     if pid.endswith("/"):
-         pid = pid.strip("/")
-
-     return pid
+     return f.url
 
 
  def normalize_ids(ids: list, relation_type=None) -> list:
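With furl, normalize_id now keeps trailing slashes that the urlparse-based version stripped. Expected behavior (values illustrative):

    normalize_id("http://example.org/record/123/")  # "https://example.org/record/123/"
    normalize_id("example.org/record/123")          # None (no http/https scheme)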
@@ -190,8 +186,6 @@ def normalize_url(
      if url is None or not isinstance(url, str):
          return None
      url = url.strip()
-     if url.endswith("/"):
-         url = url.strip("/")
      scheme = urlparse(url).scheme
      if not scheme or scheme not in ["http", "https"]:
          return None
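Trailing-slash stripping is likewise removed from normalize_url, so URLs pass through unchanged; the stripping moves into normalize_cc_url in the next hunk, where it matters for the license lookup. Illustrative:

    normalize_url("https://example.org/path/")  # "https://example.org/path/"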
@@ -202,55 +196,13 @@
      return url
 
 
- # def normalize_url(url: Optional[str], secure=False, fragments=False, lower=False) -> Optional[str]:
- #     """Normalize URL"""
- #     if url is None or not isinstance(url, str):
- #         return None
- #     try:
- #         f = furl(url.strip())
- #         f.path.normalize()
-
- #         # only allow http and https schemes
- #         if f.scheme not in ["http", "https"]:
- #             return None
- #         if secure and f.scheme == "http":
- #             f.set(scheme="https")
-
- #         # remove index.html
- #         if f.path.segments and f.path.segments[-1] in ["index.html"]:
- #             f.path.segments.pop(-1)
-
- #         # remove fragments
- #         if fragments:
- #             f.remove(fragment=True)
-
- #         # remove specific query parameters
- #         f.remove(
- #             [
- #                 "origin",
- #                 "ref",
- #                 "referrer",
- #                 "source",
- #                 "utm_content",
- #                 "utm_medium",
- #                 "utm_campaign",
- #                 "utm_source",
- #             ]
- #         )
-
- #         if lower:
- #             return f.url.lower().strip("/")
- #         return f.url.strip("/")
- #     except ValueError:
- #         print(f"Error normalizing url {url}")
- #         return None
-
-
  def normalize_cc_url(url: Optional[str]):
      """Normalize Creative Commons URL"""
      if url is None or not isinstance(url, str):
          return None
      url = normalize_url(url, secure=True)
+     if url and url.endswith("/"):
+         url = url.strip("/")
      return NORMALIZED_LICENSES.get(url, url)
 
@@ -333,6 +285,115 @@ def validate_isni(isni: Optional[str]) -> Optional[str]:
      return isni
 
 
+ def validate_mag(mag: Optional[str]) -> Optional[str]:
+     """Validate Microsoft Academic Graph ID (mag)"""
+     if mag is None or not isinstance(mag, str):
+         return None
+     match = re.search(
+         r"\A(\d{4,10})\Z",
+         mag,
+     )
+     if match is None:
+         return None
+     return match.group(1)
+
+
+ def validate_openalex(openalex: Optional[str]) -> Optional[str]:
+     """Validate OpenAlex ID"""
+     if openalex is None or not isinstance(openalex, str):
+         return None
+     match = re.search(
+         r"\A(?:(?:http|https)://openalex\.org/)?([AFIPSW]\d{8,10})\Z",
+         openalex,
+     )
+     if match is None:
+         return None
+     return match.group(1)
+
+
+ def validate_pmid(pmid: Optional[str]) -> Optional[str]:
+     """Validate PubMed ID (pmid)"""
+     if pmid is None or not isinstance(pmid, str):
+         return None
+     match = re.search(
+         r"\A(?:(?:http|https)://pubmed\.ncbi\.nlm\.nih\.gov/)?(\d{4,8})\Z",
+         pmid,
+     )
+     if match is None:
+         return None
+     return match.group(1)
+
+
+ def validate_pmcid(pmcid: Optional[str]) -> Optional[str]:
+     """Validate PubMed Central ID (pmcid)"""
+     if pmcid is None or not isinstance(pmcid, str):
+         return None
+     match = re.search(
+         r"\A(?:(?:http|https)://www\.ncbi\.nlm\.nih\.gov/pmc/articles/)?(\d{4,8})\Z",
+         pmcid,
+     )
+     if match is None:
+         return None
+     return match.group(1)
+
+
+ def validate_id(id: Optional[str]) -> tuple[Optional[str], Optional[str]]:
+     """
+     Validate an identifier and return the validated identifier and its type.
+
+     Args:
+         id: The identifier string to validate
+
+     Returns:
+         A tuple containing (validated_id, id_type) or (None, None) if invalid
+     """
+     if id is None:
+         return None, None
+
+     # Check if it's a DOI
+     doi = validate_doi(id)
+     if doi:
+         return normalize_doi(id), "DOI"
+
+     # Check if it's an ORCID
+     orcid = validate_orcid(id)
+     if orcid:
+         return normalize_orcid(id), "ORCID"
+
+     # Check if it's a ROR
+     ror = validate_ror(id)
+     if ror:
+         return normalize_ror(id), "ROR"
+
+     # Check if it's an ISNI
+     isni = validate_isni(id)
+     if isni:
+         return normalize_isni(id), "ISNI"
+
+     # Check if it's an OpenAlex ID
+     openalex = validate_openalex(id)
+     if openalex:
+         return f"https://openalex.org/{openalex}", "OpenAlex"
+
+     # Check if it's a PubMed ID
+     pmid = validate_pmid(id)
+     if pmid:
+         return f"https://pubmed.ncbi.nlm.nih.gov/{pmid}", "PMID"
+
+     # Check if it's a PubMed Central ID
+     pmcid = validate_pmcid(id)
+     if pmcid:
+         return f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}", "PMCID"
+
+     # Check if it's a URL
+     url_type = validate_url(id)
+     if url_type:
+         return normalize_url(id), url_type
+
+     # No known valid identifier type was found
+     return None, None
+
+
  def normalize_isni(isni: Optional[str]) -> Optional[str]:
      """Normalize ISNI"""
      if isni is None or not isinstance(isni, str):
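The new validate_id tries each supported scheme in order and falls back to (None, None). Expected results (inputs illustrative):

    validate_id("https://doi.org/10.7554/elife.01567")
    # ("https://doi.org/10.7554/elife.01567", "DOI")
    validate_id("https://openalex.org/W2741809807")
    # ("https://openalex.org/W2741809807", "OpenAlex")
    validate_id("not an identifier")
    # (None, None)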
@@ -1129,7 +1190,9 @@ def replace_curie(string: Optional[str]) -> Optional[str]:
      if string is None:
          return None
      match = re.sub(
-         r"((?:doi|DOI):\s?([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))", r'https://doi.org/\2', string
+         r"((?:doi|DOI):\s?([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))",
+         r"https://doi.org/\2",
+         string,
      )
      if match is None:
          return None
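Behavior is unchanged here; the re.sub call is only reflowed. For example (illustrative input):

    replace_curie("See doi:10.5555/12345678 for details")
    # "See https://doi.org/10.5555/12345678 for details"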
commonmeta/writers/bibtex_writer.py CHANGED
@@ -1,4 +1,5 @@
  """Bibtex writer for commonmeta-py"""
+
  from bibtexparser.bwriter import BibTexWriter
  from bibtexparser.bibdatabase import BibDatabase
  from bibtexparser.customization import page_double_hyphen
commonmeta/writers/citation_writer.py CHANGED
@@ -1,4 +1,5 @@
  """Citation writer for commonmeta-py"""
+
  import orjson as json
  import re
  from pydash import py_
commonmeta/writers/crossref_xml_writer.py CHANGED
@@ -1,4 +1,5 @@
  """Crossref XML writer for commonmeta-py"""
+
  from typing import Optional
  from ..constants import Commonmeta
  from ..crossref_utils import generate_crossref_xml, generate_crossref_xml_list
commonmeta/writers/csl_writer.py CHANGED
@@ -1,4 +1,5 @@
  """CSL-JSON writer for commonmeta-py"""
+
  import orjson as json
  from typing import Optional
 
commonmeta/writers/datacite_writer.py CHANGED
@@ -1,4 +1,5 @@
  """DataCite writer for commonmeta-py"""
+
  import orjson as json
  from typing import Optional, Union
 
commonmeta/writers/ris_writer.py CHANGED
@@ -1,4 +1,5 @@
  """RIS writer for commonmeta-py"""
+
  from ..utils import to_ris
  from ..base_utils import compact, wrap, presence, parse_attributes
  from ..doi_utils import doi_from_url