commonmeta-py 0.100-py3-none-any.whl → 0.103-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- commonmeta/__init__.py +51 -50
- commonmeta/author_utils.py +7 -1
- commonmeta/base_utils.py +1 -0
- commonmeta/constants.py +35 -1
- commonmeta/crossref_utils.py +11 -8
- commonmeta/date_utils.py +1 -0
- commonmeta/doi_utils.py +42 -14
- commonmeta/metadata.py +209 -100
- commonmeta/readers/cff_reader.py +1 -0
- commonmeta/readers/codemeta_reader.py +1 -0
- commonmeta/readers/commonmeta_reader.py +1 -0
- commonmeta/readers/crossref_reader.py +19 -18
- commonmeta/readers/csl_reader.py +4 -1
- commonmeta/readers/inveniordm_reader.py +14 -9
- commonmeta/readers/json_feed_reader.py +9 -3
- commonmeta/readers/kbase_reader.py +1 -0
- commonmeta/readers/openalex_reader.py +380 -0
- commonmeta/readers/ris_reader.py +1 -0
- commonmeta/readers/schema_org_reader.py +2 -3
- commonmeta/schema_utils.py +1 -0
- commonmeta/utils.py +126 -63
- commonmeta/writers/bibtex_writer.py +1 -0
- commonmeta/writers/citation_writer.py +1 -0
- commonmeta/writers/crossref_xml_writer.py +1 -0
- commonmeta/writers/csl_writer.py +1 -0
- commonmeta/writers/datacite_writer.py +1 -0
- commonmeta/writers/ris_writer.py +1 -0
- commonmeta/writers/schema_org_writer.py +1 -0
- {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/METADATA +5 -8
- {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/RECORD +33 -32
- {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/licenses/LICENSE +1 -1
- {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/WHEEL +0 -0
- {commonmeta_py-0.100.dist-info → commonmeta_py-0.103.dist-info}/entry_points.txt +0 -0
commonmeta/readers/openalex_reader.py
ADDED
@@ -0,0 +1,380 @@
+"""OpenAlex reader for commonmeta-py"""
+
+from typing import Optional
+
+import httpx
+from pydash import py_
+
+from ..author_utils import get_authors
+from ..base_utils import compact, presence, sanitize, wrap
+from ..constants import (
+    CR_TO_CM_TRANSLATIONS,
+    OA_TO_CM_CONTAINER_TRANLATIONS,
+    OA_TO_CM_TRANSLATIONS,
+    Commonmeta,
+)
+from ..doi_utils import (
+    normalize_doi,
+    openalex_api_sample_url,
+    openalex_api_url,
+)
+from ..utils import (
+    dict_to_spdx,
+    normalize_url,
+    validate_openalex,
+)
+
+# Map OpenAlex license strings to SPDX licenseId. May not be the correct license version.
+OA_LICENSES = {"cc-by": "CC-BY-4.0", "cc0": "CC0-1.0"}
+OA_IDENTIFIER_TYPES = {
+    "openalex": "OpenAlex",
+    "doi": "DOI",
+    "mag": "MAG",
+    "pmid": "PMID",
+    "pmcid": "PMCID",
+}
+
+
+def get_openalex(pid: str, **kwargs) -> dict:
+    """get_openalex"""
+    doi = normalize_doi(pid)
+    if doi is None:
+        return {"state": "not_found"}
+    url = openalex_api_url(doi)
+    response = httpx.get(url, timeout=10, **kwargs)
+    if response.status_code != 200:
+        return {"state": "not_found"}
+    return response.json() | {"via": "openalex"}
+
+
+def read_openalex(data: Optional[dict], **kwargs) -> Commonmeta:
+    """read_openalex"""
+    if data is None:
+        return {"state": "not_found"}
+    meta = data
+    read_options = kwargs or {}
+
+    doi = meta.get("doi", None)
+    _id = normalize_doi(doi)
+    _type = CR_TO_CM_TRANSLATIONS.get(meta.get("type_crossref", None)) or "Other"
+    additional_type = OA_TO_CM_TRANSLATIONS.get(meta.get("type", None))
+    if additional_type == _type:
+        additional_type = None
+
+    archive_locations = []
+    contributors = get_contributors(wrap(meta.get("authorships")))
+    contributors = get_authors(contributors)
+
+    url = normalize_url(
+        py_.get(meta, "primary_location.landing_page_url") or py_.get(meta, "id")
+    )
+    title = meta.get("title", None)
+    if title is not None:
+        titles = [{"title": sanitize(title)}]
+    else:
+        titles = None
+    publisher = compact(
+        {"name": py_.get(meta, "primary_location.source.host_organization_name")}
+    )
+    date = compact(
+        {
+            "published": py_.get(meta, "publication_date")
+            or py_.get(meta, "created_date")
+        }
+    )
+    identifiers = [
+        {
+            "identifier": uid,
+            "identifierType": OA_IDENTIFIER_TYPES[uidType],
+        }
+        for uidType, uid in (meta.get("ids", {})).items()
+    ]
+
+    license_ = py_.get(meta, "best_oa_location.license")
+    if license_ is not None:
+        license_ = OA_LICENSES.get(license_, license_)
+        license_ = dict_to_spdx({"id": license_})
+    container = get_container(meta)
+    relations = []
+    references = [
+        get_related(i) for i in get_references(meta.get("referenced_works", []))
+    ]
+    funding_references = from_openalex_funding(wrap(meta.get("grants", None)))
+
+    description = get_abstract(meta)
+    if description is not None:
+        descriptions = [{"description": sanitize(description), "type": "Abstract"}]
+    else:
+        descriptions = None
+
+    subjects = py_.uniq(
+        [
+            {"subject": py_.get(i, "subfield.display_name")}
+            for i in wrap(meta.get("topics", None))
+        ]
+    )
+    files = get_files(meta)
+
+    return {
+        # required properties
+        "id": _id,
+        "type": _type,
+        # recommended and optional properties
+        "additionalType": additional_type,
+        "archiveLocations": presence(archive_locations),
+        "container": presence(container),
+        "contributors": presence(contributors),
+        "date": presence(date),
+        "descriptions": presence(descriptions),
+        "files": presence(files),
+        "fundingReferences": presence(funding_references),
+        "geoLocations": None,
+        "identifiers": identifiers,
+        "language": meta.get("language", None),
+        "license": license_,
+        "provider": "OpenAlex",
+        "publisher": presence(publisher),
+        "references": presence(references),
+        "relations": presence(relations),
+        "subjects": presence(subjects),
+        "titles": presence(titles),
+        "url": url,
+        "version": meta.get("version", None),
+    } | read_options
+
+
+def get_abstract(meta):
+    """Parse abstract from OpenAlex abstract_inverted_index"""
+    abstract_inverted_index = py_.get(meta, "abstract_inverted_index")
+
+    if abstract_inverted_index:
+        # Determine the length of the abstract
+        max_pos = max(
+            p for positions in abstract_inverted_index.values() for p in positions
+        )
+        abstract_words = [""] * (max_pos + 1)
+
+        for word, positions in abstract_inverted_index.items():
+            for p in positions:
+                abstract_words[p] = word
+
+        abstract = " ".join(abstract_words)
+    else:
+        abstract = None
+    return abstract
+
+
+def get_contributors(contributors: list) -> list:
+    """Parse contributor"""
+
+    def parse_contributor(c):
+        affiliations = []
+        for affiliation in c.get("institutions", []):
+            affiliations.append(
+                compact(
+                    {
+                        "id": affiliation.get("ror", None),
+                        "name": affiliation.get("display_name", None),
+                    }
+                )
+            )
+
+        return compact(
+            {
+                "id": py_.get(c, "author.orcid"),
+                "name": py_.get(c, "author.display_name"),
+                "affiliations": affiliations,
+            }
+        )
+
+    return [parse_contributor(i) for i in contributors]
+
+
+def get_references(pids: list, **kwargs) -> list:
+    """Get related articles from OpenAlex using their pid.
+    Used for retrieving metadata for citations and references which are not included in the OpenAlex record.
+    """
+    references = get_openalex_works(pids)
+    return references
+
+
+def get_citations(citation_url: str, **kwargs) -> list:
+    response = httpx.get(citation_url, timeout=10, **kwargs)
+    if response.status_code != 200:
+        return {"state": "not_found"}
+    response = response.json()
+    return response.get("results", [])
+
+
+def get_related(related: Optional[dict]) -> Optional[dict]:
+    """Get reference from OpenAlex reference"""
+    if related is None or not isinstance(related, dict):
+        return None
+    doi = related.get("doi", None)
+    metadata = {
+        "id": normalize_doi(doi) if doi else None,
+        "contributor": related.get("author", None),
+        "title": related.get("display_name", None),
+        "publisher": py_.get(
+            related, "primary_location.source.host_organization_name"
+        ),
+        "publicationYear": related.get("publication_year", None),
+        "volume": py_.get(related, "biblio.volume"),
+        "issue": py_.get(related, "biblio.issue"),
+        "firstPage": py_.get(related, "biblio.first_page"),
+        "lastPage": py_.get(related, "biblio.last_page"),
+        "containerTitle": py_.get(related, "primary_location.source.display_name"),
+    }
+    return compact(metadata)
+
+
+def get_openalex_works(pids: list, **kwargs) -> list:
+    """Get OpenAlex works, use batches of 49 to honor API limit."""
+    pid_batches = [pids[i : i + 49] for i in range(0, len(pids), 49)]
+    works = []
+    for pid_batch in pid_batches:
+        ids = "|".join(pid_batch)
+        url = f"https://api.openalex.org/works?filter=ids.openalex:{ids}"
+        response = httpx.get(url, timeout=10, **kwargs)
+        if response.status_code != 200:
+            return {"state": "not_found"}
+        response = response.json()
+        if py_.get(response, "count") == 0:
+            return {"state": "not_found"}
+
+        works.extend(response.get("results"))
+
+    return works
+
+
+def get_openalex_funders(pids: list, **kwargs) -> list:
+    """Get ROR id and name from OpenAlex funders.
+    use batches of 49 to honor API limit."""
+    pid_batches = [pids[i : i + 49] for i in range(0, len(pids), 49)]
+    funders = []
+    for pid_batch in pid_batches:
+        ids = "|".join(pid_batch)
+        url = f"https://api.openalex.org/funders?filter=ids.openalex:{ids}"
+        response = httpx.get(url, timeout=10, **kwargs)
+        if response.status_code != 200:
+            return {"state": "not_found"}
+        response = response.json()
+        if py_.get(response, "count") == 0:
+            return {"state": "not_found"}
+
+        def format_funder(funder):
+            return compact(
+                {
+                    "id": py_.get(funder, "id"),
+                    "ror": py_.get(funder, "ids.ror"),
+                    "name": py_.get(funder, "display_name"),
+                }
+            )
+
+        f = [format_funder(i) for i in response.get("results")]
+        funders.extend(f)
+
+    return funders
+
+
+def get_openalex_source(str: Optional[str], **kwargs) -> Optional[dict]:
+    """Get issn, name, homepage_url and type from OpenAlex source."""
+    id = validate_openalex(str)
+    if not id:
+        return None
+
+    url = f"https://api.openalex.org/sources/{id}"
+    response = httpx.get(url, timeout=10, **kwargs)
+    if response.status_code != 200:
+        return {"state": "not_found"}
+    response = response.json()
+    if py_.get(response, "count") == 0:
+        return {"state": "not_found"}
+
+    return compact(
+        {
+            "id": py_.get(response, "id"),
+            "url": py_.get(response, "homepage_url"),
+            "issn": py_.get(response, "issn_l"),
+            "title": py_.get(response, "display_name"),
+            "type": py_.get(response, "type"),
+        }
+    )
+
+
+def get_files(meta) -> Optional[list]:
+    """get file links"""
+    pdf_url = py_.get(meta, "best_oa_location.pdf_url")
+    if pdf_url is None:
+        return None
+    return [
+        {"mimeType": "application/pdf", "url": pdf_url},
+    ]
+
+
+def get_container(meta: dict) -> dict:
+    """Get container from OpenAlex"""
+    source = get_openalex_source(py_.get(meta, "primary_location.source.id"))
+    print(source)
+    container_type = py_.get(source, "type")
+    if container_type:
+        container_type = OA_TO_CM_CONTAINER_TRANLATIONS.get(
+            container_type, container_type
+        )
+    issn = py_.get(source, "issn")
+    container_title = py_.get(source, "title")
+    url_ = py_.get(source, "url")
+
+    return compact(
+        {
+            "type": container_type,
+            "identifier": issn or url_,
+            "identifierType": "ISSN" if issn else "URL" if url_ else None,
+            "title": container_title,
+            "volume": py_.get(meta, "biblio.volume"),
+            "issue": py_.get(meta, "biblio.issue"),
+            "firstPage": py_.get(meta, "biblio.first_page"),
+            "lastPage": py_.get(meta, "biblio.last_page"),
+        }
+    )
+
+
+def from_openalex_funding(funding_references: list) -> list:
+    """Get funding references from OpenAlex"""
+    funder_ids = [
+        validate_openalex(funding.get("funder"))
+        for funding in funding_references
+        if "funder" in funding
+    ]
+    funders = get_openalex_funders(funder_ids)
+    formatted_funding_references = []
+    for funding in funding_references:
+        funder = next(
+            item for item in funders if item["id"] == funding.get("funder", None)
+        )
+        f = compact(
+            {
+                "funderName": funder.get("name", None),
+                "funderIdentifier": funder.get("ror", None),
+                "funderIdentifierType": "ROR" if funder.get("ror", None) else None,
+                "awardNumber": funding.get("award_id", None),
+            }
+        )
+        formatted_funding_references.append(f)
+    return py_.uniq(formatted_funding_references)
+
+
+def get_random_id_from_openalex(number: int = 1, **kwargs) -> list:
+    """Get random ID from OpenAlex"""
+    number = min(number, 20)
+    url = openalex_api_sample_url(number, **kwargs)
+    try:
+        response = httpx.get(url, timeout=10)
+        if response.status_code != 200:
+            return []
+
+        items = py_.get(response.json(), "results")
+        print(items)
+        return [i.get("id") for i in items]
+    except (httpx.ReadTimeout, httpx.ConnectError):
+        return []
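The new reader follows the same fetch-then-parse split as the other commonmeta readers: get_openalex retrieves the raw record, read_openalex maps it to commonmeta metadata. A minimal usage sketch (the DOI is a placeholder, and get_openalex performs a live HTTP request against the OpenAlex API):

from commonmeta.readers.openalex_reader import get_openalex, read_openalex

# Fetch the OpenAlex record for a DOI, then map it to commonmeta metadata.
data = get_openalex("https://doi.org/10.7554/elife.01567")
if data.get("state") != "not_found":
    metadata = read_openalex(data)
    print(metadata["id"], metadata["type"])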
commonmeta/readers/ris_reader.py
CHANGED
commonmeta/readers/schema_org_reader.py
CHANGED
@@ -410,9 +410,8 @@ def get_html_meta(soup):
     pid = pid.get("content", None) or pid.get("href", None)
     data["@id"] = normalize_id(pid)

-    _type = (
-        soup.select_one("meta[name='dc.type']")
-        or soup.select_one("meta[name='DC.type']")
+    _type = soup.select_one("meta[name='dc.type']") or soup.select_one(
+        "meta[name='DC.type']"
     )
     data["@type"] = _type["content"].capitalize() if _type else None
    if _type is None:
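The change above is a pure reformat; both selectors remain because CSS attribute values are matched case-sensitively, so pages using either spelling of the Dublin Core meta tag are covered. A small sketch (the HTML snippet is illustrative):

from bs4 import BeautifulSoup

# Attribute values in CSS selectors are case-sensitive, so both
# "dc.type" and "DC.type" must be tried.
html = '<meta name="DC.type" content="article">'
soup = BeautifulSoup(html, "html.parser")
_type = soup.select_one("meta[name='dc.type']") or soup.select_one(
    "meta[name='DC.type']"
)
print(_type["content"])  # article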
commonmeta/schema_utils.py
CHANGED
commonmeta/utils.py
CHANGED
@@ -1,22 +1,22 @@
 """Utils module for commonmeta-py"""

 import os
-import orjson as json
 import re
 import time
 from typing import Optional
 from urllib.parse import urlparse
-
-from furl import furl
+
 import bibtexparser
+import orjson as json
+import pycountry
+import yaml
 from bs4 import BeautifulSoup
+from furl import furl
 from pydash import py_
-import pycountry

-from .base_utils import
-from .doi_utils import normalize_doi, doi_from_url, get_doi_ra, validate_doi, doi_as_url
+from .base_utils import compact, parse_attributes, wrap
 from .constants import DATACITE_CONTRIBUTOR_TYPES
-
+from .doi_utils import doi_as_url, doi_from_url, get_doi_ra, normalize_doi, validate_doi

 NORMALIZED_LICENSES = {
     "https://creativecommons.org/licenses/by/1.0": "https://creativecommons.org/licenses/by/1.0/legalcode",
@@ -144,17 +144,13 @@ def normalize_id(pid: Optional[str], **kwargs) -> Optional[str]:
         return doi

     # check for valid HTTP uri and ensure https
-    uri = urlparse(pid)
-    if not uri.netloc or uri.scheme not in ["http", "https"]:
+    f = furl(pid)
+    if not f.host or f.scheme not in ["http", "https"]:
         return None
-    if uri.scheme == "http":
-        pid = pid.replace("http://", "https://")
+    if f.scheme == "http":
+        f.scheme = "https"

-
-    if pid.endswith("/"):
-        pid = pid.strip("/")
-
-    return pid
+    return f.url


 def normalize_ids(ids: list, relation_type=None) -> list:
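The rewritten check delegates URL parsing to furl instead of urlparse, and no longer strips trailing slashes. A standalone sketch of the new logic (the function name is ours, not the library's):

from furl import furl

def ensure_https(pid):
    # Require an http(s) URL with a host; upgrade http to https.
    f = furl(pid)
    if not f.host or f.scheme not in ["http", "https"]:
        return None
    if f.scheme == "http":
        f.scheme = "https"
    return f.url

print(ensure_https("http://example.org/abc"))  # https://example.org/abc
print(ensure_https("ftp://example.org/abc"))   # None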
@@ -190,8 +186,6 @@ def normalize_url(
     if url is None or not isinstance(url, str):
         return None
     url = url.strip()
-    if url.endswith("/"):
-        url = url.strip("/")
     scheme = urlparse(url).scheme
     if not scheme or scheme not in ["http", "https"]:
         return None
@@ -202,55 +196,13 @@ def normalize_url(
     return url


-# def normalize_url(url: Optional[str], secure=False, fragments=False, lower=False) -> Optional[str]:
-#     """Normalize URL"""
-#     if url is None or not isinstance(url, str):
-#         return None
-#     try:
-#         f = furl(url.strip())
-#         f.path.normalize()
-
-#         # only allow http and https schemes
-#         if f.scheme not in ["http", "https"]:
-#             return None
-#         if secure and f.scheme == "http":
-#             f.set(scheme="https")
-
-#         # remove index.html
-#         if f.path.segments and f.path.segments[-1] in ["index.html"]:
-#             f.path.segments.pop(-1)
-
-#         # remove fragments
-#         if fragments:
-#             f.remove(fragment=True)
-
-#         # remove specific query parameters
-#         f.remove(
-#             [
-#                 "origin",
-#                 "ref",
-#                 "referrer",
-#                 "source",
-#                 "utm_content",
-#                 "utm_medium",
-#                 "utm_campaign",
-#                 "utm_source",
-#             ]
-#         )
-
-#         if lower:
-#             return f.url.lower().strip("/")
-#         return f.url.strip("/")
-#     except ValueError:
-#         print(f"Error normalizing url {url}")
-#         return None
-
-
 def normalize_cc_url(url: Optional[str]):
     """Normalize Creative Commons URL"""
     if url is None or not isinstance(url, str):
         return None
     url = normalize_url(url, secure=True)
+    if url and url.endswith("/"):
+        url = url.strip("/")
     return NORMALIZED_LICENSES.get(url, url)

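The trailing-slash handling removed from normalize_url reappears here because the NORMALIZED_LICENSES table is keyed by slash-less URLs. A small sketch of the normalization it protects (the URL is illustrative):

url = "https://creativecommons.org/licenses/by/4.0/"
# Strip the trailing slash so the license table lookup can match.
if url.endswith("/"):
    url = url.strip("/")
print(url)  # https://creativecommons.org/licenses/by/4.0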
@@ -333,6 +285,115 @@ def validate_isni(isni: Optional[str]) -> Optional[str]:
     return isni


+def validate_mag(mag: Optional[str]) -> Optional[str]:
+    """Validate Microsoft Academic Graph ID (mag)"""
+    if mag is None or not isinstance(mag, str):
+        return None
+    match = re.search(
+        r"\A(\d{4,10})\Z",
+        mag,
+    )
+    if match is None:
+        return None
+    return match.group(1)
+
+
+def validate_openalex(openalex: Optional[str]) -> Optional[str]:
+    """Validate OpenAlex ID"""
+    if openalex is None or not isinstance(openalex, str):
+        return None
+    match = re.search(
+        r"\A(?:(?:http|https)://openalex\.org/)?([AFIPSW]\d{8,10})\Z",
+        openalex,
+    )
+    if match is None:
+        return None
+    return match.group(1)
+
+
+def validate_pmid(pmid: Optional[str]) -> Optional[str]:
+    """Validate PubMed ID (pmid)"""
+    if pmid is None or not isinstance(pmid, str):
+        return None
+    match = re.search(
+        r"\A(?:(?:http|https)://pubmed\.ncbi\.nlm\.nih\.gov/)?(\d{4,8})\Z",
+        pmid,
+    )
+    if match is None:
+        return None
+    return match.group(1)
+
+
+def validate_pmcid(pmcid: Optional[str]) -> Optional[str]:
+    """Validate PubMed Central ID (pmcid)"""
+    if pmcid is None or not isinstance(pmcid, str):
+        return None
+    match = re.search(
+        r"\A(?:(?:http|https)://www\.ncbi\.nlm\.nih\.gov/pmc/articles/)?(\d{4,8})\Z",
+        pmcid,
+    )
+    if match is None:
+        return None
+    return match.group(1)
+
+
+def validate_id(id: Optional[str]) -> tuple[Optional[str], Optional[str]]:
+    """
+    Validate an identifier and return the validated identifier and its type.
+
+    Args:
+        id: The identifier string to validate
+
+    Returns:
+        A tuple containing (validated_id, id_type) or (None, None) if invalid
+    """
+    if id is None:
+        return None, None
+
+    # Check if it's a DOI
+    doi = validate_doi(id)
+    if doi:
+        return normalize_doi(id), "DOI"
+
+    # Check if it's an ORCID
+    orcid = validate_orcid(id)
+    if orcid:
+        return normalize_orcid(id), "ORCID"
+
+    # Check if it's a ROR
+    ror = validate_ror(id)
+    if ror:
+        return normalize_ror(id), "ROR"
+
+    # Check if it's an ISNI
+    isni = validate_isni(id)
+    if isni:
+        return normalize_isni(id), "ISNI"
+
+    # Check if it's an OpenAlex ID
+    openalex = validate_openalex(id)
+    if openalex:
+        return f"https://openalex.org/{openalex}", "OpenAlex"
+
+    # Check if it's a PubMed ID
+    pmid = validate_pmid(id)
+    if pmid:
+        return f"https://pubmed.ncbi.nlm.nih.gov/{pmid}", "PMID"
+
+    # Check if it's a PubMed Central ID
+    pmcid = validate_pmcid(id)
+    if pmcid:
+        return f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}", "PMCID"
+
+    # Check if it's a URL
+    url_type = validate_url(id)
+    if url_type:
+        return normalize_url(id), url_type
+
+    # No known valid identifier type was found
+    return None, None
+
+
 def normalize_isni(isni: Optional[str]) -> Optional[str]:
     """Normalize ISNI"""
     if isni is None or not isinstance(isni, str):
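The new validators share one pattern: accept either the bare identifier or its canonical URL form, and return the bare identifier. A quick check of the OpenAlex regex (the ID values are illustrative):

import re

OPENALEX_PATTERN = r"\A(?:(?:http|https)://openalex\.org/)?([AFIPSW]\d{8,10})\Z"
for value in ["W2741809807", "https://openalex.org/W2741809807", "X123"]:
    match = re.search(OPENALEX_PATTERN, value)
    print(value, "->", match.group(1) if match else None)
# W2741809807 -> W2741809807
# https://openalex.org/W2741809807 -> W2741809807
# X123 -> None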
@@ -1129,7 +1190,9 @@ def replace_curie(string: Optional[str]) -> Optional[str]:
     if string is None:
         return None
     match = re.sub(
-        r"((?:doi|DOI):\s?([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))",
+        r"((?:doi|DOI):\s?([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))",
+        r"https://doi.org/\2",
+        string,
     )
     if match is None:
         return None
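With the replacement string and input now passed to re.sub, DOI CURIEs in free text are expanded to doi.org URLs; capture group 2 holds the bare DOI. A sketch using the example DOI prefix 10.5555:

import re

pattern = r"((?:doi|DOI):\s?([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))"
text = "See doi:10.5555/12345678 for details."
print(re.sub(pattern, r"https://doi.org/\2", text))
# See https://doi.org/10.5555/12345678 for details.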
commonmeta/writers/csl_writer.py
CHANGED