PyPI - invenio-vocabularies - Versions diffs - 3.4.2__py2.py3-none-any.whl → 4.1.1__py2.py3-none-any.whl - Mend

invenio-vocabularies 3.4.2__py2.py3-none-any.whl → 4.1.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of invenio-vocabularies might be problematic. Click here for more details.

Files changed (69)

invenio_vocabularies/contrib/awards/datastreams.py CHANGED Viewed

@@ -1,22 +1,79 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2022-2024 CERN.
 #
 # Invenio-Vocabularies is free software; you can redistribute it and/or
 # modify it under the terms of the MIT License; see LICENSE file for more
 # details.
 """Awards datastreams, transformers, writers and readers."""
+import io
+import requests
 from invenio_access.permissions import system_identity
 from invenio_i18n import lazy_gettext as _
-from ...datastreams.errors import TransformerError
+from ...datastreams.errors import ReaderError, TransformerError
+from ...datastreams.readers import BaseReader
 from ...datastreams.transformers import BaseTransformer
 from ...datastreams.writers import ServiceWriter
 from .config import awards_ec_ror_id, awards_openaire_funders_mapping
+class OpenAIREProjectHTTPReader(BaseReader):
+    """OpenAIRE Project HTTP Reader returning an in-memory binary stream of the latest OpenAIRE Graph Dataset project tar file."""
+    def _iter(self, fp, *args, **kwargs):
+        raise NotImplementedError(
+            "OpenAIREProjectHTTPReader downloads one file and therefore does not iterate through items"
+        )
+    def read(self, item=None, *args, **kwargs):
+        """Reads the latest OpenAIRE Graph Dataset project tar file from Zenodo and yields an in-memory binary stream of it."""
+        if item:
+            raise NotImplementedError(
+                "OpenAIREProjectHTTPReader does not support being chained after another reader"
+            )
+        if self._origin == "full":
+            # OpenAIRE Graph Dataset
+            api_url = "https://zenodo.org/api/records/3516917"
+        elif self._origin == "diff":
+            # OpenAIRE Graph dataset: new collected projects
+            api_url = "https://zenodo.org/api/records/6419021"
+        else:
+            raise ReaderError("The --origin option should be either 'full' or 'diff'")
+        # Call the signposting `linkset+json` endpoint for the Concept DOI (i.e. latest version) of the OpenAIRE Graph Dataset.
+        # See: https://github.com/inveniosoftware/rfcs/blob/master/rfcs/rdm-0071-signposting.md#provide-an-applicationlinksetjson-endpoint
+        headers = {"Accept": "application/linkset+json"}
+        api_resp = requests.get(api_url, headers=headers)
+        api_resp.raise_for_status()
+        # Extract the Landing page Link Set Object located as the first (index 0) item.
+        landing_page_linkset = api_resp.json()["linkset"][0]
+        # Extract the URL of the only project tar file linked to the record.
+        landing_page_project_tar_items = [
+            item
+            for item in landing_page_linkset["item"]
+            if item["type"] == "application/x-tar"
+            and item["href"].endswith("/project.tar")
+        ]
+        if len(landing_page_project_tar_items) != 1:
+            raise ReaderError(
+                f"Expected 1 project tar item but got {len(landing_page_project_tar_items)}"
+            )
+        file_url = landing_page_project_tar_items[0]["href"]
+        # Download the project tar file and fully load the response bytes content in memory.
+        # The bytes content are then wrapped by a BytesIO to be file-like object (as required by `tarfile.open`).
+        # Using directly `file_resp.raw` is not possible since `tarfile.open` requires the file-like object to be seekable.
+        file_resp = requests.get(file_url)
+        file_resp.raise_for_status()
+        yield io.BytesIO(file_resp.content)
 class AwardsServiceWriter(ServiceWriter):
     """Funders service writer."""
@@ -39,7 +96,20 @@ class OpenAIREProjectTransformer(BaseTransformer):
         award = {}
         code = record["code"]
-        openaire_funder_prefix = record["id"].split("::")[0].split("|")[1]
+        # The `id` should follow the format `sourcePrefix::md5(localId)` where `sourcePrefix` is 12 characters long.
+        # See: https://graph.openaire.eu/docs/data-model/pids-and-identifiers#identifiers-in-the-graph
+        #
+        # The format of `id` in the full OpenAIRE Graph Dataset (https://doi.org/10.5281/zenodo.3516917)
+        # follows this format (e.g. 'abc_________::0123456789abcdef0123456789abcdef').
+        # However, the format of `id` in the new collected projects dataset (https://doi.org/10.5281/zenodo.6419021)
+        # does not follow this format, and has a `40|` prefix (e.g. '40|abc_________::0123456789abcdef0123456789abcdef').
+        #
+        # The number '40' corresponds to the entity types 'Project'.
+        # See: https://ec.europa.eu/research/participants/documents/downloadPublic?documentIds=080166e5a3a1a213&appId=PPGMS
+        # See: https://graph.openaire.eu/docs/5.0.0/data-model/entities/project#id
+        openaire_funder_prefix = record["id"].split("::", 1)[0].split("|", 1)[-1]
         funder_id = awards_openaire_funders_mapping.get(openaire_funder_prefix)
         if funder_id is None:
             raise TransformerError(
@@ -78,7 +148,20 @@ class OpenAIREProjectTransformer(BaseTransformer):
             award["identifiers"] = identifiers
         award["number"] = code
+        # `title` is a mandatory attribute of the `Project` object in the OpenAIRE Graph Data Model.
+        # See: https://graph.openaire.eu/docs/data-model/entities/project#title
+        # However, 15'000+ awards for the FCT funder (and 1 record the NIH funder) are missing a title attribute.
+        if "title" not in record:
+            raise TransformerError(
+                _(
+                    "Missing title attribute for award {award_id}".format(
+                        award_id=award["id"]
+                    )
+                )
+            )
         award["title"] = {"en": record["title"]}
         award["funder"] = {"id": funder_id}
         acronym = record.get("acronym")
         if acronym:
@@ -88,6 +171,10 @@ class OpenAIREProjectTransformer(BaseTransformer):
         return stream_entry
+VOCABULARIES_DATASTREAM_READERS = {
+    "openaire-project-http": OpenAIREProjectHTTPReader,
+}
 VOCABULARIES_DATASTREAM_TRANSFORMERS = {
     "openaire-award": OpenAIREProjectTransformer,
 }

invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json CHANGED Viewed

@@ -7,6 +7,9 @@
     "$schema": {
       "$ref": "local://definitions-v1.0.0.json#/$schema"
     },
+    "tags": {
+      "$ref": "local://vocabularies/definitions-v1.0.0.json#/tags"
+    },
     "identifiers": {
       "description": "Alternate identifiers for the award.",
       "type": "array",

invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json CHANGED Viewed

@@ -49,6 +49,9 @@
         "type": "object",
         "dynamic": "true"
       },
+      "tags": {
+        "type": "keyword"
+      },
       "number": {
         "type": "keyword"
       },

invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json CHANGED Viewed

@@ -49,6 +49,9 @@
         "type": "object",
         "dynamic": "true"
       },
+      "tags": {
+        "type": "keyword"
+      },
       "number": {
         "type": "keyword"
       },

invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json CHANGED Viewed

@@ -49,6 +49,9 @@
         "type": "object",
         "dynamic": "true"
       },
+      "tags": {
+        "type": "keyword"
+      },
       "number": {
         "type": "keyword"
       },

invenio_vocabularies/contrib/common/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+"""Vocabularies common module."""

invenio_vocabularies/contrib/common/ror/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+"""ROR-related module."""

invenio_vocabularies/contrib/common/ror/datastreams.py ADDED Viewed

@@ -0,0 +1,166 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+# Copyright (C) 2024 California Institute of Technology.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+"""ROR-related Datastreams Readers/Writers/Transformers module."""
+import io
+import requests
+from idutils import normalize_ror
+from invenio_vocabularies.datastreams.errors import ReaderError, TransformerError
+from invenio_vocabularies.datastreams.readers import BaseReader
+from invenio_vocabularies.datastreams.transformers import BaseTransformer
+class RORHTTPReader(BaseReader):
+    """ROR HTTP Reader returning an in-memory binary stream of the latest ROR data dump ZIP file."""
+    def _iter(self, fp, *args, **kwargs):
+        raise NotImplementedError(
+            "RORHTTPReader downloads one file and therefore does not iterate through items"
+        )
+    def read(self, item=None, *args, **kwargs):
+        """Reads the latest ROR data dump ZIP file from Zenodo and yields an in-memory binary stream of it."""
+        if item:
+            raise NotImplementedError(
+                "RORHTTPReader does not support being chained after another reader"
+            )
+        # Call the signposting `linkset+json` endpoint for the Concept DOI (i.e. latest version) of the ROR data dump.
+        # See: https://github.com/inveniosoftware/rfcs/blob/master/rfcs/rdm-0071-signposting.md#provide-an-applicationlinksetjson-endpoint
+        headers = {"Accept": "application/linkset+json"}
+        api_url = "https://zenodo.org/api/records/6347574"
+        api_resp = requests.get(api_url, headers=headers)
+        api_resp.raise_for_status()
+        # Extract the Landing page Link Set Object located as the first (index 0) item.
+        landing_page_linkset = api_resp.json()["linkset"][0]
+        # Extract the URL of the only ZIP file linked to the record.
+        landing_page_zip_items = [
+            item
+            for item in landing_page_linkset["item"]
+            if item["type"] == "application/zip"
+        ]
+        if len(landing_page_zip_items) != 1:
+            raise ReaderError(
+                f"Expected 1 ZIP item but got {len(landing_page_zip_items)}"
+            )
+        file_url = landing_page_zip_items[0]["href"]
+        # Download the ZIP file and fully load the response bytes content in memory.
+        # The bytes content are then wrapped by a BytesIO to be file-like object (as required by `zipfile.ZipFile`).
+        # Using directly `file_resp.raw` is not possible since `zipfile.ZipFile` requires the file-like object to be seekable.
+        file_resp = requests.get(file_url)
+        file_resp.raise_for_status()
+        yield io.BytesIO(file_resp.content)
+VOCABULARIES_DATASTREAM_READERS = {
+    "ror-http": RORHTTPReader,
+}
+class RORTransformer(BaseTransformer):
+    """Transforms a JSON ROR record into a funders record."""
+    def __init__(
+        self, *args, vocab_schemes=None, funder_fundref_doi_prefix=None, **kwargs
+    ):
+        """Initializes the transformer."""
+        self.vocab_schemes = vocab_schemes
+        self.funder_fundref_doi_prefix = funder_fundref_doi_prefix
+        super().__init__(*args, **kwargs)
+    def apply(self, stream_entry, **kwargs):
+        """Applies the transformation to the stream entry."""
+        record = stream_entry.entry
+        ror = {}
+        ror["title"] = {}
+        ror["id"] = normalize_ror(record.get("id"))
+        if not ror["id"]:
+            raise TransformerError(_("Id not found in ROR entry."))
+        # Using set so aliases are unique
+        aliases = set()
+        acronym = None
+        for name in record.get("names"):
+            lang = name.get("lang", "en")
+            if lang == None:
+                lang = "en"
+            if "ror_display" in name["types"]:
+                ror["name"] = name["value"]
+            if "label" in name["types"]:
+                ror["title"][lang] = name["value"]
+            if "alias" in name["types"]:
+                aliases.add(name["value"])
+            if "acronym" in name["types"]:
+                # The first acronyn goes in acronym field to maintain
+                # compatability with existing data structure
+                if not acronym:
+                    acronym = name["value"]
+                else:
+                    aliases.add(name["value"])
+        if acronym:
+            ror["acronym"] = acronym
+        if aliases:
+            ror["aliases"] = list(aliases)
+        # ror_display is required and should be in every entry
+        if not ror["name"]:
+            raise TransformerError(
+                _("Name with type ror_display not found in ROR entry.")
+            )
+        # This only gets the first location, to maintain compatability
+        # with existing data structure
+        location = record.get("locations", [{}])[0].get("geonames_details", {})
+        ror["country"] = location.get("country_code")
+        ror["country_name"] = location.get("country_name")
+        ror["location_name"] = location.get("name")
+        ror["types"] = record.get("types")
+        status = record.get("status")
+        ror["status"] = status
+        # The ROR is always listed in identifiers, expected by serialization
+        ror["identifiers"] = [{"identifier": ror["id"], "scheme": "ror"}]
+        if self.vocab_schemes:
+            valid_schemes = set(self.vocab_schemes.keys())
+        else:
+            valid_schemes = set()
+        fund_ref = "fundref"
+        if self.funder_fundref_doi_prefix:
+            valid_schemes.add(fund_ref)
+        for identifier in record.get("external_ids"):
+            scheme = identifier["type"]
+            if scheme in valid_schemes:
+                value = identifier.get("preferred") or identifier.get("all")[0]
+                if scheme == fund_ref:
+                    if self.funder_fundref_doi_prefix:
+                        value = f"{self.funder_fundref_doi_prefix}/{value}"
+                        scheme = "doi"
+                ror["identifiers"].append(
+                    {
+                        "identifier": value,
+                        "scheme": scheme,
+                    }
+                )
+        stream_entry.entry = ror
+        return stream_entry
+VOCABULARIES_DATASTREAM_TRANSFORMERS = {
+    "ror": RORTransformer,
+}

invenio_vocabularies/contrib/funders/config.py CHANGED Viewed

@@ -31,6 +31,8 @@ class FundersSearchOptions(SearchOptions):
         fields=[
             "name^100",
             "identifiers.identifier^10",
+            "acronym^10",
+            "aliases^10",
         ]
     )

invenio_vocabularies/contrib/funders/datastreams.py CHANGED Viewed

@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2022-2024 CERN.
+# Copyright (C) 2024 California Institute of Technology.
 #
 # Invenio-Vocabularies is free software; you can redistribute it and/or
 # modify it under the terms of the MIT License; see LICENSE file for more
@@ -12,8 +13,6 @@ from idutils import normalize_ror
 from invenio_access.permissions import system_identity
 from invenio_i18n import lazy_gettext as _
-from ...datastreams.errors import TransformerError
-from ...datastreams.transformers import BaseTransformer
 from ...datastreams.writers import ServiceWriter
 from .config import funder_fundref_doi_prefix, funder_schemes
@@ -31,60 +30,6 @@ class FundersServiceWriter(ServiceWriter):
         return entry["id"]
-class RORTransformer(BaseTransformer):
-    """Transforms a JSON ROR record into a funders record."""
-    def apply(self, stream_entry, **kwargs):
-        """Applies the transformation to the stream entry."""
-        record = stream_entry.entry
-        funder = {}
-        funder["id"] = normalize_ror(record.get("id"))
-        if not funder["id"]:
-            raise TransformerError(_("Id not found in ROR entry."))
-        funder["name"] = record.get("name")
-        if not funder["name"]:
-            raise TransformerError(_("Name not found in ROR entry."))
-        country_code = record.get("country", {}).get("country_code")
-        if country_code:
-            funder["country"] = country_code
-        funder["title"] = {"en": funder["name"]}
-        for label in record.get("labels", []):
-            funder["title"][label["iso639"]] = label["label"]
-        # The ROR is always listed in identifiers, expected by serialization
-        funder["identifiers"] = [{"identifier": funder["id"], "scheme": "ror"}]
-        valid_schemes = set(funder_schemes.keys())
-        fund_ref = "fundref"
-        valid_schemes.add(fund_ref)
-        for scheme, identifier in record.get("external_ids", {}).items():
-            scheme = scheme.lower()
-            if scheme in valid_schemes:
-                value = identifier.get("preferred") or identifier.get("all")[0]
-                if scheme == fund_ref:
-                    value = f"{funder_fundref_doi_prefix}/{value}"
-                    scheme = "doi"
-                funder["identifiers"].append(
-                    {
-                        "identifier": value,
-                        "scheme": scheme,
-                    }
-                )
-        stream_entry.entry = funder
-        return stream_entry
-VOCABULARIES_DATASTREAM_TRANSFORMERS = {
-    "ror-funder": RORTransformer,
-}
-"""ROR Data Streams transformers."""
 VOCABULARIES_DATASTREAM_WRITERS = {
     "funders-service": FundersServiceWriter,
 }
@@ -96,13 +41,19 @@ DATASTREAM_CONFIG = {
         {
             "type": "zip",
             "args": {
-                "regex": "(?<!_schema_v2)\\.json$",
+                "regex": "_schema_v2\\.json$",
             },
         },
         {"type": "json"},
     ],
     "transformers": [
-        {"type": "ror-funder"},
+        {
+            "type": "ror",
+            "args": {
+                "vocab_schemes": funder_schemes,
+                "funder_fundref_doi_prefix": funder_fundref_doi_prefix,
+            },
+        },
     ],
     "writers": [
         {

invenio_vocabularies/contrib/funders/jsonschemas/funders/funder-v1.0.0.json CHANGED Viewed

@@ -7,9 +7,20 @@
     "$schema": {
       "$ref": "local://definitions-v1.0.0.json#/$schema"
     },
+    "tags": {
+      "$ref": "local://vocabularies/definitions-v1.0.0.json#/tags"
+    },
     "country": {
       "type": "string",
-      "description": "Represents a funder's origin country."
+      "description": "Represents a funder's origin country as a country code."
+    },
+    "country_name": {
+      "type": "string",
+      "description": "Represents a funder's origin country as a full name."
+    },
+    "location_name": {
+      "type": "string",
+      "description": "Represents a funder's location name (usually a city)."
     },
     "identifiers": {
       "description": "Alternate identifiers for the record.",
@@ -23,6 +34,30 @@
       "type": "string",
       "description": "Funders name."
     },
+    "acronym": {
+      "type": "string",
+      "description": "Acronym for funders name."
+    },
+    "status": {
+        "type": "string",
+        "description": "Status of the funder."
+    },
+    "aliases": {
+      "description": "Alternate names for the funder.",
+      "type": "array",
+      "items": {
+        "type": "string"
+      },
+      "uniqueItems": true
+    },
+    "types": {
+      "description": "Types of funders.",
+      "type": "array",
+      "items": {
+        "type": "string"
+      },
+      "uniqueItems": true
+    },
     "title": {
       "$ref": "local://vocabularies/definitions-v1.0.0.json#/title"
     }

invenio_vocabularies/contrib/funders/mappings/os-v1/funders/funder-v1.0.0.json CHANGED Viewed

@@ -57,12 +57,33 @@
       "country": {
         "type": "text"
       },
+      "country_name": {
+        "type": "text"
+      },
+      "location_name": {
+        "type": "text"
+      },
+      "acronym": {
+        "type": "text"
+      },
+      "status": {
+        "type": "keyword"
+      },
+      "aliases": {
+        "type": "text"
+      },
+      "types": {
+        "type": "keyword"
+      },
       "id": {
         "type": "keyword"
       },
       "title": {
         "type": "object",
         "dynamic": "true"
+      },
+      "tags": {
+        "type": "keyword"
       }
     }
   }

invenio_vocabularies/contrib/funders/mappings/os-v2/funders/funder-v1.0.0.json CHANGED Viewed

@@ -57,12 +57,33 @@
       "country": {
         "type": "text"
       },
+      "country_name": {
+        "type": "text"
+      },
+      "location_name": {
+        "type": "text"
+      },
+      "acronym": {
+        "type": "text"
+      },
+      "status": {
+        "type": "keyword"
+      },
+      "aliases": {
+        "type": "text"
+      },
+      "types": {
+        "type": "keyword"
+      },
       "id": {
         "type": "keyword"
       },
       "title": {
         "type": "object",
         "dynamic": "true"
+      },
+      "tags": {
+        "type": "keyword"
       }
     }
   }

invenio_vocabularies/contrib/funders/mappings/v7/funders/funder-v1.0.0.json CHANGED Viewed

@@ -57,12 +57,33 @@
       "country": {
         "type": "text"
       },
+      "country_name": {
+        "type": "text"
+      },
+      "location_name": {
+        "type": "text"
+      },
+      "acronym": {
+        "type": "text"
+      },
+      "status": {
+        "type": "keyword"
+      },
+      "aliases": {
+        "type": "text"
+      },
+      "types": {
+        "type": "keyword"
+      },
       "id": {
         "type": "keyword"
       },
       "title": {
         "type": "object",
         "dynamic": "true"
+      },
+      "tags": {
+        "type": "keyword"
       }
     }
   }

invenio_vocabularies/contrib/funders/schema.py CHANGED Viewed

@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # Copyright (C) 2021-2022 CERN.
+# Copyright (C) 2024 California Institute of Technology.
 #
 # Invenio-Vocabularies is free software; you can redistribute it and/or
 # modify it under the terms of the MIT License; see LICENSE file for more
@@ -43,6 +44,8 @@ class FunderSchema(BaseVocabularySchema):
         required=True, validate=validate.Length(min=1, error=_("Name cannot be blank."))
     )
     country = SanitizedUnicode()
+    country_name = SanitizedUnicode()
+    location_name = SanitizedUnicode()
     identifiers = IdentifierSet(
         fields.Nested(
             partial(
@@ -57,6 +60,11 @@ class FunderSchema(BaseVocabularySchema):
         validate=validate.Length(min=1, error=_("PID cannot be blank."))
     )
+    acronym = SanitizedUnicode()
+    aliases = fields.List(SanitizedUnicode())
+    status = SanitizedUnicode()
+    types = fields.List(SanitizedUnicode())
     @validates_schema
     def validate_id(self, data, **kwargs):
         """Validates ID."""

invenio_vocabularies/contrib/funders/serializer.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2022-2024 CERN.
 #
 # Invenio-Vocabularies is free software; you can redistribute it and/or
 # modify it under the terms of the MIT License; see LICENSE file for more
@@ -29,4 +29,5 @@ class FunderL10NItemSchema(Schema):
     props = fields.Dict(dump_only=True)
     name = fields.String(dump_only=True)
     country = fields.String(dump_only=True)
+    country_name = fields.String(dump_only=True)
     identifiers = fields.List(fields.Nested(IdentifierSchema), dump_only=True)

invenio-vocabularies 3.4.2__py2.py3-none-any.whl → 4.1.1__py2.py3-none-any.whl

Potentially problematic release.

invenio-vocabularies 3.4.2__py2.py3-none-any.whl → 4.1.1__py2.py3-none-any.whl