PyPI - invenio-vocabularies - Versions diffs - 2.3.1__py2.py3-none-any.whl → 6.3.1__py2.py3-none-any.whl - Mend

invenio-vocabularies 2.3.1__py2.py3-none-any.whl → 6.3.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of invenio-vocabularies might be problematic. Click here for more details.

Files changed (165) hide show

invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json CHANGED Viewed

@@ -47,17 +47,66 @@
       },
       "title": {
         "type": "object",
-        "dynamic": true
+        "dynamic": "true"
+      },
+      "tags": {
+        "type": "keyword"
       },
       "number": {
         "type": "keyword"
       },
       "acronym": {
-        "type": "keyword"
+        "type": "keyword",
+        "fields": {
+          "text": { "type": "text" }
+        }
       },
       "program": {
         "type": "keyword"
       },
+      "subjects": {
+        "properties": {
+          "@v": {
+            "type": "keyword"
+          },
+          "id": {
+            "type": "keyword"
+          },
+          "props": {
+            "type": "object",
+            "dynamic": "true"
+          },
+          "subject": {
+            "type": "keyword"
+          },
+          "scheme": {
+            "type": "keyword"
+          },
+          "identifiers": {
+            "properties": {
+              "identifier": {
+                "type": "keyword"
+              },
+              "scheme": {
+                "type": "keyword"
+              }
+            }
+          }
+        }
+      },
+      "organizations": {
+        "properties": {
+          "scheme": {
+            "type": "keyword"
+          },
+          "id": {
+            "type": "keyword"
+          },
+          "organization": {
+            "type": "keyword"
+          }
+        }
+      },
       "funder": {
         "type": "object",
         "properties": {

invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json CHANGED Viewed

@@ -47,17 +47,66 @@
       },
       "title": {
         "type": "object",
-        "dynamic": true
+        "dynamic": "true"
+      },
+      "tags": {
+        "type": "keyword"
       },
       "number": {
         "type": "keyword"
       },
       "acronym": {
-        "type": "keyword"
+        "type": "keyword",
+        "fields": {
+          "text": { "type": "text" }
+        }
       },
       "program": {
         "type": "keyword"
       },
+      "subjects": {
+        "properties": {
+          "@v": {
+            "type": "keyword"
+          },
+          "id": {
+            "type": "keyword"
+          },
+          "props": {
+            "type": "object",
+            "dynamic": "true"
+          },
+          "subject": {
+            "type": "keyword"
+          },
+          "scheme": {
+            "type": "keyword"
+          },
+          "identifiers": {
+            "properties": {
+              "identifier": {
+                "type": "keyword"
+              },
+              "scheme": {
+                "type": "keyword"
+              }
+            }
+          }
+        }
+      },
+      "organizations": {
+        "properties": {
+          "scheme": {
+            "type": "keyword"
+          },
+          "id": {
+            "type": "keyword"
+          },
+          "organization": {
+            "type": "keyword"
+          }
+        }
+      },
       "funder": {
         "type": "object",
         "properties": {

invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json CHANGED Viewed

@@ -47,17 +47,66 @@
       },
       "title": {
         "type": "object",
-        "dynamic": true
+        "dynamic": "true"
+      },
+      "tags": {
+        "type": "keyword"
       },
       "number": {
         "type": "keyword"
       },
       "acronym": {
-        "type": "keyword"
+        "type": "keyword",
+        "fields": {
+          "text": { "type": "text" }
+        }
       },
       "program": {
         "type": "keyword"
       },
+      "subjects": {
+        "properties": {
+          "@v": {
+            "type": "keyword"
+          },
+          "id": {
+            "type": "keyword"
+          },
+          "props": {
+            "type": "object",
+            "dynamic": "true"
+          },
+          "subject": {
+            "type": "keyword"
+          },
+          "scheme": {
+            "type": "keyword"
+          },
+          "identifiers": {
+            "properties": {
+              "identifier": {
+                "type": "keyword"
+              },
+              "scheme": {
+                "type": "keyword"
+              }
+            }
+          }
+        }
+      },
+      "organizations": {
+        "properties": {
+          "scheme": {
+            "type": "keyword"
+          },
+          "id": {
+            "type": "keyword"
+          },
+          "organization": {
+            "type": "keyword"
+          }
+        }
+      },
       "funder": {
         "type": "object",
         "properties": {

invenio_vocabularies/contrib/awards/schema.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2021-2022 CERN.
+# Copyright (C) 2021-2024 CERN.
 #
 # Invenio-Vocabularies is free software; you can redistribute it and/or
 # modify it under the terms of the MIT License; see LICENSE file for more
@@ -17,13 +17,24 @@ from marshmallow_utils.schemas import IdentifierSchema
 from ...services.schema import (
     BaseVocabularySchema,
+    ContribVocabularyRelationSchema,
     ModePIDFieldVocabularyMixin,
     i18n_strings,
 )
 from ..funders.schema import FunderRelationSchema
+from ..subjects.schema import SubjectRelationSchema
 from .config import award_schemes
+class AwardOrganizationRelationSchema(ContribVocabularyRelationSchema):
+    """Schema to define an organization relation in an award."""
+    ftf_name = "organization"
+    parent_field_name = "organizations"
+    organization = SanitizedUnicode()
+    scheme = SanitizedUnicode()
 class AwardSchema(BaseVocabularySchema, ModePIDFieldVocabularyMixin):
     """Award schema."""
@@ -46,6 +57,10 @@ class AwardSchema(BaseVocabularySchema, ModePIDFieldVocabularyMixin):
     program = SanitizedUnicode()
+    subjects = fields.List(fields.Nested(SubjectRelationSchema))
+    organizations = fields.List(fields.Nested(AwardOrganizationRelationSchema))
     id = SanitizedUnicode(
         validate=validate.Length(min=1, error=_("PID cannot be blank."))
     )

invenio_vocabularies/contrib/awards/serializer.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2022-2024 CERN.
 #
 # Invenio-Vocabularies is free software; you can redistribute it and/or
 # modify it under the terms of the MIT License; see LICENSE file for more
@@ -12,6 +12,9 @@ from marshmallow import Schema, fields
 from invenio_vocabularies.resources import L10NString
+from ..subjects.schema import SubjectRelationSchema
+from .schema import AwardOrganizationRelationSchema
 class IdentifierSchema(Schema):
     """Identifier scheme."""
@@ -37,4 +40,8 @@ class AwardL10NItemSchema(Schema):
     acronym = fields.String(dump_only=True)
     program = fields.String(dump_only=True)
     funder = fields.Nested(FunderRelationSchema, dump_only=True)
+    subjects = fields.List(fields.Nested(SubjectRelationSchema), dump_only=True)
     identifiers = fields.List(fields.Nested(IdentifierSchema), dump_only=True)
+    organizations = fields.List(
+        fields.Nested(AwardOrganizationRelationSchema), dump_only=True
+    )

invenio_vocabularies/contrib/awards/services.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2022-2024 CERN.
 #
 # Invenio-Vocabularies is free software; you can redistribute it and/or
 # modify it under the terms of the MIT License; see LICENSE file for more
@@ -8,7 +8,6 @@
 """Vocabulary awards."""
 from .awards import record_type
 AwardsServiceConfig = record_type.service_config_cls

invenio_vocabularies/contrib/common/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+"""Vocabularies common module."""

invenio_vocabularies/contrib/common/openaire/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+"""OpenAIRE-related module."""

invenio_vocabularies/contrib/common/openaire/datastreams.py ADDED Viewed

@@ -0,0 +1,84 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+"""OpenAIRE-related Datastreams Readers/Writers/Transformers module."""
+import io
+import requests
+from invenio_vocabularies.datastreams.errors import ReaderError
+from invenio_vocabularies.datastreams.readers import BaseReader
+class OpenAIREHTTPReader(BaseReader):
+    """OpenAIRE HTTP Reader returning an in-memory binary stream of the latest OpenAIRE Graph Dataset tar file of a given type."""
+    def __init__(self, origin=None, mode="r", tar_href=None, *args, **kwargs):
+        """Constructor."""
+        self.tar_href = tar_href
+        super().__init__(origin, mode, *args, **kwargs)
+    def _iter(self, fp, *args, **kwargs):
+        raise NotImplementedError(
+            "OpenAIREHTTPReader downloads one file and therefore does not iterate through items"
+        )
+    def read(self, item=None, *args, **kwargs):
+        """Reads the latest OpenAIRE Graph Dataset tar file of a given type from Zenodo and yields an in-memory binary stream of it."""
+        if item:
+            raise NotImplementedError(
+                "OpenAIREHTTPReader does not support being chained after another reader"
+            )
+        if self._origin == "full":
+            # OpenAIRE Graph Dataset
+            api_url = "https://zenodo.org/api/records/3516917"
+        elif self._origin == "diff":
+            # OpenAIRE Graph dataset: new collected projects
+            api_url = "https://zenodo.org/api/records/6419021"
+        else:
+            raise ReaderError("The --origin option should be either 'full' or 'diff'")
+        # Call the signposting `linkset+json` endpoint for the Concept DOI (i.e. latest version) of the OpenAIRE Graph Dataset.
+        # See: https://github.com/inveniosoftware/rfcs/blob/master/rfcs/rdm-0071-signposting.md#provide-an-applicationlinksetjson-endpoint
+        headers = {"Accept": "application/linkset+json"}
+        api_resp = requests.get(api_url, headers=headers)
+        api_resp.raise_for_status()
+        # Extract the Landing page Link Set Object located as the first (index 0) item.
+        landing_page_linkset = api_resp.json()["linkset"][0]
+        # Extract the URL of the only tar file matching `tar_href` linked to the record.
+        landing_page_matching_tar_items = [
+            item
+            for item in landing_page_linkset["item"]
+            if item["type"] == "application/x-tar"
+            and item["href"].endswith(self.tar_href)
+        ]
+        if len(landing_page_matching_tar_items) != 1:
+            raise ReaderError(
+                f"Expected 1 tar item matching {self.tar_href} but got {len(landing_page_matching_tar_items)}"
+            )
+        file_url = landing_page_matching_tar_items[0]["href"]
+        # Download the matching tar file and fully load the response bytes content in memory.
+        # The bytes content are then wrapped by a BytesIO to be file-like object (as required by `tarfile.open`).
+        # Using directly `file_resp.raw` is not possible since `tarfile.open` requires the file-like object to be seekable.
+        file_resp = requests.get(file_url)
+        file_resp.raise_for_status()
+        yield io.BytesIO(file_resp.content)
+VOCABULARIES_DATASTREAM_READERS = {
+    "openaire-http": OpenAIREHTTPReader,
+}
+VOCABULARIES_DATASTREAM_TRANSFORMERS = {}
+VOCABULARIES_DATASTREAM_WRITERS = {}

invenio_vocabularies/contrib/common/ror/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+"""ROR-related module."""

invenio_vocabularies/contrib/common/ror/datastreams.py ADDED Viewed

@@ -0,0 +1,220 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+# Copyright (C) 2024 California Institute of Technology.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+"""ROR-related Datastreams Readers/Writers/Transformers module."""
+import io
+import arrow
+import requests
+from idutils import normalize_ror
+from invenio_vocabularies.datastreams.errors import ReaderError, TransformerError
+from invenio_vocabularies.datastreams.readers import BaseReader
+from invenio_vocabularies.datastreams.transformers import BaseTransformer
+class RORHTTPReader(BaseReader):
+    """ROR HTTP Reader.
+    Returning an in-memory
+    binary stream of the latest ROR data dump ZIP file.
+    """
+    def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
+        """Constructor."""
+        self._since = since
+        super().__init__(origin, mode, *args, **kwargs)
+    def _iter(self, fp, *args, **kwargs):
+        raise NotImplementedError(
+            "RORHTTPReader downloads one file "
+            "and therefore does not iterate through items"
+        )
+    def _get_last_dump_date(self, linksets):
+        """Get the last dump date."""
+        for linkset in linksets:
+            metadata_formats = linkset.get("describedby", [])
+            for format_link in metadata_formats:
+                if format_link.get("type") == "application/ld+json":
+                    json_ld_reponse = requests.get(
+                        format_link["href"],
+                        headers={"Accept": format_link["type"]},
+                    )
+                    json_ld_reponse.raise_for_status()
+                    json_ld_data = json_ld_reponse.json()
+                    last_dump_date = arrow.get(
+                        json_ld_data.get("dateCreated")
+                        or json_ld_data.get("datePublished")
+                    )
+                    return last_dump_date
+        else:
+            raise ReaderError(
+                "Couldn't find JSON-LD in publisher's linkset "
+                "to determine last dump date."
+            )
+    def read(self, item=None, *args, **kwargs):
+        """Reads the latest ROR data dump.
+        Read from ZIP file from
+        Zenodo and yields an in-memory binary stream of it.
+        """
+        if item:
+            raise NotImplementedError(
+                "RORHTTPReader does not support being chained after another reader"
+            )
+        # Follow the DOI to get the link of the linkset
+        dataset_doi_link = "https://doi.org/10.5281/zenodo.6347574"
+        landing_page = requests.get(dataset_doi_link, allow_redirects=True)
+        landing_page.raise_for_status()
+        # Call the signposting `linkset+json` endpoint for
+        # the Concept DOI (i.e. latest version) of the ROR data dump.
+        # See: https://github.com/inveniosoftware/rfcs/blob/master/rfcs/rdm-0071-signposting.md#provide-an-applicationlinksetjson-endpoint
+        if "linkset" not in landing_page.links:
+            raise ReaderError("Linkset not found in the ROR dataset record.")
+        linkset_response = requests.get(
+            landing_page.links["linkset"]["url"],
+            headers={"Accept": "application/linkset+json"},
+        )
+        linkset_response.raise_for_status()
+        linksets = linkset_response.json()["linkset"]
+        if self._since:
+            last_dump_date = self._get_last_dump_date(linksets)
+            if last_dump_date < arrow.get(self._since):
+                return
+        for linkset in linksets:
+            items = linkset.get("item", [])
+            zip_files = [item for item in items if item["type"] == "application/zip"]
+            if len(zip_files) == 1:
+                file_url = zip_files[0]["href"]
+                break
+            if len(zip_files) > 1:
+                raise ReaderError(f"Expected 1 ZIP item but got {len(zip_files)}")
+        # Download the ZIP file and fully load the response bytes content in memory.
+        # The bytes content are then wrapped by a BytesIO to be
+        # file-like object (as required by `zipfile.ZipFile`).
+        # Using directly `file_resp.raw` is not possible since
+        # `zipfile.ZipFile` requires the file-like object to be seekable.
+        file_resp = requests.get(file_url)
+        file_resp.raise_for_status()
+        yield io.BytesIO(file_resp.content)
+VOCABULARIES_DATASTREAM_READERS = {
+    "ror-http": RORHTTPReader,
+}
+class RORTransformer(BaseTransformer):
+    """Transforms a JSON ROR record into a funders record."""
+    def __init__(
+        self, *args, vocab_schemes=None, funder_fundref_doi_prefix=None, **kwargs
+    ):
+        """Initializes the transformer."""
+        self.vocab_schemes = vocab_schemes
+        self.funder_fundref_doi_prefix = funder_fundref_doi_prefix
+        super().__init__(*args, **kwargs)
+    def apply(self, stream_entry, **kwargs):
+        """Applies the transformation to the stream entry."""
+        record = stream_entry.entry
+        ror = {}
+        ror["title"] = {}
+        ror["id"] = normalize_ror(record.get("id"))
+        if not ror["id"]:
+            raise TransformerError(_("Id not found in ROR entry."))
+        # Using set so aliases are unique
+        aliases = set()
+        acronym = None
+        for name in record.get("names"):
+            lang = name.get("lang", "en")
+            if lang == None:
+                lang = "en"
+            if "ror_display" in name["types"]:
+                ror["name"] = name["value"]
+            if "label" in name["types"]:
+                ror["title"][lang] = name["value"]
+            if "alias" in name["types"]:
+                aliases.add(name["value"])
+            if "acronym" in name["types"]:
+                # The first acronyn goes in acronym field to maintain
+                # compatability with existing data structure
+                if not acronym:
+                    acronym = name["value"]
+                else:
+                    aliases.add(name["value"])
+        if "en" not in ror["title"]:
+            ror["title"]["en"] = ror["name"]
+        if acronym:
+            ror["acronym"] = acronym
+        if aliases:
+            ror["aliases"] = list(aliases)
+        # ror_display is required and should be in every entry
+        if not ror["name"]:
+            raise TransformerError(
+                _("Name with type ror_display not found in ROR entry.")
+            )
+        # This only gets the first location, to maintain compatability
+        # with existing data structure
+        location = record.get("locations", [{}])[0].get("geonames_details", {})
+        ror["country"] = location.get("country_code")
+        ror["country_name"] = location.get("country_name")
+        ror["location_name"] = location.get("name")
+        ror["types"] = record.get("types")
+        status = record.get("status")
+        ror["status"] = status
+        # The ROR is always listed in identifiers, expected by serialization
+        ror["identifiers"] = [{"identifier": ror["id"], "scheme": "ror"}]
+        if self.vocab_schemes:
+            valid_schemes = set(self.vocab_schemes.keys())
+        else:
+            valid_schemes = set()
+        fund_ref = "fundref"
+        if self.funder_fundref_doi_prefix:
+            valid_schemes.add(fund_ref)
+        for identifier in record.get("external_ids"):
+            scheme = identifier["type"]
+            if scheme in valid_schemes:
+                value = identifier.get("preferred") or identifier.get("all")[0]
+                if scheme == fund_ref:
+                    if self.funder_fundref_doi_prefix:
+                        value = f"{self.funder_fundref_doi_prefix}/{value}"
+                        scheme = "doi"
+                ror["identifiers"].append(
+                    {
+                        "identifier": value,
+                        "scheme": scheme,
+                    }
+                )
+        stream_entry.entry = ror
+        return stream_entry
+VOCABULARIES_DATASTREAM_TRANSFORMERS = {
+    "ror": RORTransformer,
+}
+VOCABULARIES_DATASTREAM_WRITERS = {}

invenio_vocabularies/contrib/funders/config.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2022-2024 CERN.
 #
 # Invenio-Vocabularies is free software; you can redistribute it and/or
 # modify it under the terms of the MIT License; see LICENSE file for more
@@ -9,6 +9,7 @@
 """Vocabulary funders configuration."""
 from flask import current_app
+from invenio_i18n import get_locale
 from invenio_i18n import lazy_gettext as _
 from invenio_records_resources.services import SearchOptions
 from invenio_records_resources.services.records.components import DataComponent
@@ -22,6 +23,7 @@ funder_schemes = LocalProxy(lambda: current_app.config["VOCABULARIES_FUNDER_SCHE
 funder_fundref_doi_prefix = LocalProxy(
     lambda: current_app.config["VOCABULARIES_FUNDER_DOI_PREFIX"]
 )
+localized_title = LocalProxy(lambda: f"title.{get_locale()}^20")
 class FundersSearchOptions(SearchOptions):
@@ -30,8 +32,15 @@ class FundersSearchOptions(SearchOptions):
     suggest_parser_cls = SuggestQueryParser.factory(
         fields=[
             "name^100",
+            "acronym.keyword^100",
+            "acronym^40",
+            localized_title,
+            "id^20",
+            "aliases^20",
             "identifiers.identifier^10",
-        ]
+        ],
+        type="most_fields",  # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html#multi-match-types
+        fuzziness="AUTO",  # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
     )
     sort_default = "bestmatch"

invenio-vocabularies 2.3.1__py2.py3-none-any.whl → 6.3.1__py2.py3-none-any.whl

Potentially problematic release.

invenio-vocabularies 2.3.1__py2.py3-none-any.whl → 6.3.1__py2.py3-none-any.whl