PyPI - invenio-vocabularies - Versions diffs - 6.5.0__py2.py3-none-any.whl → 6.7.0__py2.py3-none-any.whl - Mend

invenio-vocabularies 6.5.0__py2.py3-none-any.whl → 6.7.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of invenio-vocabularies might be problematic. Click here for more details.

Files changed (46) hide show

invenio_vocabularies/contrib/names/mappings/os-v1/names/name-v2.0.0.json CHANGED Viewed

@@ -125,6 +125,17 @@
             "type": "text",
             "analyzer": "accent_edge_analyzer",
             "search_analyzer": "accent_analyzer"
+          },
+          "acronym": {
+            "type": "text",
+            "analyzer": "accent_edge_analyzer",
+            "search_analyzer": "accent_analyzer",
+            "fields": {
+              "keyword": {
+                "type": "keyword",
+                "normalizer": "accent_normalizer"
+              }
+            }
           }
         }
       },
@@ -144,6 +155,10 @@
             "type": "keyword"
           }
         }
+      },
+      "props": {
+        "type": "object",
+        "dynamic": "true"
       }
     }
   }

invenio_vocabularies/contrib/names/mappings/os-v2/names/name-v2.0.0.json CHANGED Viewed

@@ -125,6 +125,17 @@
             "type": "text",
             "analyzer": "accent_edge_analyzer",
             "search_analyzer": "accent_analyzer"
+          },
+          "acronym": {
+            "type": "text",
+            "analyzer": "accent_edge_analyzer",
+            "search_analyzer": "accent_analyzer",
+            "fields": {
+              "keyword": {
+                "type": "keyword",
+                "normalizer": "accent_normalizer"
+              }
+            }
           }
         }
       },
@@ -144,6 +155,10 @@
             "type": "keyword"
           }
         }
+      },
+      "props": {
+        "type": "object",
+        "dynamic": "true"
       }
     }
   }

invenio_vocabularies/contrib/names/names.py CHANGED Viewed

@@ -30,7 +30,7 @@ from .schema import NameSchema
 name_relations = RelationsField(
     affiliations=PIDListRelation(
         "affiliations",
-        keys=["name"],
+        keys=["name", "acronym"],
         pid_field=Affiliation.pid,
         cache_key="affiliations",
     )

invenio_vocabularies/contrib/names/permissions.py CHANGED Viewed

@@ -10,11 +10,21 @@
 from invenio_records_permissions.generators import AuthenticatedUser, SystemProcess
-from ...services.permissions import PermissionPolicy
+from invenio_vocabularies.services.generators import IfTags
+from invenio_vocabularies.services.permissions import PermissionPolicy
 class NamesPermissionPolicy(PermissionPolicy):
-    """Permission policy."""
+    """Names permission policy.
-    can_search = [SystemProcess(), AuthenticatedUser()]
-    can_read = [SystemProcess(), AuthenticatedUser()]
+    Names endpoints are protected, only authenticated users can access them.
+    """
+    can_search = [
+        SystemProcess(),
+        AuthenticatedUser(),
+    ]
+    can_read = [
+        SystemProcess(),
+        IfTags(["unlisted"], then_=[SystemProcess()], else_=[AuthenticatedUser()]),
+    ]

invenio_vocabularies/contrib/names/schema.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2021 CERN.
+# Copyright (C) 2021-2024 CERN.
 #
 # Invenio-Vocabularies is free software; you can redistribute it and/or
 # modify it under the terms of the MIT License; see LICENSE file for more
@@ -16,10 +16,18 @@ from marshmallow_utils.fields import IdentifierSet, SanitizedUnicode
 from marshmallow_utils.schemas import IdentifierSchema
 from ...services.schema import BaseVocabularySchema, ModePIDFieldVocabularyMixin
-from ..affiliations.schema import AffiliationRelationSchema
+from ..affiliations.schema import (
+    AffiliationRelationSchema as BaseAffiliationRelationSchema,
+)
 from .config import names_schemes
+class AffiliationRelationSchema(BaseAffiliationRelationSchema):
+    """Affiliation relation schema."""
+    acronym = SanitizedUnicode(dump_only=True)
 class NameSchema(BaseVocabularySchema, ModePIDFieldVocabularyMixin):
     """Service schema for names.
@@ -42,6 +50,7 @@ class NameSchema(BaseVocabularySchema, ModePIDFieldVocabularyMixin):
         )
     )
     affiliations = fields.List(fields.Nested(AffiliationRelationSchema))
+    props = fields.Dict(keys=fields.Str(), values=fields.Raw())
     @validates_schema
     def validate_names(self, data, **kwargs):

invenio_vocabularies/contrib/names/services.py CHANGED Viewed

@@ -19,11 +19,12 @@ NamesServiceConfig = record_type.service_config_cls
 class NamesService(record_type.service_cls):
     """Name service."""
-    def resolve(self, identity, id_, id_type):
+    def resolve(self, identity, id_, id_type, many=False):
         """Get the record with a given identifier.
-        This method assumes that the are no duplicates in the system
-        (i.e. only one name record can have a pair of identifier:scheme).
+        param id_: The identifier value.
+        param id_type: The identifier type.
+        param many: If True, return a list of records.
         """
         search_query = dsl.Q(
             "bool",
@@ -36,20 +37,28 @@ class NamesService(record_type.service_cls):
         # max_records = 1, we assume there cannot be duplicates
         # the loading process needs to make sure of that
-        results = self._read_many(identity, search_query, max_records=1)
+        if many:
+            results = self._read_many(identity, search_query)
+        else:
+            results = self._read_many(identity, search_query, max_records=1)
         # cant use the results_item because it returns dicts intead of records
         total = results.hits.total["value"]
         if total == 0:
             # Not a PID but trated as such
             raise PIDDoesNotExistError(pid_type=id_type, pid_value=id_)
+        if many:
+            for result in results:
+                record = self.record_cls.loads(result.to_dict())
+                self.require_permission(identity, "read", record=record)
+            return self.result_list(self, identity, results)
+        else:
+            record = self.record_cls.loads(results[0].to_dict())
+            self.require_permission(identity, "read", record=record)
-        # (0 < #hits <= max_records) = 1
-        record = self.record_cls.loads(results[0].to_dict())
-        self.require_permission(identity, "read", record=record)
-        return self.result_item(
-            self,
-            identity,
-            record,
-            links_tpl=self.links_item_tpl,
-        )
+            return self.result_item(
+                self,
+                identity,
+                record,
+                links_tpl=self.links_item_tpl,
+            )

invenio_vocabularies/contrib/subjects/config.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2021 CERN.
+# Copyright (C) 2021-2024 CERN.
 # Copyright (C) 2021 Northwestern University.
 # Copyright (C) 2024 University of Münster.
 #
@@ -11,6 +11,7 @@
 """Subjects configuration."""
 from flask import current_app
+from invenio_i18n import get_locale
 from invenio_i18n import lazy_gettext as _
 from invenio_records_resources.services import SearchOptions
 from invenio_records_resources.services.records.components import DataComponent
@@ -22,6 +23,16 @@ from ...services.querystr import FilteredSuggestQueryParser
 subject_schemes = LocalProxy(
     lambda: current_app.config["VOCABULARIES_SUBJECTS_SCHEMES"]
 )
+localized_title = LocalProxy(lambda: f"title.{get_locale()}^20")
+gemet_file_url = LocalProxy(
+    lambda: current_app.config["VOCABULARIES_SUBJECTS_GEMET_FILE_URL"]
+)
+euroscivoc_file_url = LocalProxy(
+    lambda: current_app.config["VOCABULARIES_SUBJECTS_EUROSCIVOC_FILE_URL"]
+)
 class SubjectsSearchOptions(SearchOptions):
@@ -30,7 +41,8 @@ class SubjectsSearchOptions(SearchOptions):
     suggest_parser_cls = FilteredSuggestQueryParser.factory(
         filter_field="scheme",
         fields=[  # suggest fields
-            "title.*^100",
+            "subject^100",
+            localized_title,
             "synonyms^20",
         ],
     )

invenio_vocabularies/contrib/subjects/datastreams.py CHANGED Viewed

@@ -13,6 +13,7 @@ from invenio_i18n import lazy_gettext as _
 from ...datastreams.writers import ServiceWriter
 from .euroscivoc import datastreams as euroscivoc_datastreams
+from .gemet import datastreams as gemet_datastreams
 from .mesh import datastreams as mesh_datastreams
@@ -32,12 +33,14 @@ class SubjectsServiceWriter(ServiceWriter):
 VOCABULARIES_DATASTREAM_READERS = {
     **mesh_datastreams.VOCABULARIES_DATASTREAM_READERS,
     **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_READERS,
+    **gemet_datastreams.VOCABULARIES_DATASTREAM_READERS,
 }
 """Subjects Data Streams readers."""
 VOCABULARIES_DATASTREAM_TRANSFORMERS = {
     **mesh_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
     **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
+    **gemet_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
 }
 """Subjects Data Streams transformers."""
@@ -45,6 +48,7 @@ VOCABULARIES_DATASTREAM_WRITERS = {
     "subjects-service": SubjectsServiceWriter,
     **mesh_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
     **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
+    **gemet_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
 }
 """Subjects Data Streams writers."""

invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022-2024 CERN.
+# Copyright (C) 2024 CERN.
 #
 # Invenio-Vocabularies is free software; you can redistribute it and/or
 # modify it under the terms of the MIT License; see LICENSE file for more
@@ -8,120 +8,36 @@
 """EuroSciVoc subjects datastreams, readers, transformers, and writers."""
-import io
-from collections import namedtuple
+from invenio_vocabularies.datastreams.transformers import RDFTransformer
-import requests
-from rdflib import OWL, RDF, Graph, Namespace
+from ..config import euroscivoc_file_url
-from invenio_vocabularies.config import SUBJECTS_EUROSCIVOC_FILE_URL
-from invenio_vocabularies.datastreams.readers import BaseReader
-from invenio_vocabularies.datastreams.transformers import BaseTransformer
-class EuroSciVocSubjectsHTTPReader(BaseReader):
-    """Reader class to fetch and process EuroSciVoc RDF data."""
-    def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
-        """Initialize the reader with the data source.
-        :param origin: The URL from which to fetch the RDF data.
-        :param mode: Mode of operation (default is 'r' for reading).
-        """
-        self.origin = origin or SUBJECTS_EUROSCIVOC_FILE_URL
-        super().__init__(origin=origin, mode=mode, *args, **kwargs)
-    def _iter(self, rdf_graph):
-        """Iterate over the RDF graph, yielding one subject at a time.
-        :param rdf_graph: The RDF graph to process.
-        :yield: Subject and graph to be transformed.
-        """
-        SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
-        for subject, _, _ in rdf_graph.triples((None, RDF.type, SKOS_CORE.Concept)):
-            yield {"subject": subject, "rdf_graph": rdf_graph}
-    def read(self, item=None, *args, **kwargs):
-        """Fetch and process the EuroSciVoc RDF data, yielding it one subject at a time.
-        :param item: The RDF data provided as bytes (optional).
-        :yield: Processed EuroSciVoc subject data.
-        """
-        if item:
-            raise NotImplementedError(
-                "EuroSciVocSubjectsHTTPReader does not support being chained after another reader"
-            )
-        # Fetch the RDF data from the specified origin URL
-        response = requests.get(self.origin)
-        response.raise_for_status()
-        # Treat the response content as a file-like object
-        rdf_data = io.BytesIO(response.content)
-        # Parse the RDF data into a graph
-        rdf_graph = Graph()
-        rdf_graph.parse(rdf_data, format="xml")
-        # Yield each processed subject from the RDF graph
-        yield from self._iter(rdf_graph)
-class EuroSciVocSubjectsTransformer(BaseTransformer):
+class EuroSciVocSubjectsTransformer(RDFTransformer):
     """Transformer class to convert EuroSciVoc RDF data to a dictionary format."""
-    SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
-    SPLITCHAR = ","
     def _get_notation(self, subject, rdf_graph):
         """Extract the numeric notation for a subject."""
         for _, _, notation in rdf_graph.triples(
-            (subject, self.SKOS_CORE.notation, None)
+            (subject, self.skos_core.notation, None)
         ):
             if str(notation).isdigit():
                 return str(notation)
         return None
-    def _get_labels(self, subject, rdf_graph):
-        """Extract prefLabel and altLabel languages for a subject."""
-        labels = {
-            label.language: label.value.capitalize()
-            for _, _, label in rdf_graph.triples(
-                (subject, self.SKOS_CORE.prefLabel, None)
-            )
-        }
-        if "en" not in labels:
-            for _, _, label in rdf_graph.triples(
-                (subject, self.SKOS_CORE.altLabel, None)
-            ):
-                labels.setdefault(label.language, label.value.capitalize())
-        return labels
-    def _find_parents(self, subject, rdf_graph):
-        """Find parent notations."""
-        parents = []
-        # Traverse the broader hierarchy
-        for broader in rdf_graph.transitive_objects(subject, self.SKOS_CORE.broader):
-            if broader != subject:  # Ensure we don't include the current subject
-                parent_notation = self._get_notation(broader, rdf_graph)
-                if parent_notation:
-                    parents.append(parent_notation)
-        return parents
+    def _get_parent_notation(self, broader, rdf_graph):
+        """Extract parent notation using numeric notation."""
+        return self._get_notation(broader, rdf_graph)
     def _transform_entry(self, subject, rdf_graph):
-        """Transform an entry to the required dictionary format."""
-        # Get subject notation with euroscivoc prefix
         notation = self._get_notation(subject, rdf_graph)
         id = f"euroscivoc:{notation}" if notation else None
-        # Get labels for the current subject
         labels = self._get_labels(subject, rdf_graph)
-        # Join parent notations with SPLITCHAR separator and add euroscivoc prefix
-        parents = self.SPLITCHAR.join(
-            f"euroscivoc:{n}" for n in reversed(self._find_parents(subject, rdf_graph))
+        parents = ",".join(
+            f"euroscivoc:{n}"
+            for n in reversed(self._find_parents(subject, rdf_graph))
+            if n
         )
-        # Create identifiers list
         identifiers = [{"scheme": "url", "identifier": str(subject)}]
         return {
@@ -133,23 +49,9 @@ class EuroSciVocSubjectsTransformer(BaseTransformer):
             "identifiers": identifiers,
         }
-    def apply(self, stream_entry, *args, **kwargs):
-        """Transform a stream entry to the required dictionary format.
-        :param stream_entry: The entry to be transformed, which includes the subject and the RDF graph.
-        :return: The transformed stream entry.
-        """
-        # Apply transformations
-        entry_data = self._transform_entry(
-            stream_entry.entry["subject"], stream_entry.entry["rdf_graph"]
-        )
-        stream_entry.entry = entry_data
-        return stream_entry
-# Configuration for datastream readers, transformers, and writers
-VOCABULARIES_DATASTREAM_READERS = {"euroscivoc-reader": EuroSciVocSubjectsHTTPReader}
+# Configuration for datastream transformers, and writers
+VOCABULARIES_DATASTREAM_READERS = {}
 VOCABULARIES_DATASTREAM_WRITERS = {}
 VOCABULARIES_DATASTREAM_TRANSFORMERS = {
@@ -159,8 +61,14 @@ VOCABULARIES_DATASTREAM_TRANSFORMERS = {
 DATASTREAM_CONFIG = {
     "readers": [
         {
-            "type": "euroscivoc-reader",
-        }
+            "type": "http",
+            "args": {
+                "origin": euroscivoc_file_url,
+            },
+        },
+        {
+            "type": "rdf",
+        },
     ],
     "transformers": [{"type": "euroscivoc-transformer"}],
     "writers": [

invenio_vocabularies/contrib/subjects/gemet/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+"""GEMET Subjects module."""

invenio_vocabularies/contrib/subjects/gemet/datastreams.py ADDED Viewed

@@ -0,0 +1,109 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+"""GEMET subjects datastreams, readers, transformers, and writers."""
+from invenio_vocabularies.datastreams.transformers import RDFTransformer
+from ..config import gemet_file_url
+# Available with the "rdf" extra
+try:
+    import rdflib
+except ImportError:
+    rdflib = None
+class GEMETSubjectsTransformer(RDFTransformer):
+    """Transformer class to convert GEMET RDF data to a dictionary format."""
+    def _get_parent_notation(self, broader, rdf_graph):
+        """Extract parent notation from GEMET URI."""
+        return "/".join(broader.split("/")[-2:])
+    def _get_groups_and_themes(self, subject, rdf_graph):
+        """Extract groups and themes for a subject."""
+        groups = []
+        themes = []
+        for relation in rdf_graph.subjects(
+            predicate=self.skos_core.member, object=subject
+        ):
+            relation_uri = str(relation)
+            relation_label = None
+            # If the relation is a group, check for skos:prefLabel
+            if "group" in relation_uri:
+                labels = rdf_graph.objects(
+                    subject=relation, predicate=self.skos_core.prefLabel
+                )
+                relation_label = next(
+                    (str(label) for label in labels if label.language == "en"), None
+                )
+                groups.append(relation_uri)
+            # If the relation is a theme, check for rdfs:label
+            elif "theme" in relation_uri:
+                labels = rdf_graph.objects(
+                    subject=relation, predicate=rdflib.RDFS.label
+                )
+                relation_label = next(
+                    (str(label) for label in labels if label.language == "en"), None
+                )
+                themes.append(relation_uri)
+        return groups, themes
+    def _transform_entry(self, subject, rdf_graph):
+        """Transform an entry to the required dictionary format."""
+        concept_number = "/".join(subject.split("/")[-2:])
+        id = f"gemet:{concept_number}" if concept_number else None
+        labels = self._get_labels(subject, rdf_graph)
+        parents = ",".join(
+            f"gemet:{n}" for n in reversed(self._find_parents(subject, rdf_graph)) if n
+        )
+        identifiers = [{"scheme": "url", "identifier": str(subject)}]
+        groups, themes = self._get_groups_and_themes(subject, rdf_graph)
+        props = {"parents": parents} if parents else {}
+        if groups:
+            props["groups"] = groups
+        if themes:
+            props["themes"] = themes
+        return {
+            "id": id,
+            "scheme": "GEMET",
+            "subject": labels.get("en", "").capitalize(),
+            "title": labels,
+            "props": props,
+            "identifiers": identifiers,
+        }
+# Configuration for datastream transformers, and writers
+VOCABULARIES_DATASTREAM_READERS = {}
+VOCABULARIES_DATASTREAM_WRITERS = {}
+VOCABULARIES_DATASTREAM_TRANSFORMERS = {"gemet-transformer": GEMETSubjectsTransformer}
+DATASTREAM_CONFIG = {
+    "readers": [
+        {
+            "type": "http",
+            "args": {
+                "origin": gemet_file_url,
+            },
+        },
+        {"type": "gzip"},
+        {"type": "rdf"},
+    ],
+    "transformers": [{"type": "gemet-transformer"}],
+    "writers": [{"args": {"writer": {"type": "subjects-service"}}, "type": "async"}],
+}

invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json CHANGED Viewed

@@ -34,9 +34,20 @@
       "type": "object",
       "patternProperties": {
         "^.*$": {
-          "type": "string"
+          "oneOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "array",
+              "items": {
+                "type": "string"
+              }
+            }
+          ]
         }
-      }
+      },
+      "additionalProperties": false
     },
     "identifiers": {
       "description": "Alternate identifiers for the subject.",

invenio_vocabularies/contrib/subjects/mesh/datastreams.py CHANGED Viewed

@@ -22,14 +22,19 @@ class MeshSubjectsTransformer(BaseTransformer):
         """Apply transformation on steam entry."""
         entry_data = stream_entry.entry
-        # ID in MeSH data is the URL, ex. https://id.nlm.nih.gov/mesh/D000001
+        # ID in MeSH data is in the URL, ex. https://id.nlm.nih.gov/mesh/D000001
         # We just want to use the ID prefixed by "mesh:""
         try:
             mesh_id = entry_data["id"].split("/")[-1]
+            entry_data["id"] = "mesh:" + mesh_id
         except Exception:
             raise TransformerError("Not a valid MeSH ID.")
-        entry_data["id"] = "mesh:" + mesh_id
+        entry_data["title"] = title = entry_data.get("title", {})
+        # NOTE: MeSH import file comes with an English subject by default
+        if "en" not in title:
+            title["en"] = entry_data["subject"]
         return stream_entry

invenio_vocabularies/contrib/subjects/schema.py CHANGED Viewed

@@ -13,8 +13,8 @@
 from functools import partial
 from invenio_i18n import get_locale
-from marshmallow import EXCLUDE, Schema, fields, pre_load
-from marshmallow_utils.fields import IdentifierSet, SanitizedUnicode
+from marshmallow import EXCLUDE, Schema, ValidationError, fields, pre_load, validate
+from marshmallow_utils.fields import URL, IdentifierSet, SanitizedUnicode
 from marshmallow_utils.schemas import IdentifierSchema
 from ...services.schema import (
@@ -25,6 +25,21 @@ from ...services.schema import (
 from .config import subject_schemes
+class StringOrListOfStrings(fields.Field):
+    """Custom field to handle both string and list of strings."""
+    # TODO: Move this to marshmallow-utils for broader type support.
+    def _deserialize(self, value, attr, data, **kwargs):
+        if isinstance(value, str):
+            return fields.String()._deserialize(value, attr, data, **kwargs)
+        elif isinstance(value, list) and all(isinstance(item, str) for item in value):
+            return [
+                fields.String()._deserialize(item, attr, data, **kwargs)
+                for item in value
+            ]
+        raise ValidationError("Invalid value. Must be a string or a list of strings.")
 class SubjectSchema(BaseVocabularySchema):
     """Service schema for subjects."""
@@ -35,7 +50,7 @@ class SubjectSchema(BaseVocabularySchema):
     scheme = SanitizedUnicode(required=True)
     subject = SanitizedUnicode(required=True)
     title = i18n_strings
-    props = fields.Dict(keys=SanitizedUnicode(), values=SanitizedUnicode())
+    props = fields.Dict(keys=SanitizedUnicode(), values=StringOrListOfStrings())
     identifiers = IdentifierSet(
         fields.Nested(
             partial(

invenio-vocabularies 6.5.0__py2.py3-none-any.whl → 6.7.0__py2.py3-none-any.whl

Potentially problematic release.

invenio-vocabularies 6.5.0__py2.py3-none-any.whl → 6.7.0__py2.py3-none-any.whl