PyPI - invenio-vocabularies - Versions diffs - 6.5.0__py2.py3-none-any.whl → 6.7.0__py2.py3-none-any.whl - Mend

invenio-vocabularies 6.5.0-py2.py3-none-any.whl → 6.7.0-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of invenio-vocabularies might be problematic. Click here for more details.

Files changed (46)

invenio_vocabularies/datastreams/readers.py CHANGED Viewed

@@ -11,6 +11,7 @@
 import csv
 import gzip
+import io
 import json
 import re
 import tarfile
@@ -27,11 +28,25 @@ from lxml.html import parse as html_parse
 from .errors import ReaderError
 from .xml import etree_to_dict
+# Extras dependencies
+# "oaipmh"
 try:
     import oaipmh_scythe
 except ImportError:
     oaipmh_scythe = None
+# "rdf"
+try:
+    import rdflib
+except ImportError:
+    rdflib = None
+# "sparql"
+try:
+    import SPARQLWrapper as sparql
+except ImportError:
+    sparql = None
 class BaseReader(ABC):
     """Base reader."""
@@ -103,8 +118,7 @@ class SimpleHTTPReader(BaseReader):
     def __init__(self, origin, id=None, ids=None, content_type=None, *args, **kwargs):
         """Constructor."""
-        assert id or ids
-        self._ids = ids if ids else [id]
+        self._ids = ids if ids else ([id] if id else None)
         self.content_type = content_type
         super().__init__(origin, *args, **kwargs)
@@ -113,14 +127,22 @@ class SimpleHTTPReader(BaseReader):
         base_url = url
         headers = {"Accept": self.content_type}
-        for id_ in self._ids:
-            url = base_url.format(id=id_)
+        # If there are no IDs, query the base URL
+        if not self._ids:
             resp = requests.get(url, headers=headers)
-            if resp.status_code != 200:
-                # todo add logging/fail
-                pass
+            if resp.status_code == 200:
+                yield resp.content
+            else:
+                print(f"Failed to fetch URL {url}: {resp.status_code}")
+        else:
+            for id_ in self._ids:
+                url = base_url.format(id=id_)
+                resp = requests.get(url, headers=headers)
+                if resp.status_code != 200:
+                    # todo add logging/fail
+                    pass
-            yield resp.content
+                yield resp.content
     def read(self, item=None, *args, **kwargs):
         """Chooses between item and origin as url."""
@@ -197,6 +219,9 @@ class GzipReader(BaseReader):
     """Gzip reader."""
     def _iter(self, fp, *args, **kwargs):
+        if isinstance(fp, bytes):
+            fp = io.BytesIO(fp)
         with gzip.open(fp) as gp:
             yield gp
@@ -236,7 +261,7 @@ class XMLReader(BaseReader):
         try:
             xml_tree = fromstring(fp)
             xml_dict = etree_to_dict(xml_tree)
-        except Exception as e:
+        except Exception:
             xml_tree = html_parse(fp).getroot()
             xml_dict = etree_to_dict(xml_tree)["html"]["body"]
@@ -346,3 +371,68 @@ def xml_to_dict(tree: etree._Element):
     dict_obj["record"] = etree.tostring(tree)
     return dict_obj
+class RDFReader(BaseReader):
+    """Base Reader class to fetch and process RDF data."""
+    @property
+    def skos_core(self):
+        """Return the SKOS Core namespace."""
+        return rdflib.Namespace("http://www.w3.org/2004/02/skos/core#")
+    def _iter(self, rdf_graph):
+        """Iterate over the RDF graph, yielding one subject at a time."""
+        for subject, _, _ in rdf_graph.triples(
+            (None, rdflib.RDF.type, self.skos_core.Concept)
+        ):
+            yield {"subject": subject, "rdf_graph": rdf_graph}
+    def read(self, item=None, *args, **kwargs):
+        """Fetch and process the RDF data, yielding it one subject at a time."""
+        if isinstance(item, gzip.GzipFile):
+            rdf_content = item.read().decode("utf-8")
+        elif isinstance(item, bytes):
+            rdf_content = item.decode("utf-8")
+        else:
+            raise ReaderError("Unsupported content type")
+        rdf_graph = rdflib.Graph()
+        rdf_graph.parse(io.StringIO(rdf_content), format="xml")
+        yield from self._iter(rdf_graph)
+class SPARQLReader(BaseReader):
+    """Generic reader class to fetch and process RDF data from a SPARQL endpoint."""
+    def __init__(self, origin, query, mode="r", *args, **kwargs):
+        """Initialize the reader with the data source.
+        :param origin: The SPARQL endpoint from which to fetch the RDF data.
+        :param query: The SPARQL query to execute.
+        :param mode: Mode of operation (default is 'r' for reading).
+        """
+        self._origin = origin
+        self._query = query
+        super().__init__(origin=origin, mode=mode, *args, **kwargs)
+    def _iter(self, fp, *args, **kwargs):
+        raise NotImplementedError(
+            "SPARQLReader downloads one result set from SPARQL and therefore does not iterate through items"
+        )
+    def read(self, item=None, *args, **kwargs):
+        """Fetch and process RDF data, yielding results one at a time."""
+        if item:
+            raise NotImplementedError(
+                "SPARQLReader does not support being chained after another reader"
+            )
+        sparql_client = sparql.SPARQLWrapper(self._origin)
+        sparql_client.setQuery(self._query)
+        sparql_client.setReturnFormat(sparql.JSON)
+        results = sparql_client.query().convert()
+        yield from results["results"]["bindings"]

invenio_vocabularies/datastreams/transformers.py CHANGED Viewed

@@ -15,6 +15,11 @@ from lxml import etree
 from .errors import TransformerError
 from .xml import etree_to_dict
+try:
+    import rdflib
+except ImportError:
+    rdflib = None
 class BaseTransformer(ABC):
     """Base transformer."""
@@ -61,3 +66,53 @@ class XMLTransformer(BaseTransformer):
         stream_entry.entry = record
         return stream_entry
+class RDFTransformer(BaseTransformer):
+    """Base Transformer class for RDF data to dictionary format."""
+    @property
+    def skos_core(self):
+        """Get the SKOS core namespace."""
+        return rdflib.Namespace("http://www.w3.org/2004/02/skos/core#")
+    def _get_labels(self, subject, rdf_graph):
+        """Extract labels (prefLabel or altLabel) for a subject."""
+        labels = {
+            label.language: label.value.capitalize()
+            for _, _, label in rdf_graph.triples(
+                (subject, self.skos_core.prefLabel, None)
+            )
+            if label.language and "-" not in label.language
+        }
+        if "en" not in labels:
+            for _, _, label in rdf_graph.triples(
+                (subject, self.skos_core.altLabel, None)
+            ):
+                labels.setdefault(label.language, label.value.capitalize())
+        return labels
+    def _find_parents(self, subject, rdf_graph):
+        """Find parent notations."""
+        return [
+            self._get_parent_notation(broader, rdf_graph)
+            for broader in rdf_graph.transitive_objects(subject, self.skos_core.broader)
+            if broader != subject
+        ]
+    def _get_parent_notation(self, broader, rdf_graph):
+        """Extract notation for a parent."""
+        raise NotImplementedError("This method should be implemented in a subclass.")
+    def _transform_entry(self, subject, rdf_graph):
+        """Transform an RDF subject entry into the desired dictionary format."""
+        raise NotImplementedError("This method should be implemented in a subclass.")
+    def apply(self, stream_entry, *args, **kwargs):
+        """Apply transformation to a stream entry."""
+        stream_entry.entry = self._transform_entry(
+            stream_entry.entry["subject"], stream_entry.entry["rdf_graph"]
+        )
+        return stream_entry

invenio_vocabularies/factories.py CHANGED Viewed

@@ -16,6 +16,9 @@ from invenio_records_resources.proxies import current_service_registry
 from .contrib.affiliations.datastreams import (
     DATASTREAM_CONFIG as affiliations_ds_config,
 )
+from .contrib.affiliations.datastreams import (
+    DATASTREAM_CONFIG_EDMO as affiliations_edmo_ds_config,
+)
 from .contrib.affiliations.datastreams import (
     DATASTREAM_CONFIG_OPENAIRE as affiliations_openaire_ds_config,
 )
@@ -123,6 +126,17 @@ class AffiliationsOpenAIREVocabularyConfig(VocabularyConfig):
         raise NotImplementedError("Service not implemented for OpenAIRE Affiliations")
+class AffiliationsEDMOVocabularyConfig(VocabularyConfig):
+    """European Directory of Marine Organisations (EDMO) Affiliations Vocabulary Config."""
+    config = affiliations_edmo_ds_config
+    vocabulary_name = "affiliations:edmo"
+    def get_service(self):
+        """Get the service for the vocabulary."""
+        raise NotImplementedError("Service not implemented for EDMO Affiliations")
 def get_vocabulary_config(vocabulary):
     """Factory function to get the appropriate Vocabulary Config."""
     vocab_config = {
@@ -132,6 +146,7 @@ def get_vocabulary_config(vocabulary):
         "awards:cordis": AwardsCordisVocabularyConfig,
         "affiliations": AffiliationsVocabularyConfig,
         "affiliations:openaire": AffiliationsOpenAIREVocabularyConfig,
+        "affiliations:edmo": AffiliationsEDMOVocabularyConfig,
         "subjects": SubjectsVocabularyConfig,
     }
     return vocab_config.get(vocabulary, VocabularyConfig)()

invenio_vocabularies/jobs.py CHANGED Viewed

@@ -18,6 +18,8 @@ from marshmallow_utils.fields import TZDateTime
 from invenio_vocabularies.services.tasks import process_datastream
+from .contrib.names.datastreams import ORCID_PRESET_DATASTREAM_CONFIG
 class ArgsSchema(Schema):
     """Schema of task input arguments."""
@@ -194,3 +196,16 @@ class UpdateAwardsCordisJob(ProcessDataStreamJob):
                 ],
             }
         }
+class ImportORCIDJob(ProcessDataStreamJob):
+    """Import ORCID data registered task."""
+    description = "Import ORCID data"
+    title = "Import ORCID data"
+    id = "import_orcid"
+    @classmethod
+    def default_args(cls, job_obj, **kwargs):
+        """Generate default job arguments."""
+        return {"config": {**ORCID_PRESET_DATASTREAM_CONFIG}}

invenio_vocabularies/records/jsonschemas/vocabularies/definitions-v1.0.0.json CHANGED Viewed

@@ -8,6 +8,15 @@
       }
     }
   },
+  "description": {
+    "type": "object",
+    "description": "Description of vocabulary item. Keys are locale codes.",
+    "patternProperties": {
+      "^[a-z]{2}$": {
+        "type": "string"
+      }
+    }
+  },
   "icon": {
     "type": "string"
   },

invenio_vocabularies/services/config.py CHANGED Viewed

@@ -14,17 +14,11 @@ from flask import current_app
 from invenio_i18n import lazy_gettext as _
 from invenio_records_resources.services import (
     Link,
-    LinksTemplate,
-    RecordService,
     RecordServiceConfig,
     SearchOptions,
     pagination_links,
 )
-from invenio_records_resources.services.base import (
-    ConditionalLink,
-    Service,
-    ServiceListResult,
-)
+from invenio_records_resources.services.base import ConditionalLink
 from invenio_records_resources.services.records.components import DataComponent
 from invenio_records_resources.services.records.params import (
     FilterParam,

invenio_vocabularies/services/generators.py ADDED Viewed

@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+#
+"""Vocabulary generators."""
+from invenio_access import any_user, authenticated_user
+from invenio_records_permissions.generators import ConditionalGenerator
+from invenio_search.engine import dsl
+class IfTags(ConditionalGenerator):
+    """Generator to filter based on tags.
+    This generator will filter out records based on the tags field.
+    """
+    def __init__(self, tags, then_, else_):
+        """Constructor."""
+        self.tags = tags or []
+        super().__init__(then_, else_)
+    def _condition(self, record=None, **kwargs):
+        """Check if the record has the tags."""
+        return any(tag in record.get("tags", []) for tag in self.tags)
+    def query_filter(self, **kwargs):
+        """Search based on configured tags."""
+        must_not_clauses = [dsl.Q("terms", tags=self.tags)]
+        return dsl.Q(
+            "bool",
+            must_not=must_not_clauses,
+        )

invenio_vocabularies/services/permissions.py CHANGED Viewed

@@ -11,12 +11,17 @@
 from invenio_records_permissions import RecordPermissionPolicy
 from invenio_records_permissions.generators import AnyUser, SystemProcess
+from invenio_vocabularies.services.generators import IfTags
 class PermissionPolicy(RecordPermissionPolicy):
     """Permission policy."""
     can_search = [SystemProcess(), AnyUser()]
-    can_read = [SystemProcess(), AnyUser()]
+    can_read = [
+        SystemProcess(),
+        IfTags(["unlisted"], then_=[SystemProcess()], else_=[AnyUser()]),
+    ]
     can_create = [SystemProcess()]
     can_update = [SystemProcess()]
     can_delete = [SystemProcess()]

{invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: invenio-vocabularies
-Version: 6.5.0
+Version: 6.7.0
 Summary: Invenio module for managing vocabularies.
 Home-page: https://github.com/inveniosoftware/invenio-vocabularies
 Author: CERN
@@ -15,9 +15,10 @@ Requires-Dist: invenio-records-resources <7.0.0,>=6.0.0
 Requires-Dist: invenio-administration <3.0.0,>=2.0.0
 Requires-Dist: invenio-jobs <2.0.0,>=1.0.0
 Requires-Dist: lxml >=4.5.0
+Requires-Dist: pycountry <23.0.0,>=22.3.5
 Requires-Dist: PyYAML >=5.4.1
 Requires-Dist: regex >=2024.7.24
-Requires-Dist: rdflib >=7.0.0
+Requires-Dist: SPARQLWrapper >=2.0.0
 Provides-Extra: elasticsearch7
 Requires-Dist: invenio-search[elasticsearch7] <3.0.0,>=2.1.0 ; extra == 'elasticsearch7'
 Provides-Extra: mysql
@@ -28,8 +29,12 @@ Requires-Dist: invenio-search[opensearch1] <3.0.0,>=2.1.0 ; extra == 'opensearch
 Provides-Extra: opensearch2
 Requires-Dist: invenio-search[opensearch2] <3.0.0,>=2.1.0 ; extra == 'opensearch2'
 Provides-Extra: postgresql
+Provides-Extra: rdf
+Requires-Dist: rdflib >=7.0.0 ; extra == 'rdf'
 Provides-Extra: s3fs
 Requires-Dist: s3fs >=2024.6.1 ; extra == 's3fs'
+Provides-Extra: sparql
+Requires-Dist: SPARQLWrapper >=2.0.0 ; extra == 'sparql'
 Provides-Extra: sqlite
 Provides-Extra: tests
 Requires-Dist: pytest-black-ng >=0.4.0 ; extra == 'tests'
@@ -83,6 +88,31 @@ https://invenio-vocabularies.readthedocs.io/
 Changes
 =======
+Version v6.7.0 (released 2024-11-27)
+- contrib: improve search accuracy for names, funders, affiliations
+- names: add affiliation acronym in mappings and schema
+    * Dereferences the affiliation `acronym` when indexing names and serving
+      REST API results. This is useful for disambiguating authors in search.
+- affiliations: move RDF and SPARQL as extra dependencies
+    * Moves `rdflib` and `SPARQLWrapper` to extras.
+- affiliation: refactored edmo datastreams
+- subjects: added datastream for GEMET vocabulary
+- awards/schema.py: read app config for alternate funding validation (#429)
+- awards: fix description field and mappings
+- awards: add fields start/end date and description
+Version v6.6.0 (released 2024-11-15)
+- mesh: add title en if not present
+- subjects: add subject to search fields
+- jobs: add ORCID job
+- global: Add unlisted tag
+    * This adds a new tag to the vocabularies to allow for unlisted
+      vocabularies. This is useful for vocabularies that are not meant to be
+      displayed in the UI.
+    * This requires to update the names mapping to add the props.
 Version v6.5.0 (released 2024-10-31)
 - subjects: euroscivoc: change default to latest version-less URL

invenio-vocabularies 6.5.0__py2.py3-none-any.whl → 6.7.0__py2.py3-none-any.whl

Potentially problematic release.

invenio-vocabularies 6.5.0-py2.py3-none-any.whl → 6.7.0-py2.py3-none-any.whl