PyPI - invenio-vocabularies - Versions diffs - 6.6.0__py2.py3-none-any.whl → 6.8.0__py2.py3-none-any.whl - Mend

invenio-vocabularies 6.6.0 (py2.py3-none-any.whl) → 6.8.0 (py2.py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of invenio-vocabularies might be problematic. Click here for more details.

Files changed (46)

invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022-2024 CERN.
+# Copyright (C) 2024 CERN.
 #
 # Invenio-Vocabularies is free software; you can redistribute it and/or
 # modify it under the terms of the MIT License; see LICENSE file for more
@@ -8,121 +8,65 @@
 """EuroSciVoc subjects datastreams, readers, transformers, and writers."""
-import io
-from collections import namedtuple
-import requests
-from rdflib import OWL, RDF, Graph, Namespace
-from invenio_vocabularies.config import SUBJECTS_EUROSCIVOC_FILE_URL
-from invenio_vocabularies.datastreams.readers import BaseReader
-from invenio_vocabularies.datastreams.transformers import BaseTransformer
-class EuroSciVocSubjectsHTTPReader(BaseReader):
-    """Reader class to fetch and process EuroSciVoc RDF data."""
-    def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
-        """Initialize the reader with the data source.
-        :param origin: The URL from which to fetch the RDF data.
-        :param mode: Mode of operation (default is 'r' for reading).
-        """
-        self.origin = origin or SUBJECTS_EUROSCIVOC_FILE_URL
-        super().__init__(origin=origin, mode=mode, *args, **kwargs)
-    def _iter(self, rdf_graph):
-        """Iterate over the RDF graph, yielding one subject at a time.
-        :param rdf_graph: The RDF graph to process.
-        :yield: Subject and graph to be transformed.
-        """
-        SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
-        for subject, _, _ in rdf_graph.triples((None, RDF.type, SKOS_CORE.Concept)):
-            yield {"subject": subject, "rdf_graph": rdf_graph}
-    def read(self, item=None, *args, **kwargs):
-        """Fetch and process the EuroSciVoc RDF data, yielding it one subject at a time.
-        :param item: The RDF data provided as bytes (optional).
-        :yield: Processed EuroSciVoc subject data.
-        """
-        if item:
-            raise NotImplementedError(
-                "EuroSciVocSubjectsHTTPReader does not support being chained after another reader"
-            )
-        # Fetch the RDF data from the specified origin URL
-        response = requests.get(self.origin)
-        response.raise_for_status()
-        # Treat the response content as a file-like object
-        rdf_data = io.BytesIO(response.content)
-        # Parse the RDF data into a graph
-        rdf_graph = Graph()
-        rdf_graph.parse(rdf_data, format="xml")
-        # Yield each processed subject from the RDF graph
-        yield from self._iter(rdf_graph)
-class EuroSciVocSubjectsTransformer(BaseTransformer):
-    """Transformer class to convert EuroSciVoc RDF data to a dictionary format."""
-    SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
-    SPLITCHAR = ","
+from invenio_vocabularies.datastreams.transformers import RDFTransformer
+from ..config import euroscivoc_file_url
+class EuroSciVocSubjectsTransformer(RDFTransformer):
+    """
+     Transformer class to convert EuroSciVoc RDF data to a dictionary format.
+     Input:
+         - Relevant fields:
+             - `skos:notation`: Primary identifier for the concept.
+             - `skos:prefLabel`: Preferred labels with language codes.
+             - `skos:altLabel`: Alternative labels.
+             - `skos:broader`: Broader concepts that this concept belongs to.
+    Output:
+         {
+             "id": "euroscivoc:1717",  # EuroSciVoc-specific concept ID (skos:notation).
+             "scheme": "EuroSciVoc",  # The scheme name indicating this is a EuroSciVoc concept.
+             "subject": "Satellite radio",  # The primary subject label (first preferred label in English, skos:prefLabel).
+             "title": {
+                 "it": "Radio satellitare",  # Italian preferred label (skos:prefLabel).
+                 "en": "Satellite radio",  # English preferred label (skos:prefLabel).
+             },
+             "props": {
+                 "parents": "euroscivoc:1225",  # The broader concept (skos:broader), identified by its EuroSciVoc Concept ID.
+             },
+             "identifiers": [
+                 {
+                     "scheme": "url",  # Type of identifier (URL).
+                     "identifier": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba",  # URI of the concept (rdf:about).
+                 }
+             ],
+         }
+    """
     def _get_notation(self, subject, rdf_graph):
         """Extract the numeric notation for a subject."""
         for _, _, notation in rdf_graph.triples(
-            (subject, self.SKOS_CORE.notation, None)
+            (subject, self.skos_core.notation, None)
         ):
             if str(notation).isdigit():
                 return str(notation)
         return None
-    def _get_labels(self, subject, rdf_graph):
-        """Extract prefLabel and altLabel languages for a subject."""
-        labels = {
-            label.language: label.value.capitalize()
-            for _, _, label in rdf_graph.triples(
-                (subject, self.SKOS_CORE.prefLabel, None)
-            )
-        }
-        if "en" not in labels:
-            for _, _, label in rdf_graph.triples(
-                (subject, self.SKOS_CORE.altLabel, None)
-            ):
-                labels.setdefault(label.language, label.value.capitalize())
-        return labels
-    def _find_parents(self, subject, rdf_graph):
-        """Find parent notations."""
-        parents = []
-        # Traverse the broader hierarchy
-        for broader in rdf_graph.transitive_objects(subject, self.SKOS_CORE.broader):
-            if broader != subject:  # Ensure we don't include the current subject
-                parent_notation = self._get_notation(broader, rdf_graph)
-                if parent_notation:
-                    parents.append(parent_notation)
-        return parents
+    def _get_parent_notation(self, broader, rdf_graph):
+        """Extract parent notation using numeric notation."""
+        return self._get_notation(broader, rdf_graph)
     def _transform_entry(self, subject, rdf_graph):
-        """Transform an entry to the required dictionary format."""
-        # Get subject notation with euroscivoc prefix
         notation = self._get_notation(subject, rdf_graph)
         id = f"euroscivoc:{notation}" if notation else None
-        # Get labels for the current subject
         labels = self._get_labels(subject, rdf_graph)
-        # Join parent notations with SPLITCHAR separator and add euroscivoc prefix
-        parents = self.SPLITCHAR.join(
-            f"euroscivoc:{n}" for n in reversed(self._find_parents(subject, rdf_graph))
+        parents = ",".join(
+            f"euroscivoc:{n}"
+            for n in reversed(self._find_parents(subject, rdf_graph))
+            if n
         )
-        # Create identifiers list
-        identifiers = [{"scheme": "url", "identifier": str(subject)}]
         return {
             "id": id,
@@ -130,27 +74,11 @@ class EuroSciVocSubjectsTransformer(BaseTransformer):
             "subject": labels.get("en", "").capitalize(),
             "title": labels,
             "props": {"parents": parents} if parents else {},
-            "identifiers": identifiers,
+            "identifiers": self._get_identifiers(subject),
         }
-    def apply(self, stream_entry, *args, **kwargs):
-        """Transform a stream entry to the required dictionary format.
-        :param stream_entry: The entry to be transformed, which includes the subject and the RDF graph.
-        :return: The transformed stream entry.
-        """
-        # Apply transformations
-        entry_data = self._transform_entry(
-            stream_entry.entry["subject"], stream_entry.entry["rdf_graph"]
-        )
-        stream_entry.entry = entry_data
-        return stream_entry
-# Configuration for datastream readers, transformers, and writers
-VOCABULARIES_DATASTREAM_READERS = {"euroscivoc-reader": EuroSciVocSubjectsHTTPReader}
-VOCABULARIES_DATASTREAM_WRITERS = {}
+# Configuration for datastream
 VOCABULARIES_DATASTREAM_TRANSFORMERS = {
     "euroscivoc-transformer": EuroSciVocSubjectsTransformer
@@ -159,13 +87,15 @@ VOCABULARIES_DATASTREAM_TRANSFORMERS = {
 DATASTREAM_CONFIG = {
     "readers": [
         {
-            "type": "euroscivoc-reader",
-        }
-    ],
-    "transformers": [{"type": "euroscivoc-transformer"}],
-    "writers": [
+            "type": "http",
+            "args": {
+                "origin": euroscivoc_file_url,
+            },
+        },
         {
-            "type": "subjects-service",
-        }
+            "type": "rdf",
+        },
     ],
+    "transformers": [{"type": "euroscivoc-transformer"}],
+    "writers": [{"args": {"writer": {"type": "subjects-service"}}, "type": "async"}],
 }

invenio_vocabularies/contrib/subjects/gemet/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+"""GEMET Subjects module."""

invenio_vocabularies/contrib/subjects/gemet/datastreams.py ADDED Viewed

@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+"""GEMET subjects datastreams, readers, transformers, and writers."""
+from invenio_vocabularies.datastreams.transformers import RDFTransformer
+from ..config import gemet_file_url
+# Available with the "rdf" extra
+try:
+    import rdflib
+except ImportError:
+    rdflib = None
+class GEMETSubjectsTransformer(RDFTransformer):
+    """
+    Transformer class to convert GEMET RDF data to a dictionary format.
+    Input:
+        - Relevant fields:
+            - `skos:prefLabel`: Preferred labels with language codes.
+            - `skos:broader`: References to broader concepts (parent concepts).
+            - `skos:memberOf`: References to groups or themes the concept belongs to.
+    Output:
+        - A dictionary with the following structure:
+            {
+                "id": "gemet:concept/10008",  # GEMET-specific concept ID (skos:Concept).
+                "scheme": "GEMET",  # The scheme name indicating this is a GEMET concept.
+                "subject": "Consumer product",  # The subject label (first preferred label in English, skos:prefLabel).
+                "title": {
+                    "en": "Consumer product",  # English label for the concept (skos:prefLabel).
+                    "ar": "منتج استهلاكي"  # Arabic label for the concept (skos:prefLabel).
+                },
+                "props": {
+                    "parents": "gemet:concept/6660",  # The parent concept (skos:broader), identified by its GEMET Concept ID.
+                    "groups": ["http://www.eionet.europa.eu/gemet/group/10112"],  # Group the concept belongs to (skos:memberOf)(skos:prefLabel).
+                    "themes": [
+                        "http://www.eionet.europa.eu/gemet/theme/27",  # Theme the concept belongs to (skos:memberOf)(rdfs:label).
+                    ]
+                },
+                "identifiers": [
+                    {
+                        "scheme": "url",  # Type of identifier (URL).
+                        "identifier": "http://www.eionet.europa.eu/gemet/concept/10008"  # URI of the concept (rdf:about).
+                    }
+                ]
+            }
+    """
+    def _get_parent_notation(self, broader, rdf_graph):
+        """Extract parent notation from GEMET URI."""
+        return "/".join(broader.split("/")[-2:])
+    def _get_groups_and_themes(self, subject, rdf_graph):
+        """Extract groups and themes for a subject."""
+        groups = []
+        themes = []
+        for relation in rdf_graph.subjects(
+            predicate=self.skos_core.member, object=subject
+        ):
+            relation_uri = str(relation)
+            relation_label = None
+            # If the relation is a group, check for skos:prefLabel
+            if "group" in relation_uri:
+                labels = rdf_graph.objects(
+                    subject=relation, predicate=self.skos_core.prefLabel
+                )
+                relation_label = next(
+                    (str(label) for label in labels if label.language == "en"), None
+                )
+                groups.append(relation_uri)
+            # If the relation is a theme, check for rdfs:label
+            elif "theme" in relation_uri:
+                labels = rdf_graph.objects(
+                    subject=relation, predicate=rdflib.RDFS.label
+                )
+                relation_label = next(
+                    (str(label) for label in labels if label.language == "en"), None
+                )
+                themes.append(relation_uri)
+        return groups, themes
+    def _transform_entry(self, subject, rdf_graph):
+        """Transform an entry to the required dictionary format."""
+        concept_number = "/".join(subject.split("/")[-2:])
+        id = f"gemet:{concept_number}" if concept_number else None
+        labels = self._get_labels(subject, rdf_graph)
+        parents = ",".join(
+            f"gemet:{n}" for n in reversed(self._find_parents(subject, rdf_graph)) if n
+        )
+        identifiers = [{"scheme": "url", "identifier": str(subject)}]
+        groups, themes = self._get_groups_and_themes(subject, rdf_graph)
+        props = {"parents": parents} if parents else {}
+        if groups:
+            props["groups"] = groups
+        if themes:
+            props["themes"] = themes
+        return {
+            "id": id,
+            "scheme": "GEMET",
+            "subject": labels.get("en", "").capitalize(),
+            "title": labels,
+            "props": props,
+            "identifiers": self._get_identifiers(subject),
+        }
+# Configuration for datastream
+VOCABULARIES_DATASTREAM_TRANSFORMERS = {"gemet-transformer": GEMETSubjectsTransformer}
+DATASTREAM_CONFIG = {
+    "readers": [
+        {
+            "type": "http",
+            "args": {
+                "origin": gemet_file_url,
+            },
+        },
+        {"type": "gzip"},
+        {"type": "rdf"},
+    ],
+    "transformers": [{"type": "gemet-transformer"}],
+    "writers": [{"args": {"writer": {"type": "subjects-service"}}, "type": "async"}],
+}

invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json CHANGED Viewed

@@ -34,9 +34,20 @@
       "type": "object",
       "patternProperties": {
         "^.*$": {
-          "type": "string"
+          "oneOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "array",
+              "items": {
+                "type": "string"
+              }
+            }
+          ]
         }
-      }
+      },
+      "additionalProperties": false
     },
     "identifiers": {
       "description": "Alternate identifiers for the subject.",

invenio_vocabularies/contrib/subjects/schema.py CHANGED Viewed

@@ -13,8 +13,8 @@
 from functools import partial
 from invenio_i18n import get_locale
-from marshmallow import EXCLUDE, Schema, fields, pre_load
-from marshmallow_utils.fields import IdentifierSet, SanitizedUnicode
+from marshmallow import EXCLUDE, Schema, ValidationError, fields, pre_load, validate
+from marshmallow_utils.fields import URL, IdentifierSet, SanitizedUnicode
 from marshmallow_utils.schemas import IdentifierSchema
 from ...services.schema import (
@@ -25,6 +25,21 @@ from ...services.schema import (
 from .config import subject_schemes
+class StringOrListOfStrings(fields.Field):
+    """Custom field to handle both string and list of strings."""
+    # TODO: Move this to marshmallow-utils for broader type support.
+    def _deserialize(self, value, attr, data, **kwargs):
+        if isinstance(value, str):
+            return fields.String()._deserialize(value, attr, data, **kwargs)
+        elif isinstance(value, list) and all(isinstance(item, str) for item in value):
+            return [
+                fields.String()._deserialize(item, attr, data, **kwargs)
+                for item in value
+            ]
+        raise ValidationError("Invalid value. Must be a string or a list of strings.")
 class SubjectSchema(BaseVocabularySchema):
     """Service schema for subjects."""
@@ -35,7 +50,7 @@ class SubjectSchema(BaseVocabularySchema):
     scheme = SanitizedUnicode(required=True)
     subject = SanitizedUnicode(required=True)
     title = i18n_strings
-    props = fields.Dict(keys=SanitizedUnicode(), values=SanitizedUnicode())
+    props = fields.Dict(keys=SanitizedUnicode(), values=StringOrListOfStrings())
     identifiers = IdentifierSet(
         fields.Nested(
             partial(

invenio_vocabularies/datastreams/datastreams.py CHANGED Viewed

@@ -48,7 +48,16 @@ class StreamEntry:
 class DataStream:
     """Data stream."""
-    def __init__(self, readers, writers, transformers=None, *args, **kwargs):
+    def __init__(
+        self,
+        readers,
+        writers,
+        transformers=None,
+        batch_size=100,
+        write_many=False,
+        *args,
+        **kwargs,
+    ):
         """Constructor.
         :param readers: an ordered list of readers.
@@ -58,12 +67,14 @@ class DataStream:
         self._readers = readers
         self._transformers = transformers
         self._writers = writers
+        self.batch_size = batch_size
+        self.write_many = write_many
     def filter(self, stream_entry, *args, **kwargs):
         """Checks if an stream_entry should be filtered out (skipped)."""
         return False
-    def process_batch(self, batch, write_many=False):
+    def process_batch(self, batch):
         """Process a batch of entries."""
         transformed_entries = []
         for stream_entry in batch:
@@ -79,12 +90,12 @@ class DataStream:
                 else:
                     transformed_entries.append(transformed_entry)
         if transformed_entries:
-            if write_many:
+            if self.write_many:
                 yield from self.batch_write(transformed_entries)
             else:
                 yield from (self.write(entry) for entry in transformed_entries)
-    def process(self, batch_size=100, write_many=False, *args, **kwargs):
+    def process(self, *args, **kwargs):
         """Iterates over the entries.
         Uses the reader to get the raw entries and transforms them.
@@ -95,13 +106,13 @@ class DataStream:
         batch = []
         for stream_entry in self.read():
             batch.append(stream_entry)
-            if len(batch) >= batch_size:
-                yield from self.process_batch(batch, write_many=write_many)
+            if len(batch) >= self.batch_size:
+                yield from self.process_batch(batch)
                 batch = []
         # Process any remaining entries in the last batch
         if batch:
-            yield from self.process_batch(batch, write_many=write_many)
+            yield from self.process_batch(batch)
     def read(self):
         """Recursively read the entries."""

invenio_vocabularies/datastreams/factories.py CHANGED Viewed

@@ -81,4 +81,6 @@ class DataStreamFactory:
             for t_conf in transformers_config:
                 transformers.append(TransformerFactory.create(t_conf))
-        return DataStream(readers=readers, writers=writers, transformers=transformers)
+        return DataStream(
+            readers=readers, writers=writers, transformers=transformers, **kwargs
+        )

invenio_vocabularies/datastreams/readers.py CHANGED Viewed

@@ -11,6 +11,7 @@
 import csv
 import gzip
+import io
 import json
 import re
 import tarfile
@@ -27,11 +28,25 @@ from lxml.html import parse as html_parse
 from .errors import ReaderError
 from .xml import etree_to_dict
+# Extras dependencies
+# "oaipmh"
 try:
     import oaipmh_scythe
 except ImportError:
     oaipmh_scythe = None
+# "rdf"
+try:
+    import rdflib
+except ImportError:
+    rdflib = None
+# "sparql"
+try:
+    import SPARQLWrapper as sparql
+except ImportError:
+    sparql = None
 class BaseReader(ABC):
     """Base reader."""
@@ -103,8 +118,7 @@ class SimpleHTTPReader(BaseReader):
     def __init__(self, origin, id=None, ids=None, content_type=None, *args, **kwargs):
         """Constructor."""
-        assert id or ids
-        self._ids = ids if ids else [id]
+        self._ids = ids if ids else ([id] if id else None)
         self.content_type = content_type
         super().__init__(origin, *args, **kwargs)
@@ -113,14 +127,22 @@ class SimpleHTTPReader(BaseReader):
         base_url = url
         headers = {"Accept": self.content_type}
-        for id_ in self._ids:
-            url = base_url.format(id=id_)
+        # If there are no IDs, query the base URL
+        if not self._ids:
             resp = requests.get(url, headers=headers)
-            if resp.status_code != 200:
-                # todo add logging/fail
-                pass
+            if resp.status_code == 200:
+                yield resp.content
+            else:
+                print(f"Failed to fetch URL {url}: {resp.status_code}")
+        else:
+            for id_ in self._ids:
+                url = base_url.format(id=id_)
+                resp = requests.get(url, headers=headers)
+                if resp.status_code != 200:
+                    # todo add logging/fail
+                    pass
-            yield resp.content
+                yield resp.content
     def read(self, item=None, *args, **kwargs):
         """Chooses between item and origin as url."""
@@ -197,6 +219,9 @@ class GzipReader(BaseReader):
     """Gzip reader."""
     def _iter(self, fp, *args, **kwargs):
+        if isinstance(fp, bytes):
+            fp = io.BytesIO(fp)
         with gzip.open(fp) as gp:
             yield gp
@@ -236,7 +261,7 @@ class XMLReader(BaseReader):
         try:
             xml_tree = fromstring(fp)
             xml_dict = etree_to_dict(xml_tree)
-        except Exception as e:
+        except Exception:
             xml_tree = html_parse(fp).getroot()
             xml_dict = etree_to_dict(xml_tree)["html"]["body"]
@@ -346,3 +371,68 @@ def xml_to_dict(tree: etree._Element):
     dict_obj["record"] = etree.tostring(tree)
     return dict_obj
+class RDFReader(BaseReader):
+    """Base Reader class to fetch and process RDF data."""
+    @property
+    def skos_core(self):
+        """Return the SKOS Core namespace."""
+        return rdflib.Namespace("http://www.w3.org/2004/02/skos/core#")
+    def _iter(self, rdf_graph):
+        """Iterate over the RDF graph, yielding one subject at a time."""
+        for subject, _, _ in rdf_graph.triples(
+            (None, rdflib.RDF.type, self.skos_core.Concept)
+        ):
+            yield {"subject": subject, "rdf_graph": rdf_graph}
+    def read(self, item=None, *args, **kwargs):
+        """Fetch and process the RDF data, yielding it one subject at a time."""
+        if isinstance(item, gzip.GzipFile):
+            rdf_content = item.read().decode("utf-8")
+        elif isinstance(item, bytes):
+            rdf_content = item.decode("utf-8")
+        else:
+            raise ReaderError("Unsupported content type")
+        rdf_graph = rdflib.Graph()
+        rdf_graph.parse(io.StringIO(rdf_content), format="xml")
+        yield from self._iter(rdf_graph)
+class SPARQLReader(BaseReader):
+    """Generic reader class to fetch and process RDF data from a SPARQL endpoint."""
+    def __init__(self, origin, query, mode="r", *args, **kwargs):
+        """Initialize the reader with the data source.
+        :param origin: The SPARQL endpoint from which to fetch the RDF data.
+        :param query: The SPARQL query to execute.
+        :param mode: Mode of operation (default is 'r' for reading).
+        """
+        self._origin = origin
+        self._query = query
+        super().__init__(origin=origin, mode=mode, *args, **kwargs)
+    def _iter(self, fp, *args, **kwargs):
+        raise NotImplementedError(
+            "SPARQLReader downloads one result set from SPARQL and therefore does not iterate through items"
+        )
+    def read(self, item=None, *args, **kwargs):
+        """Fetch and process RDF data, yielding results one at a time."""
+        if item:
+            raise NotImplementedError(
+                "SPARQLReader does not support being chained after another reader"
+            )
+        sparql_client = sparql.SPARQLWrapper(self._origin)
+        sparql_client.setQuery(self._query)
+        sparql_client.setReturnFormat(sparql.JSON)
+        results = sparql_client.query().convert()
+        yield from results["results"]["bindings"]

invenio-vocabularies 6.6.0__py2.py3-none-any.whl → 6.8.0__py2.py3-none-any.whl

Potentially problematic release.

invenio-vocabularies 6.6.0 (py2.py3-none-any.whl) → 6.8.0 (py2.py3-none-any.whl)