invenio-vocabularies 4.1.1__py2.py3-none-any.whl → 4.3.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of invenio-vocabularies might be problematic. Click here for more details.

Files changed (35) hide show
  1. invenio_vocabularies/__init__.py +1 -1
  2. invenio_vocabularies/administration/views/vocabularies.py +1 -0
  3. invenio_vocabularies/cli.py +17 -6
  4. invenio_vocabularies/config.py +15 -1
  5. invenio_vocabularies/contrib/affiliations/api.py +1 -2
  6. invenio_vocabularies/contrib/affiliations/datastreams.py +33 -8
  7. invenio_vocabularies/contrib/affiliations/services.py +1 -2
  8. invenio_vocabularies/contrib/awards/awards.py +2 -1
  9. invenio_vocabularies/contrib/awards/datastreams.py +1 -0
  10. invenio_vocabularies/contrib/awards/services.py +1 -2
  11. invenio_vocabularies/contrib/common/ror/datastreams.py +39 -5
  12. invenio_vocabularies/contrib/funders/datastreams.py +38 -11
  13. invenio_vocabularies/contrib/funders/funders.py +2 -1
  14. invenio_vocabularies/contrib/names/datastreams.py +160 -2
  15. invenio_vocabularies/contrib/names/s3client.py +44 -0
  16. invenio_vocabularies/datastreams/datastreams.py +61 -13
  17. invenio_vocabularies/datastreams/readers.py +40 -15
  18. invenio_vocabularies/datastreams/tasks.py +37 -0
  19. invenio_vocabularies/datastreams/writers.py +70 -0
  20. invenio_vocabularies/factories.py +1 -0
  21. invenio_vocabularies/records/models.py +2 -4
  22. invenio_vocabularies/records/pidprovider.py +1 -2
  23. invenio_vocabularies/resources/__init__.py +1 -0
  24. invenio_vocabularies/resources/schema.py +2 -1
  25. invenio_vocabularies/services/custom_fields/subject.py +3 -2
  26. invenio_vocabularies/services/custom_fields/vocabulary.py +1 -1
  27. invenio_vocabularies/services/tasks.py +0 -30
  28. invenio_vocabularies/templates/semantic-ui/invenio_vocabularies/subjects.html +1 -1
  29. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/METADATA +18 -1
  30. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/RECORD +35 -33
  31. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/AUTHORS.rst +0 -0
  32. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/LICENSE +0 -0
  33. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/WHEEL +0 -0
  34. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/entry_points.txt +0 -0
  35. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2021-2022 CERN.
3
+ # Copyright (C) 2021-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -8,17 +8,41 @@
8
8
 
9
9
  """Base data stream."""
10
10
 
11
+ from flask import current_app
12
+
11
13
  from .errors import ReaderError, TransformerError, WriterError
12
14
 
13
15
 
14
16
  class StreamEntry:
15
17
  """Object to encapsulate streams processing."""
16
18
 
17
- def __init__(self, entry, errors=None):
18
- """Constructor."""
19
+ def __init__(self, entry, record=None, errors=None, op_type=None, exc=None):
20
+ """Constructor for the StreamEntry class.
21
+
22
+ :param entry (object): The entry object, usually a record dict.
23
+ :param record (object): The record object, usually a record class.
24
+ :param errors (list, optional): List of errors. Defaults to None.
25
+ :param op_type (str, optional): The operation type. Defaults to None.
26
+ :param exc (str, optional): The raised unhandled exception. Defaults to None.
27
+ """
19
28
  self.entry = entry
29
+ self.record = record
20
30
  self.filtered = False
21
31
  self.errors = errors or []
32
+ self.op_type = op_type
33
+ self.exc = exc
34
+
35
+ def log_errors(self, logger=None):
36
+ """Log the errors using the provided logger or the default logger.
37
+
38
+ :param logger (logging.Logger, optional): Logger instance to use. Defaults to None.
39
+ """
40
+ if logger is None:
41
+ logger = current_app.logger
42
+ for error in self.errors:
43
+ logger.error(f"Error in entry {self.entry}: {error}")
44
+ if self.exc:
45
+ logger.error(f"Exception in entry {self.entry}: {self.exc}")
22
46
 
23
47
 
24
48
  class DataStream:
@@ -39,15 +63,10 @@ class DataStream:
39
63
  """Checks if a stream_entry should be filtered out (skipped)."""
40
64
  return False
41
65
 
42
- def process(self, *args, **kwargs):
43
- """Iterates over the entries.
44
-
45
- Uses the reader to get the raw entries and transforms them.
46
- It will iterate over the `StreamEntry` objects returned by
47
- the reader, apply the transformations and yield the result of
48
- writing it.
49
- """
50
- for stream_entry in self.read():
66
+ def process_batch(self, batch, write_many=False):
67
+ """Process a batch of entries."""
68
+ transformed_entries = []
69
+ for stream_entry in batch:
51
70
  if stream_entry.errors:
52
71
  yield stream_entry # reading errors
53
72
  else:
@@ -58,7 +77,31 @@ class DataStream:
58
77
  transformed_entry.filtered = True
59
78
  yield transformed_entry
60
79
  else:
61
- yield self.write(transformed_entry)
80
+ transformed_entries.append(transformed_entry)
81
+ if transformed_entries:
82
+ if write_many:
83
+ yield from self.batch_write(transformed_entries)
84
+ else:
85
+ yield from (self.write(entry) for entry in transformed_entries)
86
+
87
+ def process(self, batch_size=100, write_many=False, *args, **kwargs):
88
+ """Iterates over the entries.
89
+
90
+ Uses the reader to get the raw entries and transforms them.
91
+ It will iterate over the `StreamEntry` objects returned by
92
+ the reader, apply the transformations and yield the result of
93
+ writing it.
94
+ """
95
+ batch = []
96
+ for stream_entry in self.read():
97
+ batch.append(stream_entry)
98
+ if len(batch) >= batch_size:
99
+ yield from self.process_batch(batch, write_many=write_many)
100
+ batch = []
101
+
102
+ # Process any remaining entries in the last batch
103
+ if batch:
104
+ yield from self.process_batch(batch, write_many=write_many)
62
105
 
63
106
  def read(self):
64
107
  """Recursively read the entries."""
@@ -107,6 +150,11 @@ class DataStream:
107
150
 
108
151
  return stream_entry
109
152
 
153
+ def batch_write(self, stream_entries, *args, **kwargs):
154
+ """Write the given stream entries with each configured writer. Errors are handled in the service layer."""
155
+ for writer in self._writers:
156
+ yield from writer.write_many(stream_entries)
157
+
110
158
  def total(self, *args, **kwargs):
111
159
  """The total of entries obtained from the origin."""
112
160
  raise NotImplementedError()
@@ -21,6 +21,7 @@ from json.decoder import JSONDecodeError
21
21
  import requests
22
22
  import yaml
23
23
  from lxml import etree
24
+ from lxml.html import fromstring
24
25
  from lxml.html import parse as html_parse
25
26
 
26
27
  from .errors import ReaderError
@@ -226,8 +227,13 @@ class XMLReader(BaseReader):
226
227
  def _iter(self, fp, *args, **kwargs):
227
228
  """Read and parse an XML file to dict."""
228
229
  # NOTE: We parse HTML, to skip XML validation and strip XML namespaces
229
- xml_tree = html_parse(fp).getroot()
230
- record = etree_to_dict(xml_tree)["html"]["body"].get("record")
230
+ record = None
231
+ try:
232
+ xml_tree = fromstring(fp)
233
+ record = etree_to_dict(xml_tree).get("record")
234
+ except Exception as e:
235
+ xml_tree = html_parse(fp).getroot()
236
+ record = etree_to_dict(xml_tree)["html"]["body"].get("record")
231
237
 
232
238
  if not record:
233
239
  raise ReaderError(f"Record not found in XML entry.")
@@ -270,19 +276,38 @@ class OAIPMHReader(BaseReader):
270
276
  self.xml.find(f".//{self._oai_namespace}metadata").getchildren()[0],
271
277
  )
272
278
 
273
- scythe.class_mapping["ListRecords"] = OAIRecord
274
- try:
275
- records = scythe.list_records(
276
- from_=self._from,
277
- until=self._until,
278
- metadata_prefix=self._metadata_prefix,
279
- set_=self._set,
280
- ignore_deleted=True,
281
- )
282
- for record in records:
283
- yield {"record": record}
284
- except oaipmh_scythe.NoRecordsMatch:
285
- raise ReaderError(f"No records found in OAI-PMH request.")
279
+ if self._verb == "ListRecords":
280
+ scythe.class_mapping["ListRecords"] = OAIRecord
281
+ try:
282
+ records = scythe.list_records(
283
+ from_=self._from,
284
+ until=self._until,
285
+ metadata_prefix=self._metadata_prefix,
286
+ set_=self._set,
287
+ ignore_deleted=True,
288
+ )
289
+ for record in records:
290
+ yield {"record": record}
291
+ except oaipmh_scythe.NoRecordsMatch:
292
+ raise ReaderError("No records found in OAI-PMH request.")
293
+ else:
294
+ scythe.class_mapping["GetRecord"] = OAIRecord
295
+ try:
296
+ headers = scythe.list_identifiers(
297
+ from_=self._from,
298
+ until=self._until,
299
+ metadata_prefix=self._metadata_prefix,
300
+ set_=self._set,
301
+ ignore_deleted=True,
302
+ )
303
+ for header in headers:
304
+ record = scythe.get_record(
305
+ identifier=header.identifier,
306
+ metadata_prefix=self._metadata_prefix,
307
+ )
308
+ yield {"record": record}
309
+ except oaipmh_scythe.NoRecordsMatch:
310
+ raise ReaderError("No records found in OAI-PMH request.")
286
311
 
287
312
  def read(self, item=None, *args, **kwargs):
288
313
  """Reads from item or opens the file descriptor from origin."""
@@ -0,0 +1,37 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Copyright (C) 2022-2024 CERN.
4
+ #
5
+ # Invenio-Vocabularies is free software; you can redistribute it and/or
6
+ # modify it under the terms of the MIT License; see LICENSE file for more
7
+ # details.
8
+
9
+ """Data Streams Celery tasks."""
10
+
11
+ from celery import shared_task
12
+
13
+ from ..datastreams import StreamEntry
14
+ from ..datastreams.factories import WriterFactory
15
+
16
+
17
+ @shared_task(ignore_result=True)
18
+ def write_entry(writer_config, entry):
19
+ """Write an entry.
20
+
21
+ :param writer_config: writer configuration as accepted by the WriterFactory.
22
+ :param entry: dictionary, StreamEntry is not serializable.
23
+ """
24
+ writer = WriterFactory.create(config=writer_config)
25
+ writer.write(StreamEntry(entry))
26
+
27
+
28
+ @shared_task(ignore_result=True)
29
+ def write_many_entry(writer_config, entries):
30
+ """Write many entries.
31
+
32
+ :param writer_config: writer configuration as accepted by the WriterFactory.
33
+ :param entries: list of dictionaries, StreamEntry is not serializable.
34
+ """
35
+ writer = WriterFactory.create(config=writer_config)
36
+ stream_entries = [StreamEntry(entry) for entry in entries]
37
+ writer.write_many(stream_entries)
@@ -20,11 +20,17 @@ from marshmallow import ValidationError
20
20
 
21
21
  from .datastreams import StreamEntry
22
22
  from .errors import WriterError
23
+ from .tasks import write_entry, write_many_entry
23
24
 
24
25
 
25
26
  class BaseWriter(ABC):
26
27
  """Base writer."""
27
28
 
29
+ def __init__(self, *args, **kwargs):
30
+ """Base initialization logic."""
31
+ # Add any base initialization here if needed
32
+ pass
33
+
28
34
  @abstractmethod
29
35
  def write(self, stream_entry, *args, **kwargs):
30
36
  """Writes the input stream entry to the target output.
@@ -35,6 +41,16 @@ class BaseWriter(ABC):
35
41
  """
36
42
  pass
37
43
 
44
+ @abstractmethod
45
+ def write_many(self, stream_entries, *args, **kwargs):
46
+ """Writes the input streams entry to the target output.
47
+
48
+ :returns: A List of StreamEntry. The result of writing the entry.
49
+ Raises WriterException in case of errors.
50
+
51
+ """
52
+ pass
53
+
38
54
 
39
55
  class ServiceWriter(BaseWriter):
40
56
  """Writes the entries to an RDM instance using a Service object."""
@@ -85,6 +101,25 @@ class ServiceWriter(BaseWriter):
85
101
  # TODO: Check if we can get the error message easier
86
102
  raise WriterError([{"InvalidRelationValue": err.args[0]}])
87
103
 
104
+ def write_many(self, stream_entries, *args, **kwargs):
105
+ """Writes the input entries using a given service."""
106
+ entries = [entry.entry for entry in stream_entries]
107
+ entries_with_id = [(self._entry_id(entry), entry) for entry in entries]
108
+ results = self._service.create_or_update_many(self._identity, entries_with_id)
109
+ stream_entries_processed = []
110
+ for entry, result in zip(entries, results):
111
+ processed_stream_entry = StreamEntry(
112
+ entry=entry,
113
+ record=result.record,
114
+ errors=result.errors,
115
+ op_type=result.op_type,
116
+ exc=result.exc,
117
+ )
118
+ processed_stream_entry.log_errors()
119
+ stream_entries_processed.append(processed_stream_entry)
120
+
121
+ return stream_entries_processed
122
+
88
123
 
89
124
  class YamlWriter(BaseWriter):
90
125
  """Writes the entries to a YAML file."""
@@ -106,3 +141,38 @@ class YamlWriter(BaseWriter):
106
141
  yaml.safe_dump([stream_entry.entry], file, allow_unicode=True)
107
142
 
108
143
  return stream_entry
144
+
145
+ def write_many(self, stream_entries, *args, **kwargs):
146
+ """Writes the yaml input entries."""
147
+ with open(self._filepath, "a") as file:
148
+ yaml.safe_dump(
149
+ [stream_entry.entry for stream_entry in stream_entries],
150
+ file,
151
+ allow_unicode=True,
152
+ )
153
+
154
+
155
+ class AsyncWriter(BaseWriter):
156
+ """Writes the entries asynchronously (celery task)."""
157
+
158
+ def __init__(self, writer, *args, **kwargs):
159
+ """Constructor.
160
+
161
+ :param writer: writer to use.
162
+ """
163
+ super().__init__(*args, **kwargs)
164
+ self._writer = writer
165
+
166
+ def write(self, stream_entry, *args, **kwargs):
167
+ """Launches a celery task to write an entry."""
168
+ write_entry.delay(self._writer, stream_entry.entry)
169
+
170
+ return stream_entry
171
+
172
+ def write_many(self, stream_entries, *args, **kwargs):
173
+ """Launches a celery task to write many entries."""
174
+ write_many_entry.delay(
175
+ self._writer, [stream_entry.entry for stream_entry in stream_entries]
176
+ )
177
+
178
+ return stream_entries
@@ -7,6 +7,7 @@
7
7
  # modify it under the terms of the MIT License; see LICENSE file for more
8
8
  # details.
9
9
  """Generate Vocabulary Config."""
10
+
10
11
  from copy import deepcopy
11
12
 
12
13
  import yaml
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2020-2021 CERN.
3
+ # Copyright (C) 2020-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -79,9 +79,7 @@ class VocabularyScheme(db.Model):
79
79
  """Create a new vocabulary subtype."""
80
80
  banned = [",", ":"]
81
81
  for b in banned:
82
- assert (
83
- b not in data["id"]
84
- ), f"No '{b}' allowed in VocabularyScheme.id" # noqa
82
+ assert b not in data["id"], f"No '{b}' allowed in VocabularyScheme.id"
85
83
 
86
84
  with db.session.begin_nested():
87
85
  obj = cls(**data)
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2021 CERN.
3
+ # Copyright (C) 2021-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -8,7 +8,6 @@
8
8
 
9
9
  """Persistent identifier provider for vocabularies."""
10
10
 
11
-
12
11
  from invenio_pidstore.models import PIDStatus
13
12
  from invenio_pidstore.providers.base import BaseProvider
14
13
 
@@ -7,6 +7,7 @@
7
7
  # details.
8
8
 
9
9
  """Resources module."""
10
+
10
11
  from invenio_vocabularies.resources.schema import L10NString, VocabularyL10Schema
11
12
 
12
13
  from .config import VocabulariesResourceConfig, VocabularyTypeResourceConfig
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2020-2021 CERN.
3
+ # Copyright (C) 2020-2024 CERN.
4
4
  # Copyright (C) 2021 Northwestern University.
5
5
  #
6
6
  # Invenio-Vocabularies is free software; you can redistribute it and/or
@@ -8,6 +8,7 @@
8
8
  # details.
9
9
 
10
10
  """Vocabulary resource schema."""
11
+
11
12
  from marshmallow import Schema, fields
12
13
 
13
14
  from invenio_vocabularies.resources.serializer import L10NString
@@ -1,12 +1,13 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2024-2024 CERN.
3
+ # Copyright (C) 2024 CERN.
4
4
  #
5
5
  # Invenio-RDM-Records is free software; you can redistribute it and/or modify
6
6
  # it under the terms of the MIT License; see LICENSE file for more details.
7
7
 
8
8
 
9
9
  """Custom fields."""
10
+
10
11
  from invenio_i18n import lazy_gettext as _
11
12
 
12
13
  from ...contrib.subjects.api import Subject
@@ -25,7 +26,7 @@ class SubjectCF(VocabularyCF):
25
26
  vocabulary_id="subjects",
26
27
  schema=SubjectRelationSchema,
27
28
  ui_schema=SubjectRelationSchema,
28
- **kwargs
29
+ **kwargs,
29
30
  )
30
31
  self.pid_field = Subject.pid
31
32
 
@@ -39,7 +39,7 @@ class VocabularyCF(BaseCF):
39
39
  sort_by=None,
40
40
  schema=VocabularyRelationSchema,
41
41
  ui_schema=VocabularyL10NItemSchema,
42
- **kwargs
42
+ **kwargs,
43
43
  ):
44
44
  """Constructor."""
45
45
  super().__init__(name, **kwargs)
@@ -11,7 +11,6 @@ from celery import shared_task
11
11
  from flask import current_app
12
12
 
13
13
  from ..datastreams.factories import DataStreamFactory
14
- from ..factories import get_vocabulary_config
15
14
 
16
15
 
17
16
  @shared_task(ignore_result=True)
@@ -27,32 +26,3 @@ def process_datastream(config):
27
26
  if result.errors:
28
27
  for err in result.errors:
29
28
  current_app.logger.error(err)
30
-
31
-
32
- @shared_task()
33
- def import_funders():
34
- """Import the funders vocabulary.
35
-
36
- Only new records are imported.
37
- Existing records are not updated.
38
- """
39
- vc = get_vocabulary_config("funders")
40
- config = vc.get_config()
41
-
42
- # When importing funders via a Celery task, make sure that we are automatically downloading the ROR file,
43
- # instead of relying on a local file on the file system.
44
- if config["readers"][0]["type"] == "ror-http":
45
- readers_config_with_ror_http = config["readers"]
46
- else:
47
- readers_config_with_ror_http = [{"type": "ror-http"}] + config["readers"]
48
-
49
- ds = DataStreamFactory.create(
50
- readers_config=readers_config_with_ror_http,
51
- transformers_config=config.get("transformers"),
52
- writers_config=config["writers"],
53
- )
54
-
55
- for result in ds.process():
56
- if result.errors:
57
- for err in result.errors:
58
- current_app.logger.exception(err)
@@ -1,5 +1,5 @@
1
1
  {#
2
- Copyright (C) 2024-2024 CERN.
2
+ Copyright (C) 2024 CERN.
3
3
 
4
4
  Invenio RDM Records is free software; you can redistribute it and/or modify
5
5
  it under the terms of the MIT License; see LICENSE file for more details.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: invenio-vocabularies
3
- Version: 4.1.1
3
+ Version: 4.3.0
4
4
  Summary: "Invenio module for managing vocabularies."
5
5
  Home-page: https://github.com/inveniosoftware/invenio-vocabularies
6
6
  Author: CERN
@@ -15,6 +15,7 @@ Requires-Dist: invenio-records-resources <7.0.0,>=6.0.0
15
15
  Requires-Dist: invenio-administration <3.0.0,>=2.0.0
16
16
  Requires-Dist: lxml >=4.5.0
17
17
  Requires-Dist: PyYAML >=5.4.1
18
+ Requires-Dist: regex >=2024.7.24
18
19
  Provides-Extra: elasticsearch7
19
20
  Requires-Dist: invenio-search[elasticsearch7] <3.0.0,>=2.1.0 ; extra == 'elasticsearch7'
20
21
  Provides-Extra: mysql
@@ -25,6 +26,8 @@ Requires-Dist: invenio-search[opensearch1] <3.0.0,>=2.1.0 ; extra == 'opensearch
25
26
  Provides-Extra: opensearch2
26
27
  Requires-Dist: invenio-search[opensearch2] <3.0.0,>=2.1.0 ; extra == 'opensearch2'
27
28
  Provides-Extra: postgresql
29
+ Provides-Extra: s3fs
30
+ Requires-Dist: s3fs >=2024.6.1 ; extra == 's3fs'
28
31
  Provides-Extra: sqlite
29
32
  Provides-Extra: tests
30
33
  Requires-Dist: pytest-black-ng >=0.4.0 ; extra == 'tests'
@@ -78,6 +81,20 @@ https://invenio-vocabularies.readthedocs.io/
78
81
  Changes
79
82
  =======
80
83
 
84
+ Version v4.3.0 (released 2024-08-05)
85
+
86
+ - names: make names_exclude_regex configurable
87
+ - names: validate entry full names
88
+ - names: add orcid public data sync
89
+
90
+ Version v4.2.0 (released 2024-07-24)
91
+
92
+ - ror: check last update; use ld+json for metadata (#367)
93
+ - tasks: remove import funders task
94
+ - funders: add and export custom transformer
95
+ - affiliations: add and export custom transformer
96
+ - datastreams: implement asynchronous writer
97
+
81
98
  Version v4.1.1 (released 2024-07-15)
82
99
 
83
100
  - installation: use invenio-oaipmh-scythe from PyPI