invenio-vocabularies 7.3.0-py2.py3-none-any.whl → 7.4.0-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of invenio-vocabularies might be problematic.
- invenio_vocabularies/__init__.py +1 -1
- invenio_vocabularies/administration/views/vocabularies.py +7 -9
- invenio_vocabularies/contrib/names/datastreams.py +36 -8
- invenio_vocabularies/datastreams/datastreams.py +14 -0
- invenio_vocabularies/datastreams/writers.py +6 -0
- invenio_vocabularies/jobs.py +11 -11
- invenio_vocabularies/records/models.py +4 -1
- invenio_vocabularies/services/custom_fields/subject.py +4 -4
- invenio_vocabularies/services/tasks.py +7 -1
- {invenio_vocabularies-7.3.0.dist-info → invenio_vocabularies-7.4.0.dist-info}/METADATA +9 -6
- {invenio_vocabularies-7.3.0.dist-info → invenio_vocabularies-7.4.0.dist-info}/RECORD +16 -52
- {invenio_vocabularies-7.3.0.dist-info → invenio_vocabularies-7.4.0.dist-info}/WHEEL +1 -1
- invenio_vocabularies/translations/af/LC_MESSAGES/messages.mo +0 -0
- invenio_vocabularies/translations/af/LC_MESSAGES/messages.po +0 -139
- invenio_vocabularies/translations/de_AT/LC_MESSAGES/messages.mo +0 -0
- invenio_vocabularies/translations/de_AT/LC_MESSAGES/messages.po +0 -139
- invenio_vocabularies/translations/de_DE/LC_MESSAGES/messages.mo +0 -0
- invenio_vocabularies/translations/de_DE/LC_MESSAGES/messages.po +0 -139
- invenio_vocabularies/translations/en_AT/LC_MESSAGES/messages.mo +0 -0
- invenio_vocabularies/translations/en_AT/LC_MESSAGES/messages.po +0 -139
- invenio_vocabularies/translations/en_HU/LC_MESSAGES/messages.mo +0 -0
- invenio_vocabularies/translations/en_HU/LC_MESSAGES/messages.po +0 -139
- invenio_vocabularies/translations/es_CU/LC_MESSAGES/messages.mo +0 -0
- invenio_vocabularies/translations/es_CU/LC_MESSAGES/messages.po +0 -139
- invenio_vocabularies/translations/es_MX/LC_MESSAGES/messages.mo +0 -0
- invenio_vocabularies/translations/es_MX/LC_MESSAGES/messages.po +0 -139
- invenio_vocabularies/translations/et_EE/LC_MESSAGES/messages.mo +0 -0
- invenio_vocabularies/translations/et_EE/LC_MESSAGES/messages.po +0 -139
- invenio_vocabularies/translations/fa_IR/LC_MESSAGES/messages.mo +0 -0
- invenio_vocabularies/translations/fa_IR/LC_MESSAGES/messages.po +0 -139
- invenio_vocabularies/translations/fr_CI/LC_MESSAGES/messages.mo +0 -0
- invenio_vocabularies/translations/fr_CI/LC_MESSAGES/messages.po +0 -139
- invenio_vocabularies/translations/fr_FR/LC_MESSAGES/messages.mo +0 -0
- invenio_vocabularies/translations/fr_FR/LC_MESSAGES/messages.po +0 -139
- invenio_vocabularies/translations/gl/LC_MESSAGES/messages.mo +0 -0
- invenio_vocabularies/translations/gl/LC_MESSAGES/messages.po +0 -139
- invenio_vocabularies/translations/hi_IN/LC_MESSAGES/messages.mo +0 -0
- invenio_vocabularies/translations/hi_IN/LC_MESSAGES/messages.po +0 -139
- invenio_vocabularies/translations/hu_HU/LC_MESSAGES/messages.mo +0 -0
- invenio_vocabularies/translations/hu_HU/LC_MESSAGES/messages.po +0 -139
- invenio_vocabularies/translations/ne/LC_MESSAGES/messages.mo +0 -0
- invenio_vocabularies/translations/ne/LC_MESSAGES/messages.po +0 -139
- invenio_vocabularies/translations/rw/LC_MESSAGES/messages.mo +0 -0
- invenio_vocabularies/translations/rw/LC_MESSAGES/messages.po +0 -139
- invenio_vocabularies/translations/sv_SE/LC_MESSAGES/messages.mo +0 -0
- invenio_vocabularies/translations/sv_SE/LC_MESSAGES/messages.po +0 -139
- invenio_vocabularies/translations/uk_UA/LC_MESSAGES/messages.mo +0 -0
- invenio_vocabularies/translations/uk_UA/LC_MESSAGES/messages.po +0 -139
- {invenio_vocabularies-7.3.0.dist-info → invenio_vocabularies-7.4.0.dist-info}/entry_points.txt +0 -0
- {invenio_vocabularies-7.3.0.dist-info → invenio_vocabularies-7.4.0.dist-info/licenses}/AUTHORS.rst +0 -0
- {invenio_vocabularies-7.3.0.dist-info → invenio_vocabularies-7.4.0.dist-info/licenses}/LICENSE +0 -0
- {invenio_vocabularies-7.3.0.dist-info → invenio_vocabularies-7.4.0.dist-info}/top_level.txt +0 -0
invenio_vocabularies/administration/views/vocabularies.py
CHANGED

@@ -9,10 +9,8 @@
 
 """Vocabularies admin interface."""
 
-from invenio_administration.views.base import (
-
-    AdminResourceListView,
-)
+from invenio_administration.views.base import AdminResourceListView
+from invenio_i18n import lazy_gettext as _
 
 
 class VocabulariesListView(AdminResourceListView):

@@ -20,11 +18,11 @@ class VocabulariesListView(AdminResourceListView):
 
     api_endpoint = "/vocabularies/"
     name = "vocabulary-types"
-    menu_label = "Vocabulary Types"
+    menu_label = _("Vocabulary Types")
     resource_config = "vocabulary_admin_resource"
     search_request_headers = {"Accept": "application/json"}
-    title = "Vocabulary Types"
-    category = "Site management"
+    title = _("Vocabulary Types")
+    category = _("Site management")
 
     pid_path = "id"
     icon = "exchange"

@@ -36,8 +34,8 @@ class VocabulariesListView(AdminResourceListView):
     display_create = False
 
     item_field_list = {
-        "id": {"text": "Name", "order": 1},
-        "count": {"text": "Number of entries", "order": 2},
+        "id": {"text": _("Name"), "order": 1},
+        "count": {"text": _("Number of entries"), "order": 2},
     }
 
     search_config_name = "VOCABULARIES_TYPES_SEARCH"
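The same i18n fix repeats across this release: user-facing strings assigned to class attributes are wrapped in lazy_gettext. A minimal sketch of why the lazy variant matters here (the class is illustrative, not part of the package):

from invenio_i18n import lazy_gettext as _


class ExampleListView:
    # A class body executes once, at import time, before any request locale
    # exists. Eager gettext would translate immediately (or fail outside an
    # application context) and freeze a single locale for all users.
    # lazy_gettext instead returns a string-like proxy that is translated
    # each time the attribute is rendered, so every request sees its own locale.
    title = _("Vocabulary Types")
    menu_label = _("Vocabulary Types")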
invenio_vocabularies/contrib/names/datastreams.py
CHANGED

@@ -12,6 +12,7 @@ import csv
 import io
 import tarfile
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from contextvars import copy_context
 from datetime import timedelta
 from itertools import islice
 from pathlib import Path

@@ -43,17 +44,18 @@ class OrcidDataSyncReader(BaseReader):
         self.s3_client = S3OrcidClient()
         self.since = since
 
-    def _fetch_orcid_data(self, orcid_to_sync, bucket):
+    def _fetch_orcid_data(self, app, orcid_to_sync, bucket):
         """Fetches a single ORCiD record from S3."""
         # The ORCiD file key is located in a folder which name corresponds to the last three digits of the ORCiD
         suffix = orcid_to_sync[-3:]
         key = f"{suffix}/{orcid_to_sync}.xml"
+        app.logger.debug(f"Fetching ORCiD record: {key} from bucket: {bucket}")
         try:
             # Potential improvement: use the a XML jax parser to avoid loading the whole file in memory
             # and choose the sections we need to read (probably the summary)
             return self.s3_client.read_file(f"s3://{bucket}/{key}")
         except Exception:
-            pass
+            app.logger.exception(f"Failed to fetch ORCiD record: {key}")
 
     def _process_lambda_file(self, fileobj):
         """Process the ORCiD lambda file and returns a list of ORCiDs to sync.

@@ -87,7 +89,11 @@ class OrcidDataSyncReader(BaseReader):
                 )
 
                 if last_modified_date < last_sync:
+                    current_app.logger.debug(
+                        f"Skipping ORCiD {orcid} (last modified: {last_modified_date})"
+                    )
                     break
+                current_app.logger.debug(f"Yielding ORCiD {orcid} for sync.")
                 yield orcid
         finally:
             fileobj.close()

@@ -97,10 +103,15 @@ class OrcidDataSyncReader(BaseReader):
         with ThreadPoolExecutor(
             max_workers=current_app.config["VOCABULARIES_ORCID_SYNC_MAX_WORKERS"]
         ) as executor:
+            app = current_app._get_current_object()
             # futures is a dictionary where the key is the ORCID value and the item is the Future object
+            # Flask does not propagate app/request context to new threads, so `copy_context().run`
+            # ensures the current instantianted contextvars (such as job_context) is preserved in each thread.
             futures = {
                 orcid: executor.submit(
+                    copy_context().run,  # Required to pass the context to the thread
                     self._fetch_orcid_data,
+                    app,  # Pass the Flask app to the thread
                     orcid,
                     current_app.config["VOCABULARIES_ORCID_SUMMARIES_BUCKET"],
                 )

@@ -111,7 +122,14 @@ class OrcidDataSyncReader(BaseReader):
                 try:
                     result = futures[orcid].result()
                     if result:
+                        current_app.logger.debug(
+                            f"Successfully fetched ORCiD record: {orcid}"
+                        )
                         yield result
+                except Exception:
+                    current_app.logger.exception(
+                        f"Error processing ORCiD record: {orcid}"
+                    )
                 finally:
                     # Explicitly release memory, as we don't need the future anymore.
                     # This is mostly required because as long as we keep a reference to the future
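The copy_context().run wrapper above is the substantive change: ThreadPoolExecutor workers start with an empty contextvars context, so values the job machinery stored in context variables are invisible inside the pool unless a snapshot is carried across. A self-contained sketch of the mechanism; the job_id variable is illustrative, not part of the package:

from concurrent.futures import ThreadPoolExecutor
from contextvars import ContextVar, copy_context

job_id: ContextVar[str] = ContextVar("job_id", default="<unset>")


def fetch(record_id):
    # Runs in a worker thread; sees job_id only if a context snapshot was used.
    return f"job={job_id.get()} record={record_id}"


job_id.set("sync-2025-04")
with ThreadPoolExecutor(max_workers=2) as executor:
    plain = executor.submit(fetch, "0000-0002-1825-0097")
    # copy_context() snapshots the submitting thread's context *now*;
    # Context.run then executes fetch inside that snapshot in the worker.
    copied = executor.submit(copy_context().run, fetch, "0000-0002-1825-0097")
    print(plain.result())   # job=<unset> record=...   (not propagated)
    print(copied.result())  # job=sync-2025-04 record=...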
@@ -125,7 +143,7 @@ class OrcidDataSyncReader(BaseReader):
         tar_content = self.s3_client.read_file(
             "s3://orcid-lambda-file/last_modified.csv.tar"
         )
-
+        current_app.logger.info("Fetching ORCiD lambda file")
         # Opens tar file and process it
         with tarfile.open(fileobj=io.BytesIO(tar_content)) as tar:
             # Iterate over each member (file or directory) in the tar file

@@ -133,7 +151,7 @@ class OrcidDataSyncReader(BaseReader):
                 # Extract the file
                 extracted_file = tar.extractfile(member)
                 if extracted_file:
-                    current_app.logger.info(f"
+                    current_app.logger.info(f"Processing lambda file: {member.name}")
                     # Process the file and get the ORCiDs to sync
                     orcids_to_sync = set(self._process_lambda_file(extracted_file))
 

@@ -150,6 +168,7 @@ class OrcidDataSyncReader(BaseReader):
         """Yield successive chunks of a given size."""
         it = iter(iterable)
         while chunk := list(islice(it, batch_size)):
+            current_app.logger.debug(f"Processing batch of size {len(chunk)}.")
            yield chunk
 
 
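The chunking helper in the last hunk leans on a compact idiom: islice pulls at most batch_size items per pass, and the walrus-operator loop stops on the first empty chunk. A standalone version:

from itertools import islice


def chunks(iterable, batch_size):
    """Yield successive lists of at most batch_size items."""
    it = iter(iterable)
    while chunk := list(islice(it, batch_size)):
        yield chunk


print(list(chunks(range(7), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]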
@@ -239,18 +258,25 @@ class OrcidTransformer(BaseTransformer):
 
     def apply(self, stream_entry, **kwargs):
         """Applies the transformation to the stream entry."""
+        current_app.logger.debug("Applying transformation to stream entry.")
         record = stream_entry.entry
         person = record["person"]
         orcid_id = record["orcid-identifier"]["path"]
 
         name = person.get("name")
         if name is None:
-            raise TransformerError(
+            raise TransformerError(
+                f"Name not found in ORCiD entry for ORCiD ID: {orcid_id}."
+            )
         if name.get("family-name") is None:
-            raise TransformerError(
+            raise TransformerError(
+                f"Family name not found in ORCiD entry for ORCiD ID: {orcid_id}."
+            )
 
         if not self._is_valid_name(name["given-names"] + name["family-name"]):
-            raise TransformerError(
+            raise TransformerError(
+                f"Invalid characters in name for ORCiD ID: {orcid_id}."
+            )
 
         entry = {
             "id": orcid_id,

@@ -261,6 +287,7 @@ class OrcidTransformer(BaseTransformer):
         }
 
         stream_entry.entry = entry
+        current_app.logger.debug(f"Transformed entry: {entry}")
         return stream_entry
 
     def _is_valid_name(self, name):

@@ -271,6 +298,7 @@ class OrcidTransformer(BaseTransformer):
 
     def _extract_affiliations(self, record):
         """Extract affiliations from the ORCiD record."""
+        current_app.logger.debug("Extracting affiliations from ORCiD record.")
         result = []
         try:
             employments = (

@@ -312,7 +340,7 @@ class OrcidTransformer(BaseTransformer):
 
             result.append(aff)
         except Exception:
-            pass
+            current_app.logger.error("Error extracting affiliations.")
         return result
 
     def _extract_affiliation_id(self, org):
invenio_vocabularies/datastreams/datastreams.py
CHANGED

@@ -72,13 +72,18 @@ class DataStream:
 
     def filter(self, stream_entry, *args, **kwargs):
         """Checks if an stream_entry should be filtered out (skipped)."""
+        current_app.logger.debug(f"Filtering entry: {stream_entry.entry}")
         return False
 
     def process_batch(self, batch):
         """Process a batch of entries."""
+        current_app.logger.info(f"Processing batch of size: {len(batch)}")
         transformed_entries = []
         for stream_entry in batch:
             if stream_entry.errors:
+                current_app.logger.warning(
+                    f"Skipping entry with errors: {stream_entry.errors}"
+                )
                 yield stream_entry  # reading errors
             else:
                 transformed_entry = self.transform(stream_entry)

@@ -103,19 +108,23 @@ class DataStream:
         the reader, apply the transformations and yield the result of
         writing it.
         """
+        current_app.logger.info("Starting data stream processing")
         batch = []
         for stream_entry in self.read():
             batch.append(stream_entry)
             if len(batch) >= self.batch_size:
+                current_app.logger.debug(f"Processing batch of size: {len(batch)}")
                 yield from self.process_batch(batch)
                 batch = []
 
         # Process any remaining entries in the last batch
         if batch:
+            current_app.logger.debug(f"Processing final batch of size: {len(batch)}")
             yield from self.process_batch(batch)
 
     def read(self):
         """Recursively read the entries."""
+        current_app.logger.debug("Reading entries from readers")
 
         def pipe_gen(gen_funcs, piped_item=None):
             _gen_funcs = list(gen_funcs)  # copy to avoid modifying ref list

@@ -130,6 +139,7 @@ class DataStream:
                 else:
                     yield StreamEntry(item)
             except ReaderError as err:
+                current_app.logger.error(f"Reader error: {str(err)}")
                 yield StreamEntry(
                     entry=item,
                     errors=[f"{current_gen_func.__qualname__}: {str(err)}"],

@@ -140,6 +150,7 @@ class DataStream:
 
     def transform(self, stream_entry, *args, **kwargs):
         """Apply the transformations to an stream_entry."""
+        current_app.logger.debug(f"Transforming entry: {stream_entry.entry}")
         for transformer in self._transformers:
             try:
                 stream_entry = transformer.apply(stream_entry)

@@ -153,16 +164,19 @@ class DataStream:
 
     def write(self, stream_entry, *args, **kwargs):
         """Apply the transformations to an stream_entry."""
+        current_app.logger.debug(f"Writing entry: {stream_entry.entry}")
         for writer in self._writers:
             try:
                 writer.write(stream_entry)
             except WriterError as err:
+                current_app.logger.error(f"Writer error: {str(err)}")
                 stream_entry.errors.append(f"{writer.__class__.__name__}: {str(err)}")
 
         return stream_entry
 
     def batch_write(self, stream_entries, *args, **kwargs):
         """Apply the transformations to an stream_entry. Errors are handler in the service layer."""
+        current_app.logger.debug(f"Batch writing entries: {len(stream_entries)}")
         for writer in self._writers:
             yield from writer.write_many(stream_entries)
 
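All of the datastreams.py changes are logging, but they trace the batching that process() already performs: accumulate entries, flush a full batch, then flush the final partial one. The control flow in miniature, detached from the class:

def process(read, process_batch, batch_size):
    batch = []
    for stream_entry in read():
        batch.append(stream_entry)
        if len(batch) >= batch_size:
            yield from process_batch(batch)  # flush a full batch
            batch = []
    if batch:
        yield from process_batch(batch)  # flush the last, possibly short, batch


# Batches of 2 over five items: [1, 2], [3, 4], [5]
for out in process(lambda: iter([1, 2, 3, 4, 5]), lambda b: iter([list(b)]), 2):
    print(out)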
invenio_vocabularies/datastreams/writers.py
CHANGED

@@ -87,17 +87,21 @@ class ServiceWriter(BaseWriter):
 
     def _do_update(self, entry):
         vocab_id = self._entry_id(entry)
+        current_app.logger.debug(f"Resolving entry with ID: {vocab_id}")
         current = self._resolve(vocab_id)
         updated = dict(current.to_dict(), **entry)
+        current_app.logger.debug(f"Updating entry with ID: {vocab_id}")
         return StreamEntry(self._service.update(self._identity, vocab_id, updated))
 
     def write(self, stream_entry, *args, **kwargs):
         """Writes the input entry using a given service."""
         entry = stream_entry.entry
+        current_app.logger.debug(f"Writing entry: {entry}")
 
         try:
             if self._insert:
                 try:
+                    current_app.logger.debug("Inserting entry.")
                     return StreamEntry(self._service.create(self._identity, entry))
                 except PIDAlreadyExists:
                     if not self._update:

@@ -105,6 +109,7 @@ class ServiceWriter(BaseWriter):
                     return self._do_update(entry)
             elif self._update:
                 try:
+                    current_app.logger.debug("Attempting to update entry.")
                     return self._do_update(entry)
                 except (NoResultFound, PIDDoesNotExistError):
                     raise WriterError([f"Vocabulary entry does not exist: {entry}"])

@@ -139,6 +144,7 @@ class ServiceWriter(BaseWriter):
             processed_stream_entry.log_errors()
             stream_entries_processed.append(processed_stream_entry)
 
+        current_app.logger.debug(f"Finished writing {len(stream_entries)} entries")
         return stream_entries_processed
 
 
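Context for the new debug lines: ServiceWriter.write implements an insert-then-update flow around the vocabulary service. Reduced to its control structure it looks roughly like the sketch below; the exception classes are local stand-ins for the invenio_pidstore/SQLAlchemy ones, and the already-exists branch is inferred, since the diff elides that line:

class PIDAlreadyExists(Exception): pass
class PIDDoesNotExistError(Exception): pass
class NoResultFound(Exception): pass
class WriterError(Exception): pass


def upsert(service, identity, entry, do_update, insert=True, update=True):
    if insert:
        try:
            # Happy path: create a new record.
            return service.create(identity, entry)
        except PIDAlreadyExists:
            if not update:
                raise WriterError([f"Vocabulary entry already exists: {entry}"])
            return do_update(entry)  # PID collision: fall back to update
    elif update:
        try:
            return do_update(entry)
        except (NoResultFound, PIDDoesNotExistError):
            raise WriterError([f"Vocabulary entry does not exist: {entry}"])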
invenio_vocabularies/jobs.py
CHANGED

@@ -10,7 +10,7 @@
 
 import datetime
 
-from invenio_i18n import
+from invenio_i18n import lazy_gettext as _
 from invenio_jobs.jobs import JobType
 
 from invenio_vocabularies.services.tasks import process_datastream

@@ -27,8 +27,8 @@ class ProcessDataStreamJob(JobType):
 class ProcessRORAffiliationsJob(ProcessDataStreamJob):
     """Process ROR affiliations datastream registered task."""
 
-    description = "Process ROR affiliations"
-    title = "Load ROR affiliations"
+    description = _("Process ROR affiliations")
+    title = _("Load ROR affiliations")
     id = "process_ror_affiliations"
 
     @classmethod

@@ -65,8 +65,8 @@ class ProcessRORAffiliationsJob(ProcessDataStreamJob):
 class ProcessRORFundersJob(ProcessDataStreamJob):
     """Process ROR funders datastream registered task."""
 
-    description = "Process ROR funders"
-    title = "Load ROR funders"
+    description = _("Process ROR funders")
+    title = _("Load ROR funders")
     id = "process_ror_funders"
 
     @classmethod

@@ -103,8 +103,8 @@ class ProcessRORFundersJob(ProcessDataStreamJob):
 class ImportAwardsOpenAIREJob(ProcessDataStreamJob):
     """Import awards from OpenAIRE registered task."""
 
-    description = "Import awards from OpenAIRE"
-    title = "Import Awards OpenAIRE"
+    description = _("Import awards from OpenAIRE")
+    title = _("Import Awards OpenAIRE")
     id = "import_awards_openaire"
 
     @classmethod

@@ -138,8 +138,8 @@ class ImportAwardsOpenAIREJob(ProcessDataStreamJob):
 class UpdateAwardsCordisJob(ProcessDataStreamJob):
     """Update awards from CORDIS registered task."""
 
-    description = "Update awards from CORDIS"
-    title = "Update Awards CORDIS"
+    description = _("Update awards from CORDIS")
+    title = _("Update Awards CORDIS")
     id = "update_awards_cordis"
 
     @classmethod

@@ -166,8 +166,8 @@ class UpdateAwardsCordisJob(ProcessDataStreamJob):
 class ImportORCIDJob(ProcessDataStreamJob):
     """Import ORCID data registered task."""
 
-    description = "Import ORCID data"
-    title = "Import ORCID data"
+    description = _("Import ORCID data")
+    title = _("Import ORCID data")
     id = "import_orcid"
 
     @classmethod
invenio_vocabularies/records/models.py
CHANGED

@@ -9,6 +9,7 @@
 """Vocabulary models."""
 
 from invenio_db import db
+from invenio_i18n import gettext as _
 from invenio_records.models import RecordMetadataBase
 
 

@@ -79,7 +80,9 @@ class VocabularyScheme(db.Model):
         """Create a new vocabulary subtype."""
         banned = [",", ":"]
         for b in banned:
-            assert b not in data["id"],
+            assert b not in data["id"], _(
+                "No '%(banned_char)s' allowed in VocabularyScheme.id", banned_char=b
+            )
 
         with db.session.begin_nested():
             obj = cls(**data)
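The new assertion message uses printf-style named placeholders rather than an f-string. That is deliberate for i18n: the placeholder name travels with the extracted string, so translators can reposition it, and gettext interpolates the keyword arguments after translation (the call signature _("...", banned_char=b) is visible in the hunk above). The underlying mechanism, runnable without an application context:

template = "No '%(banned_char)s' allowed in VocabularyScheme.id"
print(template % {"banned_char": ","})
# -> No ',' allowed in VocabularyScheme.id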
invenio_vocabularies/services/custom_fields/subject.py
CHANGED

@@ -54,12 +54,12 @@ SUBJECT_FIELDS_UI = [
         ui_widget="SubjectAutocompleteDropdown",
         isGenericVocabulary=False,
         props=dict(
-            label="Keywords and subjects",
+            label=_("Keywords and subjects"),
             icon="tag",
-            description="The subjects related to the community",
-            placeholder="Search for a subject by name e.g. Psychology ...",
+            description=_("The subjects related to the community"),
+            placeholder=_("Search for a subject by name e.g. Psychology ..."),
             autocompleteFrom="api/subjects",
-            noQueryMessage="Search for subjects...",
+            noQueryMessage=_("Search for subjects..."),
             autocompleteFromAcceptHeader="application/vnd.inveniordm.v1+json",
             required=False,
             multiple=True,
invenio_vocabularies/services/tasks.py
CHANGED

@@ -9,6 +9,7 @@
 
 from celery import shared_task
 from flask import current_app
+from invenio_jobs.errors import TaskExecutionError
 
 from ..datastreams.factories import DataStreamFactory
 

@@ -23,8 +24,13 @@ def process_datastream(config):
         batch_size=config.get("batch_size", 1000),
         write_many=config.get("write_many", False),
     )
-
+    entries_with_errors = 0
     for result in ds.process():
         if result.errors:
             for err in result.errors:
                 current_app.logger.error(err)
+            entries_with_errors += 1
+    if entries_with_errors:
+        raise TaskExecutionError(
+            message=f"Task execution succeeded with {entries_with_errors} entries with errors."
+        )
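This change alters the task's failure semantics: instead of only logging per-entry errors, process_datastream now tallies failed entries and raises after the stream is fully drained, so the surrounding job is marked as failed without letting one bad entry abort the rest. The pattern in isolation, with a stand-in for invenio_jobs' TaskExecutionError:

from types import SimpleNamespace


class TaskExecutionError(Exception):
    """Stand-in for invenio_jobs.errors.TaskExecutionError."""


def drain(results, log):
    entries_with_errors = 0
    for result in results:
        if result.errors:
            for err in result.errors:
                log(err)  # keep per-entry diagnostics
            entries_with_errors += 1
    if entries_with_errors:
        # Raised only after every entry was processed.
        raise TaskExecutionError(
            f"Task execution succeeded with {entries_with_errors} entries with errors."
        )


drain([SimpleNamespace(errors=[]), SimpleNamespace(errors=["bad entry"])], log=print)
# prints "bad entry", then raises TaskExecutionError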
{invenio_vocabularies-7.3.0.dist-info → invenio_vocabularies-7.4.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: invenio-vocabularies
-Version: 7.3.0
+Version: 7.4.0
 Summary: Invenio module for managing vocabularies.
 Home-page: https://github.com/inveniosoftware/invenio-vocabularies
 Author: CERN

@@ -45,9 +45,10 @@ Requires-Dist: invenio-search[opensearch2]<4.0.0,>=3.0.0; extra == "opensearch2"
 Provides-Extra: mysql
 Provides-Extra: postgresql
 Provides-Extra: sqlite
+Dynamic: license-file
 
 ..
-    Copyright (C) 2020-
+    Copyright (C) 2020-2025 CERN.
 
     Invenio-Vocabularies is free software; you can redistribute it and/or
     modify it under the terms of the MIT License; see LICENSE file for more

@@ -77,9 +78,6 @@ Invenio module for managing vocabularies, based on Invenio-Records and Invenio-R
 - Factories for easily generating models, record API classes, services, and resources
 - Helpers for importing vocabularies
 
-Further documentation is available on
-https://invenio-vocabularies.readthedocs.io/
-
 ..
     Copyright (C) 2020-2024 CERN.
     Copyright (C) 2024 Graz University of Technology.

@@ -91,6 +89,11 @@ https://invenio-vocabularies.readthedocs.io/
 Changes
 =======
 
+Version v7.4.0 (released 2025-04-28)
+
+- i18n: Fix untranslated strings in vocabularies
+- logging: add basic logging for ORCID
+
 Version v7.3.0 (released 2025-03-18)
 
 - form: funding: use FeedbackLabel and add error styling