invenio-vocabularies 6.6.0 → 6.8.0 (py2.py3-none-any.whl)
This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of invenio-vocabularies might be problematic.
- invenio_vocabularies/__init__.py +1 -1
- invenio_vocabularies/assets/semantic-ui/js/invenio_vocabularies/src/contrib/forms/Funding/FundingModal.js +3 -27
- invenio_vocabularies/cli.py +2 -0
- invenio_vocabularies/config.py +43 -1
- invenio_vocabularies/contrib/affiliations/config.py +21 -10
- invenio_vocabularies/contrib/affiliations/datastreams.py +103 -1
- invenio_vocabularies/contrib/awards/datastreams.py +7 -0
- invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json +9 -0
- invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json +22 -1
- invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json +22 -1
- invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json +22 -1
- invenio_vocabularies/contrib/awards/schema.py +9 -3
- invenio_vocabularies/contrib/funders/config.py +19 -12
- invenio_vocabularies/contrib/names/config.py +13 -10
- invenio_vocabularies/contrib/names/datastreams.py +182 -57
- invenio_vocabularies/contrib/names/mappings/os-v1/names/name-v2.0.0.json +11 -0
- invenio_vocabularies/contrib/names/mappings/os-v2/names/name-v2.0.0.json +11 -0
- invenio_vocabularies/contrib/names/names.py +1 -1
- invenio_vocabularies/contrib/names/schema.py +10 -2
- invenio_vocabularies/contrib/subjects/bodc/__init__.py +9 -0
- invenio_vocabularies/contrib/subjects/bodc/datastreams.py +111 -0
- invenio_vocabularies/contrib/subjects/config.py +19 -5
- invenio_vocabularies/contrib/subjects/datastreams.py +4 -2
- invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py +56 -126
- invenio_vocabularies/contrib/subjects/gemet/__init__.py +9 -0
- invenio_vocabularies/contrib/subjects/gemet/datastreams.py +140 -0
- invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json +13 -2
- invenio_vocabularies/contrib/subjects/schema.py +18 -3
- invenio_vocabularies/datastreams/datastreams.py +18 -7
- invenio_vocabularies/datastreams/factories.py +3 -1
- invenio_vocabularies/datastreams/readers.py +99 -9
- invenio_vocabularies/datastreams/transformers.py +67 -0
- invenio_vocabularies/datastreams/writers.py +6 -2
- invenio_vocabularies/factories.py +56 -0
- invenio_vocabularies/fixtures.py +2 -0
- invenio_vocabularies/records/jsonschemas/vocabularies/definitions-v1.0.0.json +9 -0
- invenio_vocabularies/services/config.py +1 -7
- invenio_vocabularies/services/querystr.py +5 -0
- invenio_vocabularies/services/tasks.py +2 -0
- {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/METADATA +28 -2
- {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/RECORD +46 -42
- {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/AUTHORS.rst +0 -0
- {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/LICENSE +0 -0
- {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/WHEEL +0 -0
- {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/entry_points.txt +0 -0
- {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/top_level.txt +0 -0
invenio_vocabularies/contrib/names/datastreams.py

@@ -13,12 +13,14 @@ import io
 import tarfile
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import timedelta
+from itertools import islice
+from pathlib import Path
 
 import arrow
 import regex as re
 from flask import current_app
 from invenio_access.permissions import system_identity
-from
+from werkzeug.utils import cached_property
 
 from invenio_vocabularies.contrib.names.s3client import S3OrcidClient
 
@@ -47,10 +49,11 @@ class OrcidDataSyncReader(BaseReader):
         suffix = orcid_to_sync[-3:]
         key = f"{suffix}/{orcid_to_sync}.xml"
         try:
+            # Potential improvement: use a SAX XML parser to avoid loading the whole file in memory
+            # and choose the sections we need to read (probably the summary)
             return self.s3_client.read_file(f"s3://{bucket}/{key}")
-        except Exception
-
-            return None
+        except Exception:
+            current_app.logger.exception("Failed to fetch ORCiD record.")
 
     def _process_lambda_file(self, fileobj):
         """Process the ORCiD lambda file and returns a list of ORCiDs to sync.
@@ -67,42 +70,54 @@ class OrcidDataSyncReader(BaseReader):
         if self.since:
             time_shift = self.since
             last_sync = arrow.now() - timedelta(**time_shift)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            content = io.TextIOWrapper(fileobj, encoding="utf-8")
+            csv_reader = csv.DictReader(content)
+
+            for row in csv_reader:  # Skip the header line
+                orcid = row["orcid"]
+
+                # Lambda file is ordered by last modified date
+                last_modified_str = row["last_modified"]
+                try:
+                    last_modified_date = arrow.get(last_modified_str, date_format)
+                except arrow.parser.ParserError:
+                    last_modified_date = arrow.get(
+                        last_modified_str, date_format_no_millis
+                    )
+
+                if last_modified_date < last_sync:
+                    break
+                yield orcid
+        finally:
+            fileobj.close()
 
     def _iter(self, orcids):
         """Iterates over the ORCiD records yielding each one."""
         with ThreadPoolExecutor(
             max_workers=current_app.config["VOCABULARIES_ORCID_SYNC_MAX_WORKERS"]
         ) as executor:
-            futures
-
+            # futures is a dictionary where the key is the ORCID value and the item is the Future object
+            futures = {
+                orcid: executor.submit(
                     self._fetch_orcid_data,
                     orcid,
                     current_app.config["VOCABULARIES_ORCID_SUMMARIES_BUCKET"],
                 )
                 for orcid in orcids
-
-
-
-
-
+            }
+
+            for orcid in list(futures.keys()):
+                try:
+                    result = futures[orcid].result()
+                    if result:
+                        yield result
+                finally:
+                    # Explicitly release memory, as we don't need the future anymore.
+                    # This is mostly required because as long as we keep a reference to the future
+                    # (in the above futures dict), the garbage collector won't collect it
+                    # and it will keep the memory allocated.
+                    del futures[orcid]
 
     def read(self, item=None, *args, **kwargs):
         """Streams the ORCiD lambda file, processes it to get the ORCiDs to sync, and yields their data."""
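The new `_iter` keeps every submitted `Future` in a dict keyed by ORCiD and deletes each entry right after consuming its result, so completed results don't pile up in memory while the rest of the batch finishes. A minimal standalone sketch of that pattern (the `fetch` function and inputs are made up):

from concurrent.futures import ThreadPoolExecutor

def fetch(item):
    """Hypothetical stand-in for _fetch_orcid_data."""
    return item * 2

with ThreadPoolExecutor(max_workers=4) as executor:
    # Key each future by its input so results can be matched back to it.
    futures = {item: executor.submit(fetch, item) for item in range(10)}
    for item in list(futures.keys()):
        try:
            result = futures[item].result()
            if result:
                print(item, result)
        finally:
            # Drop the reference so the finished Future (and its result)
            # can be garbage-collected instead of staying alive in the dict.
            del futures[item]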
@@ -111,7 +126,6 @@ class OrcidDataSyncReader(BaseReader):
             "s3://orcid-lambda-file/last_modified.csv.tar"
         )
 
-        orcids_to_sync = []
         # Open the tar file and process it
         with tarfile.open(fileobj=io.BytesIO(tar_content)) as tar:
             # Iterate over each member (file or directory) in the tar file
@@ -119,10 +133,24 @@ class OrcidDataSyncReader(BaseReader):
                 # Extract the file
                 extracted_file = tar.extractfile(member)
                 if extracted_file:
+                    current_app.logger.info(f"[ORCID Reader] Processing lambda file...")
                     # Process the file and get the ORCiDs to sync
-                    orcids_to_sync
+                    orcids_to_sync = set(self._process_lambda_file(extracted_file))
+
+                    # Close the file explicitly after processing
+                    extracted_file.close()
+
+                    # Process ORCIDs in smaller batches
+                    for orcid_batch in self._chunked_iter(
+                        orcids_to_sync, batch_size=100
+                    ):
+                        yield from self._iter(orcid_batch)
 
-
+    def _chunked_iter(self, iterable, batch_size):
+        """Yield successive chunks of a given size."""
+        it = iter(iterable)
+        while chunk := list(islice(it, batch_size)):
+            yield chunk
 
 
 class OrcidHTTPReader(SimpleHTTPReader):
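The new `_chunked_iter` helper batches an iterable with `itertools.islice` and an assignment expression (requires Python 3.8+); the same logic works standalone:

from itertools import islice

def chunked_iter(iterable, batch_size):
    """Yield successive lists of at most batch_size items."""
    it = iter(iterable)
    # Each islice pass consumes up to batch_size items; the loop stops
    # once the list comes back empty (iterator exhausted).
    while chunk := list(islice(it, batch_size)):
        yield chunk

print(list(chunked_iter(range(7), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]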
@@ -139,24 +167,75 @@ class OrcidHTTPReader(SimpleHTTPReader):
 
 
 DEFAULT_NAMES_EXCLUDE_REGEX = r"[\p{P}\p{S}\p{Nd}\p{No}\p{Emoji}--,.()\-']"
-"""Regex to filter out names with
+"""Regex to filter out names with punctuation, symbols, numbers and emojis."""
+
+
+class OrcidOrgToAffiliationMapper:
+    """Default ORCiD Org ID to affiliation ID mapper."""
+
+    def __init__(self, org_ids_mapping=None, org_ids_mapping_file=None):
+        """Constructor."""
+        self._org_ids_mapping = org_ids_mapping
+        self._org_ids_mapping_file = org_ids_mapping_file
+
+    @cached_property
+    def org_ids_mapping(self):
+        """Mapping of ORCiD org IDs to affiliation IDs."""
+        org_ids_mapping_file = self._org_ids_mapping_file or current_app.config.get(
+            "VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH"
+        )
+        if org_ids_mapping_file:
+            org_ids_mapping_file = Path(org_ids_mapping_file)
+            # If the path is relative, prepend the instance path
+            if not org_ids_mapping_file.is_absolute():
+                org_ids_mapping_file = (
+                    Path(current_app.instance_path) / org_ids_mapping_file
+                )
+            with open(org_ids_mapping_file) as fin:
+                result = {}
+                reader = csv.reader(fin)
+
+                # Check if the first row is a header
+                org_scheme, org_id, aff_id = next(reader)
+                if org_scheme.lower() != "org_scheme":
+                    result[(org_scheme, org_id)] = aff_id
+
+                for org_scheme, org_id, aff_id in reader:
+                    result[(org_scheme, org_id)] = aff_id
+
+                return result
+
+        return self._org_ids_mapping or {}
+
+    def __call__(self, org_scheme, org_id):
+        """Map an ORCiD org ID to an affiliation ID."""
+        # By default we know that ROR IDs are linkable
+        if org_scheme == "ROR":
+            return org_id.split("/")[-1]
+        # Otherwise see if we have a mapping from other schemes to an affiliation ID
+        return self.org_ids_mapping.get((org_scheme, org_id))
 
 
 class OrcidTransformer(BaseTransformer):
     """Transforms an ORCiD record into a names record."""
 
     def __init__(
-        self,
+        self,
+        *args,
+        names_exclude_regex=DEFAULT_NAMES_EXCLUDE_REGEX,
+        org_id_to_affiliation_id_func=None,
+        **kwargs,
     ) -> None:
         """Constructor."""
         self._names_exclude_regex = names_exclude_regex
+        self._org_id_to_affiliation_id_func = (
+            org_id_to_affiliation_id_func or OrcidOrgToAffiliationMapper()
+        )
         super().__init__()
 
-    def
-        """
-
-            return True
-        return not bool(re.search(self._names_exclude_regex, name, re.UNICODE | re.V1))
+    def org_id_to_affiliation_id(self, org_scheme, org_id):
+        """Convert an ORCiD org ID to a linkable affiliation ID."""
+        return self._org_id_to_affiliation_id_func(org_scheme, org_id)
 
     def apply(self, stream_entry, **kwargs):
         """Applies the transformation to the stream entry."""
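`OrcidOrgToAffiliationMapper` resolves ROR IDs directly (the last URL segment is already a linkable affiliation ID) and falls back to an optional three-column CSV (`org_scheme,org_id,aff_id`, header row optional) for other schemes. A hedged usage sketch; the file path and rows are invented, and the config-backed path (`VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH`) is only consulted when no file is passed explicitly:

# /app/instance/org_ids_mapping.csv (hypothetical contents):
#
#   org_scheme,org_id,aff_id
#   GRID,grid.9132.9,01ggx4157
#   LEI,506700GE1G29325QX363,01ggx4157

mapper = OrcidOrgToAffiliationMapper(
    org_ids_mapping_file="/app/instance/org_ids_mapping.csv"
)

mapper("ROR", "https://ror.org/01ggx4157")  # -> "01ggx4157" (no lookup needed)
mapper("GRID", "grid.9132.9")               # -> "01ggx4157" (via the CSV)
mapper("FUNDREF", "10.13039/100000001")     # -> None (no mapping entry)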
@@ -166,42 +245,88 @@ class OrcidTransformer(BaseTransformer):
 
         name = person.get("name")
         if name is None:
-            raise TransformerError(
+            raise TransformerError("Name not found in ORCiD entry.")
         if name.get("family-name") is None:
-            raise TransformerError(
+            raise TransformerError("Family name not found in ORCiD entry.")
 
         if not self._is_valid_name(name["given-names"] + name["family-name"]):
-            raise TransformerError(
+            raise TransformerError("Invalid characters in name.")
 
         entry = {
             "id": orcid_id,
             "given_name": name.get("given-names"),
             "family_name": name.get("family-name"),
             "identifiers": [{"scheme": "orcid", "identifier": orcid_id}],
-            "affiliations":
+            "affiliations": self._extract_affiliations(record),
         }
 
+        stream_entry.entry = entry
+        return stream_entry
+
+    def _is_valid_name(self, name):
+        """Check whether the name passes the regex."""
+        if not self._names_exclude_regex:
+            return True
+        return not bool(re.search(self._names_exclude_regex, name, re.UNICODE | re.V1))
+
+    def _extract_affiliations(self, record):
+        """Extract affiliations from the ORCiD record."""
+        result = []
         try:
-            employments =
-                record
+            employments = (
+                record.get("activities-summary", {})
+                .get("employments", {})
+                .get("affiliation-group", [])
             )
+
+            # If there are single values, the XML-to-dict conversion doesn't wrap them in a list
             if isinstance(employments, dict):
                 employments = [employments]
-
+
+            # Remove the "employment-summary" nesting
+            employments = [
+                employment.get("employment-summary", {}) for employment in employments
+            ]
+
             for employment in employments:
-                terminated = employment
-
-
-
-
-
-
-
+                terminated = employment.get("end-date")
+                if terminated:
+                    continue
+
+                org = employment["organization"]
+                aff_id = self._extract_affiliation_id(org)
+
+                # Skip adding if the ID already exists in result
+                if aff_id and any(aff.get("id") == aff_id for aff in result):
+                    continue
+
+                # Skip adding if the name exists in result with no ID
+                if any(
+                    aff.get("name") == org["name"] and "id" not in aff for aff in result
+                ):
+                    continue
+
+                aff = {"name": org["name"]}
+                if aff_id:
+                    aff["id"] = aff_id
+
+                result.append(aff)
         except Exception:
             pass
-
-
-
+        return result
+
+    def _extract_affiliation_id(self, org):
+        """Extract the affiliation ID from an ORCiD organization."""
+        dis_org = org.get("disambiguated-organization")
+        if not dis_org:
+            return
+
+        aff_id = None
+        org_id = dis_org.get("disambiguated-organization-identifier")
+        org_scheme = dis_org.get("disambiguation-source")
+        if org_id and org_scheme:
+            aff_id = self.org_id_to_affiliation_id(org_scheme, org_id)
+        return aff_id
 
 
 class NamesServiceWriter(ServiceWriter):
invenio_vocabularies/contrib/names/mappings/os-v1/names/name-v2.0.0.json

@@ -125,6 +125,17 @@
         "type": "text",
         "analyzer": "accent_edge_analyzer",
         "search_analyzer": "accent_analyzer"
+      },
+      "acronym": {
+        "type": "text",
+        "analyzer": "accent_edge_analyzer",
+        "search_analyzer": "accent_analyzer",
+        "fields": {
+          "keyword": {
+            "type": "keyword",
+            "normalizer": "accent_normalizer"
+          }
+        }
       }
     }
   },
invenio_vocabularies/contrib/names/mappings/os-v2/names/name-v2.0.0.json

@@ -125,6 +125,17 @@
         "type": "text",
         "analyzer": "accent_edge_analyzer",
         "search_analyzer": "accent_analyzer"
+      },
+      "acronym": {
+        "type": "text",
+        "analyzer": "accent_edge_analyzer",
+        "search_analyzer": "accent_analyzer",
+        "fields": {
+          "keyword": {
+            "type": "keyword",
+            "normalizer": "accent_normalizer"
+          }
+        }
       }
     }
   },
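Both mappings give `acronym` an edge-n-gram analyzed text field (for type-ahead matching) plus a `keyword` subfield with an accent normalizer (for exact matching). Illustrative query clauses in standard OpenSearch DSL, assuming (as the analyzer names suggest) that `accent_edge_analyzer` builds accent-folded edge n-grams and `accent_normalizer` accent-folds and lowercases:

# Type-ahead: "CER" matches "CERN" through the edge n-grams.
suggest_clause = {"match": {"acronym": {"query": "CER"}}}

# Exact (normalized) match on the keyword subfield.
exact_clause = {"term": {"acronym.keyword": {"value": "cern"}}}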
invenio_vocabularies/contrib/names/schema.py

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2021 CERN.
+# Copyright (C) 2021-2024 CERN.
 #
 # Invenio-Vocabularies is free software; you can redistribute it and/or
 # modify it under the terms of the MIT License; see LICENSE file for more

@@ -16,10 +16,18 @@ from marshmallow_utils.fields import IdentifierSet, SanitizedUnicode
 from marshmallow_utils.schemas import IdentifierSchema
 
 from ...services.schema import BaseVocabularySchema, ModePIDFieldVocabularyMixin
-from ..affiliations.schema import
+from ..affiliations.schema import (
+    AffiliationRelationSchema as BaseAffiliationRelationSchema,
+)
 from .config import names_schemes
 
 
+class AffiliationRelationSchema(BaseAffiliationRelationSchema):
+    """Affiliation relation schema."""
+
+    acronym = SanitizedUnicode(dump_only=True)
+
+
 class NameSchema(BaseVocabularySchema, ModePIDFieldVocabularyMixin):
     """Service schema for names.
 
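The `dump_only=True` field makes `acronym` output-only: it is included when serializing a name's affiliations but cannot be set by clients on load. A standalone marshmallow sketch of the behavior (`DemoSchema` is illustrative, not part of the package):

from marshmallow import Schema
from marshmallow_utils.fields import SanitizedUnicode

class DemoSchema(Schema):
    name = SanitizedUnicode()
    acronym = SanitizedUnicode(dump_only=True)

# Serialization includes the dump-only field...
print(DemoSchema().dump({"name": "European Organization for Nuclear Research", "acronym": "CERN"}))
# {'name': 'European Organization for Nuclear Research', 'acronym': 'CERN'}

# ...but it is not accepted as input when loading.
print(DemoSchema().load({"name": "European Organization for Nuclear Research"}))
# {'name': 'European Organization for Nuclear Research'}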
invenio_vocabularies/contrib/subjects/bodc/datastreams.py (new file)

@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+
+"""BODC subjects datastreams, readers, transformers, and writers."""
+
+from invenio_vocabularies.datastreams.errors import TransformerError
+from invenio_vocabularies.datastreams.readers import RDFReader
+from invenio_vocabularies.datastreams.transformers import RDFTransformer
+
+from ..config import bodc_puv_file_url
+
+# Available with the "rdf" extra
+try:
+    import rdflib
+except ImportError:
+    rdflib = None
+
+
+class BODCPUVSubjectsTransformer(RDFTransformer):
+    """
+    Transformer class to convert BODC-PUV RDF data to a dictionary format.
+
+    Input:
+        - Relevant fields:
+            - `skos:notation`: Primary identifier for the concept.
+            - `skos:prefLabel`: Preferred labels with language codes.
+            - `skos:altLabel`: Alternative labels (optional).
+            - `skos:definition`: Definitions of the concept.
+            - `owl:deprecated`: Boolean flag indicating if the concept is deprecated.
+
+    Output:
+        - A dictionary with the following structure:
+            {
+                "id": "SDN:P01::SAGEMSFM",  # BODC-specific parameter ID (skos:notation).
+                "scheme": "BODC-PUV",  # The scheme name indicating this is a BODC Parameter Usage Vocabulary concept.
+                "subject": "AMSSedAge",  # The alternative label (skos:altLabel), if available, or None.
+                "title": {
+                    "en": "14C age of Foraminiferida"  # English preferred label (skos:prefLabel).
+                },
+                "props": {
+                    "definition": "Accelerated mass spectrometry on picked tests",  # Definition of subject (skos:definition).
+                },
+                "identifiers": [
+                    {
+                        "scheme": "url",  # Type of identifier (URL).
+                        "identifier": "http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM"  # URI of the concept.
+                    }
+                ]
+            }
+    """
+
+    def _get_subject_data(self, rdf_graph, subject):
+        """Fetch all triples for a subject and organize them into a dictionary."""
+        data = {}
+        for predicate, obj in rdf_graph.predicate_objects(subject=subject):
+            predicate_name = str(predicate)
+            if predicate_name not in data:
+                data[predicate_name] = []
+            data[predicate_name].append(obj)
+        return data
+
+    def _transform_entry(self, subject, rdf_graph):
+        """Transform an entry to the required dictionary format."""
+        labels = self._get_labels(subject, rdf_graph)
+        subject_data = self._get_subject_data(rdf_graph, subject)
+        deprecated = subject_data.get(str(rdflib.namespace.OWL.deprecated), [False])
+        if deprecated and str(deprecated[0]).lower() == "true":
+            return None  # Skip deprecated subjects
+
+        notation = subject_data.get(str(self.skos_core.notation), [])
+        if notation:
+            id = str(notation[0])
+        else:
+            raise TransformerError(f"No id found for: {subject}")
+
+        alt_labels = [obj for obj in subject_data.get(str(self.skos_core.altLabel), [])]
+        subject_text = str(alt_labels[0]) if alt_labels else ""
+        definition = str(subject_data.get(str(self.skos_core.definition), [None])[0])
+
+        return {
+            "id": id,
+            "scheme": "BODC-PUV",
+            "subject": subject_text,
+            "title": labels,
+            "props": {"definition": definition} if definition else {},
+            "identifiers": self._get_identifiers(subject),
+        }
+
+
+# Configuration for datastream
+
+VOCABULARIES_DATASTREAM_TRANSFORMERS = {"bodc-transformer": BODCPUVSubjectsTransformer}
+
+DATASTREAM_CONFIG = {
+    "readers": [
+        {
+            "type": "http",
+            "args": {
+                "origin": bodc_puv_file_url,
+            },
+        },
+        {"type": "rdf"},
+    ],
+    "transformers": [{"type": "bodc-transformer"}],
+    "writers": [{"args": {"writer": {"type": "subjects-service"}}, "type": "async"}],
+}
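The transformer's `_get_subject_data` groups every triple of a subject by predicate. A self-contained rdflib sketch of that step on a tiny hand-written SKOS snippet (the real input is the BODC-PUV file fetched from `VOCABULARIES_SUBJECTS_BODC_PUV_FILE_URL` via the `http` and `rdf` readers):

import rdflib

TTL = """
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
<http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM>
    skos:notation "SDN:P01::SAGEMSFM" ;
    skos:prefLabel "14C age of Foraminiferida"@en ;
    skos:altLabel "AMSSedAge" .
"""

g = rdflib.Graph()
g.parse(data=TTL, format="turtle")
subj = rdflib.URIRef("http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM")

# Mirror _get_subject_data: collect all objects per predicate.
data = {}
for predicate, obj in g.predicate_objects(subject=subj):
    data.setdefault(str(predicate), []).append(obj)

print(str(data[str(rdflib.namespace.SKOS.notation)][0]))  # SDN:P01::SAGEMSFM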
invenio_vocabularies/contrib/subjects/config.py

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2021 CERN.
+# Copyright (C) 2021-2024 CERN.
 # Copyright (C) 2021 Northwestern University.
 # Copyright (C) 2024 University of Münster.
 #

@@ -15,10 +15,12 @@ from invenio_i18n import get_locale
 from invenio_i18n import lazy_gettext as _
 from invenio_records_resources.services import SearchOptions
 from invenio_records_resources.services.records.components import DataComponent
+from invenio_records_resources.services.records.queryparser import (
+    CompositeSuggestQueryParser,
+)
 from werkzeug.local import LocalProxy
 
 from ...services.components import PIDComponent
-from ...services.querystr import FilteredSuggestQueryParser
 
 subject_schemes = LocalProxy(
     lambda: current_app.config["VOCABULARIES_SUBJECTS_SCHEMES"]

@@ -26,12 +28,24 @@ subject_schemes = LocalProxy(
 localized_title = LocalProxy(lambda: f"title.{get_locale()}^20")
 
 
+gemet_file_url = LocalProxy(
+    lambda: current_app.config["VOCABULARIES_SUBJECTS_GEMET_FILE_URL"]
+)
+
+euroscivoc_file_url = LocalProxy(
+    lambda: current_app.config["VOCABULARIES_SUBJECTS_EUROSCIVOC_FILE_URL"]
+)
+
+bodc_puv_file_url = LocalProxy(
+    lambda: current_app.config["VOCABULARIES_SUBJECTS_BODC_PUV_FILE_URL"]
+)
+
+
 class SubjectsSearchOptions(SearchOptions):
     """Search options."""
 
-    suggest_parser_cls =
-
-    fields=[ # suggest fields
+    suggest_parser_cls = CompositeSuggestQueryParser.factory(
+        fields=[
            "subject^100",
            localized_title,
            "synonyms^20",
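`CompositeSuggestQueryParser.factory(fields=[...])` pre-binds the boosted field list, so suggest queries weight exact `subject` matches 100x and localized titles and synonyms 20x. Conceptually the parser expands user input into boosted multi-field clauses along these lines (illustrative only; the real parser in invenio-records-resources combines several match strategies):

# Rough shape of a boosted suggest clause for the input "photosyn":
suggest_clause = {
    "multi_match": {
        "query": "photosyn",
        "type": "bool_prefix",
        "fields": ["subject^100", "title.en^20", "synonyms^20"],
    }
}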
invenio_vocabularies/contrib/subjects/datastreams.py

@@ -12,7 +12,9 @@ from invenio_access.permissions import system_identity
 from invenio_i18n import lazy_gettext as _
 
 from ...datastreams.writers import ServiceWriter
+from .bodc import datastreams as bodc_datastreams
 from .euroscivoc import datastreams as euroscivoc_datastreams
+from .gemet import datastreams as gemet_datastreams
 from .mesh import datastreams as mesh_datastreams
 
 

@@ -31,20 +33,20 @@ class SubjectsServiceWriter(ServiceWriter):
 
 VOCABULARIES_DATASTREAM_READERS = {
     **mesh_datastreams.VOCABULARIES_DATASTREAM_READERS,
-    **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_READERS,
 }
 """Subjects Data Streams readers."""
 
 VOCABULARIES_DATASTREAM_TRANSFORMERS = {
     **mesh_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
     **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
+    **gemet_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
+    **bodc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
 }
 """Subjects Data Streams transformers."""
 
 VOCABULARIES_DATASTREAM_WRITERS = {
     "subjects-service": SubjectsServiceWriter,
     **mesh_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
-    **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
 }
 """Subjects Data Streams writers."""
 