PyPI - esgvoc - Versions diffs - 1.0.1__py3-none-any.whl → 1.1.1__py3-none-any.whl - Mend

esgvoc 1.0.1__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of esgvoc might be problematic. Click here for more details.

Files changed (41)

esgvoc/__init__.py +1 -1
esgvoc/api/__init__.py +0 -6
esgvoc/api/data_descriptors/__init__.py +6 -0
esgvoc/api/data_descriptors/archive.py +5 -0
esgvoc/api/data_descriptors/citation_url.py +5 -0
esgvoc/api/data_descriptors/experiment.py +2 -2
esgvoc/api/data_descriptors/known_branded_variable.py +58 -5
esgvoc/api/data_descriptors/regex.py +5 -0
esgvoc/api/data_descriptors/vertical_label.py +2 -2
esgvoc/api/project_specs.py +48 -130
esgvoc/api/projects.py +104 -63
esgvoc/apps/drs/generator.py +47 -42
esgvoc/apps/drs/validator.py +22 -38
esgvoc/apps/jsg/json_schema_generator.py +252 -136
esgvoc/apps/jsg/templates/template.jinja +249 -0
esgvoc/apps/test_cv/README.md +214 -0
esgvoc/apps/test_cv/cv_tester.py +1368 -0
esgvoc/apps/test_cv/example_usage.py +216 -0
esgvoc/apps/vr/__init__.py +12 -0
esgvoc/apps/vr/build_variable_registry.py +71 -0
esgvoc/apps/vr/example_usage.py +60 -0
esgvoc/apps/vr/vr_app.py +333 -0
esgvoc/cli/config.py +671 -86
esgvoc/cli/drs.py +39 -21
esgvoc/cli/main.py +2 -0
esgvoc/cli/test_cv.py +257 -0
esgvoc/core/constants.py +10 -7
esgvoc/core/data_handler.py +24 -22
esgvoc/core/db/connection.py +7 -0
esgvoc/core/db/project_ingestion.py +34 -9
esgvoc/core/db/universe_ingestion.py +1 -2
esgvoc/core/service/configuration/setting.py +192 -21
esgvoc/core/service/data_merger.py +1 -1
esgvoc/core/service/state.py +18 -2
{esgvoc-1.0.1.dist-info → esgvoc-1.1.1.dist-info}/METADATA +2 -1
{esgvoc-1.0.1.dist-info → esgvoc-1.1.1.dist-info}/RECORD +40 -29
esgvoc/apps/jsg/cmip6_template.json +0 -74
/esgvoc/apps/{py.typed → test_cv/__init__.py} +0 -0
{esgvoc-1.0.1.dist-info → esgvoc-1.1.1.dist-info}/WHEEL +0 -0
{esgvoc-1.0.1.dist-info → esgvoc-1.1.1.dist-info}/entry_points.txt +0 -0
{esgvoc-1.0.1.dist-info → esgvoc-1.1.1.dist-info}/licenses/LICENSE.txt +0 -0

esgvoc/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 import esgvoc.core.logging_handler  # noqa
-__version__ = "1.0.1"
+__version__ = "1.1.1"

esgvoc/api/__init__.py CHANGED Viewed

@@ -1,8 +1,5 @@
 from esgvoc.api.project_specs import (
-                                      DrsCollection,
-                                      DrsConstant,
                                       DrsPart,
-                                      DrsPartKind,
                                       DrsSpecification,
                                       DrsType,
                                       ProjectSpecs,
@@ -51,10 +48,7 @@ from esgvoc.api.universe import (
 )
 __all__ = [
-    "DrsCollection",
-    "DrsConstant",
     "DrsPart",
-    "DrsPartKind",
     "DrsSpecification",
     "DrsType",
     "find_collections_in_project",

esgvoc/api/data_descriptors/__init__.py CHANGED Viewed

@@ -1,7 +1,9 @@
 from esgvoc.api.data_descriptors.activity import Activity
+from esgvoc.api.data_descriptors.archive import Archive
 from esgvoc.api.data_descriptors.area_label import AreaLabel
 from esgvoc.api.data_descriptors.branded_suffix import BrandedSuffix
 from esgvoc.api.data_descriptors.branded_variable import BrandedVariable
+from esgvoc.api.data_descriptors.citation_url import CitationUrl
 from esgvoc.api.data_descriptors.consortium import Consortium
 from esgvoc.api.data_descriptors.contact import Contact
 from esgvoc.api.data_descriptors.conventions import Convention
@@ -30,6 +32,7 @@ from esgvoc.api.data_descriptors.product import Product
 from esgvoc.api.data_descriptors.publication_status import PublicationStatus
 from esgvoc.api.data_descriptors.realisation_index import RealisationIndex
 from esgvoc.api.data_descriptors.realm import Realm
+from esgvoc.api.data_descriptors.regex import Regex
 from esgvoc.api.data_descriptors.region import Region
 from esgvoc.api.data_descriptors.resolution import Resolution
 from esgvoc.api.data_descriptors.source import Source
@@ -90,4 +93,7 @@ DATA_DESCRIPTOR_CLASS_MAPPING: dict[str, type[DataDescriptor]] = {
     "region": Region,
     "member_id": MemberId,
     "obs_type": ObsType,  # obs4Mips
+    "regex": Regex,
+    "citation_url": CitationUrl,
+    "archive": Archive,
 }

esgvoc/api/data_descriptors/archive.py ADDED Viewed

@@ -0,0 +1,5 @@
+from esgvoc.api.data_descriptors.data_descriptor import PlainTermDataDescriptor
+class Archive(PlainTermDataDescriptor):
+    pass

esgvoc/api/data_descriptors/citation_url.py ADDED Viewed

@@ -0,0 +1,5 @@
+from esgvoc.api.data_descriptors.data_descriptor import PatternTermDataDescriptor
+class CitationUrl(PatternTermDataDescriptor):
+    pass

esgvoc/api/data_descriptors/experiment.py CHANGED Viewed

@@ -20,8 +20,8 @@ class Experiment(PlainTermDataDescriptor):
     experiment: str
     required_model_components: list[str] | None
     additional_allowed_model_components: list[str] = Field(default_factory=list)
-    start_year: int | None
-    end_year: int | None
+    start_year: str | int | None
+    end_year: str | int | None
     min_number_yrs_per_sim: int | None
     parent_activity_id: list[str] | None
     parent_experiment_id: list[str] | None

esgvoc/api/data_descriptors/known_branded_variable.py CHANGED Viewed

@@ -1,7 +1,30 @@
+from typing import Any, Dict, List, Optional
 from pydantic import Field
 from esgvoc.api.data_descriptors.data_descriptor import PlainTermDataDescriptor
+#
+# class KnownBrandedVariable(PlainTermDataDescriptor):
+#     """
+#     A climate-related quantity or measurement, including information about sampling.
+#
+#     The concept of a branded variable was introduced in CMIP7.
+#     A branded variable is composed of two parts.
+#     The first part is the root variable (see :py:class:`Variable`).
+#     The second is the suffix (see :py:class:`BrandedSuffix`).
+#
+#     For further details on the development of branded variables,
+#     see [this paper draft](https://docs.google.com/document/d/19jzecgymgiiEsTDzaaqeLP6pTvLT-NzCMaq-wu-QoOc/edit?pli=1&tab=t.0).
+#     """
+#
+#     description: str
+#     dimensions: list[str] = Field(default_factory=list)
+#     cell_methods: str
+#     variable: str
+#     label: str
+#
 class KnownBrandedVariable(PlainTermDataDescriptor):
     """
@@ -16,8 +39,38 @@ class KnownBrandedVariable(PlainTermDataDescriptor):
     see [this paper draft](https://docs.google.com/document/d/19jzecgymgiiEsTDzaaqeLP6pTvLT-NzCMaq-wu-QoOc/edit?pli=1&tab=t.0).
     """
-    description: str
-    dimensions: list[str] = Field(default_factory=list)
-    cell_methods: str
-    variable: str
-    label: str
+    # # ESGVoc required fields
+    # id: str = Field(description="Unique identifier, e.g., 'ta_tavg-p19-hxy-air'")
+    # type: str = Field(default="branded_variable", description="ESGVoc type identifier")
+    # drs_name: str = Field(description="DRS name, same as id")
+    # => already in PlainTermDataDescriptor
+    # CF Standard Name context (flattened from hierarchy)
+    cf_standard_name: str = Field(description="CF standard name, e.g., 'air_temperature'")
+    cf_units: str = Field(description="CF standard units, e.g., 'K'")
+    cf_sn_status: str = Field(description="CF standard name status, e.g., 'approved'")
+    # Variable Root context (flattened from hierarchy)
+    variable_root_name: str = Field(description="Variable root name, e.g., 'ta'")
+    var_def_qualifier: str = Field(default="", description="Variable definition qualifier")
+    branding_suffix_name: str = Field(description="Branding suffix, e.g., 'tavg-p19-hxy-air'")
+    # Variable metadata
+    description: str = Field(description="Human-readable description")
+    dimensions: List[str] = Field(description="NetCDF dimensions")
+    cell_methods: str = Field(default="", description="CF cell_methods attribute")
+    cell_measures: str = Field(default="", description="CF cell_measures attribute")
+    history: str = Field(default="", description="Processing history")
+    realm: str = Field(description="Earth system realm, e.g., 'atmos'")
+    # Label components (embedded, not references)
+    temporal_label: str = Field(description="Temporal label, e.g., 'tavg'")
+    vertical_label: str = Field(description="Vertical label, e.g., 'p19'")
+    horizontal_label: str = Field(description="Horizontal label, e.g., 'hxy'")
+    area_label: str = Field(description="Area label, e.g., 'air'")
+    # Status
+    bn_status: str = Field(description="Branded variable status, e.g., 'accepted'")
+    # Additional required fields from specifications
+    positive_direction: str = Field(default="", description="Positive direction for the variable")

esgvoc/api/data_descriptors/regex.py ADDED Viewed

@@ -0,0 +1,5 @@
+from esgvoc.api.data_descriptors.data_descriptor import PatternTermDataDescriptor
+class Regex(PatternTermDataDescriptor):
+    pass

esgvoc/api/data_descriptors/vertical_label.py CHANGED Viewed

@@ -1,7 +1,7 @@
-from esgvoc.api.data_descriptors.data_descriptor import PatternTermDataDescriptor
+from esgvoc.api.data_descriptors.data_descriptor import PlainTermDataDescriptor
-class VerticalLabel(PatternTermDataDescriptor):
+class VerticalLabel(PlainTermDataDescriptor):
     """
     Vertical label.

esgvoc/api/project_specs.py CHANGED Viewed

@@ -1,7 +1,6 @@
 from enum import Enum
-from typing import Annotated, Any, Literal, Optional, Protocol
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict
 class DrsType(str, Enum):
@@ -17,49 +16,18 @@ class DrsType(str, Enum):
     """The DRS dataset id specification type."""
-class DrsPartKind(str, Enum):
-    """
-    The kinds of DRS part (constant and collection).
-    """
-    CONSTANT = "constant"
-    """The constant part type."""
-    COLLECTION = "collection"
-    """The collection part type."""
-class DrsConstant(BaseModel):
-    """
-    A constant part of a DRS specification (e.g., cmip5).
-    """
-    value: str
-    """The value of the a constant part."""
-    kind: Literal[DrsPartKind.CONSTANT] = DrsPartKind.CONSTANT
-    """The DRS part kind."""
-    def __str__(self) -> str:
-        return self.value
+class DrsPart(BaseModel):
+    """A fragment of a DRS specification"""
-class DrsCollection(BaseModel):
-    """
-    A collection part of a DRS specification (e.g., institution_id for CMIP6).
-    """
-    collection_id: str
+    source_collection: str
     """The collection id."""
+    source_collection_term: str | None = None
+    "Specifies a specific term in the collection."
     is_required: bool
     """Whether the collection is required for the DRS specification or not."""
-    kind: Literal[DrsPartKind.COLLECTION] = DrsPartKind.COLLECTION
-    """The DRS part kind."""
     def __str__(self) -> str:
-        return self.collection_id
-DrsPart = Annotated[DrsConstant | DrsCollection, Field(discriminator="kind")]
-"""A fragment of a DRS specification"""
+        return self.source_collection
 class DrsSpecification(BaseModel):
@@ -69,6 +37,8 @@ class DrsSpecification(BaseModel):
     type: DrsType
     """The type of the specification."""
+    regex: str
+    """General pattern for simples checks"""
     separator: str
     """The textual separator string or character."""
     properties: dict | None = None
@@ -77,109 +47,56 @@ class DrsSpecification(BaseModel):
     """The parts of the DRS specification."""
-class GlobalAttributeValueType(str, Enum):
+class CatalogProperty(BaseModel):
     """
-    The types of global attribute values.
-    """
-    STRING = "string"
-    """String value type."""
-    INTEGER = "integer"
-    """Integer value type."""
-    FLOAT = "float"
-    """Float value type."""
-class GlobalAttributeVisitor(Protocol):
-    """
-    Specifications for a global attribute visitor.
-    """
-    def visit_base_attribute(self,
-                             attribute_name: str,
-                             attribute: "GlobalAttributeSpecBase") -> Any:
-        """Visit a base global attribute."""
-        pass
-    def visit_specific_attribute(self,
-                                 attribute_name: str,
-                                 attribute: "GlobalAttributeSpecSpecific") -> Any:
-        """Visit a specific global attribute."""
-        pass
-class GlobalAttributeSpecBase(BaseModel):
-    """
-    Specification for a global attribute.
+    A dataset property described in a catalog.
     """
     source_collection: str
-    """the source_collection to get the term from"""
-    value_type: GlobalAttributeValueType
-    """The expected value type."""
-    def accept(self, attribute_name: str, visitor: GlobalAttributeVisitor) -> Any:
-        return visitor.visit_base_attribute(attribute_name, self)
-class GlobalAttributeSpecSpecific(GlobalAttributeSpecBase):
-    """
-    Specification for a global attribute.
-    with a specific key
-    """
-    specific_key: str
-    """If the validation is for the value of a specific key, for instance description or ui-label """
+    "The project collection that originated the property."
+    catalog_field_value_type: str
+    "The type of the field value."
+    is_required: bool
+    "Specifies if the property must be present in the dataset properties."
+    source_collection_term: str | None = None
+    "Specifies a specific term in the collection."
+    catalog_field_name: str | None = None
+    "The name of the collection referenced in the catalog."
+    source_collection_key: str | None = None
+    "Specifies a key other than drs_name in the collection."
-    def accept(self, attribute_name: str, visitor: GlobalAttributeVisitor) -> Any:
-        """
-        Accept a global attribute visitor.
-        :param attribute_name: The attribute name.
-        :param visitor: The global attribute visitor.
-        :type visitor: GlobalAttributeVisitor
-        :return: Depending on the visitor.
-        :rtype: Any
-        """
-        return visitor.visit_specific_attribute(attribute_name, self)
+class CatalogExtension(BaseModel):
+    name: str
+    """The name of the extension"""
+    version: str
+    """The version of the extension"""
-GlobalAttributeSpec = GlobalAttributeSpecSpecific | GlobalAttributeSpecBase
+class CatalogProperties(BaseModel):
+    name: str
+    """The name of the catalog system."""
+    url_template: str
+    """The URI template of the catalog system."""
+    extensions: list[CatalogExtension]
+    """The extensions of the catalog."""
-class GlobalAttributeSpecs(BaseModel):
+class CatalogSpecification(BaseModel):
     """
-    Container for global attribute specifications.
+    A catalog specifications.
     """
-    specs: dict[str, GlobalAttributeSpec] = Field(default_factory=dict)
-    """The global attributes specifications dictionary."""
-    def __str__(self) -> str:
-        """Return all keys when printing."""
-        return str(list(self.specs.keys()))
-    def __repr__(self) -> str:
-        """Return all keys when using repr."""
-        return f"GlobalAttributeSpecs(keys={list(self.specs.keys())})"
-    # Dictionary-like access methods
-    def __getitem__(self, key: str) -> GlobalAttributeSpec:
-        return self.specs[key]
-    def __setitem__(self, key: str, value: GlobalAttributeSpec) -> None:
-        self.specs[key] = value
-    def __contains__(self, key: str) -> bool:
-        return key in self.specs
-    def keys(self):
-        return self.specs.keys()
+    version: str
+    """The version of the catalog."""
-    def values(self):
-        return self.specs.values()
+    catalog_properties: CatalogProperties
+    """The properties of the catalog."""
-    def items(self):
-        return self.specs.items()
+    dataset_properties: list[CatalogProperty]
+    "The properties of the dataset described in a catalog."
+    file_properties: list[CatalogProperty]
+    "The properties of the files described in a catalog."
 class ProjectSpecs(BaseModel):
@@ -191,8 +108,9 @@ class ProjectSpecs(BaseModel):
     """The project id."""
     description: str
     """The description of the project."""
-    drs_specs: list[DrsSpecification]
+    drs_specs: dict[DrsType, DrsSpecification]
     """The DRS specifications of the project (directory, file name and dataset id)."""
-    global_attributes_specs: Optional[GlobalAttributeSpecs] = None
-    """The global attributes specifications of the project."""
+    # TODO: release = None when all projects have catalog_specs.yaml.
+    catalog_specs: CatalogSpecification | None = None
+    """The catalog specifications of the project."""
     model_config = ConfigDict(extra="allow")

esgvoc/api/projects.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import itertools
 import re
-from typing import Iterable, Sequence
+from typing import Iterable, Sequence, cast
 from sqlalchemy import text
 from sqlmodel import Session, and_, col, select
@@ -49,22 +49,36 @@ def _get_project_session_with_exception(project_id: str) -> Session:
         raise EsgvocNotFoundError(f"unable to find project '{project_id}'")
-def _resolve_term(composite_term_part: dict, universe_session: Session, project_session: Session) -> UTerm | PTerm:
-    # First find the term in the universe than in the current project
-    term_id = composite_term_part[constants.TERM_ID_JSON_KEY]
-    term_type = composite_term_part[constants.TERM_TYPE_JSON_KEY]
-    uterm = universe._get_term_in_data_descriptor(
-        data_descriptor_id=term_type, term_id=term_id, session=universe_session
-    )
-    if uterm:
-        return uterm
-    else:
-        pterm = _get_term_in_collection(collection_id=term_type, term_id=term_id, session=project_session)
-    if pterm:
-        return pterm
+def _resolve_composite_term_part(composite_term_part: dict,
+                                 universe_session: Session,
+                                 project_session: Session) -> UTerm | PTerm | Sequence[UTerm | PTerm]:
+    if constants.TERM_ID_JSON_KEY in composite_term_part:
+        # First find the term in the universe than in the current project
+        term_id = composite_term_part[constants.TERM_ID_JSON_KEY]
+        term_type = composite_term_part[constants.TERM_TYPE_JSON_KEY]
+        uterm = universe._get_term_in_data_descriptor(data_descriptor_id=term_type,
+                                                      term_id=term_id, session=universe_session)
+        if uterm:
+            return uterm
+        else:
+            pterm = _get_term_in_collection(collection_id=term_type, term_id=term_id, session=project_session)
+        if pterm:
+            return pterm
+        else:
+            msg = f"unable to find the term '{term_id}' in '{term_type}'"
+            raise EsgvocNotFoundError(msg)
     else:
-        msg = f"unable to find the term '{term_id}' in '{term_type}'"
-        raise EsgvocNotFoundError(msg)
+        term_type = composite_term_part[constants.TERM_TYPE_JSON_KEY]
+        data_descriptor = universe._get_data_descriptor_in_universe(term_type, universe_session)
+        if data_descriptor is not None:
+            return data_descriptor.terms
+        else:
+            collection = _get_collection_in_project(term_type, project_session)
+            if collection is not None:
+                return collection.terms
+            else:
+                msg = f"unable to find the terms of '{term_type}'"
+                raise EsgvocNotFoundError(msg)
 def _get_composite_term_separator_parts(term: UTerm | PTerm) -> tuple[str, list]:
@@ -76,7 +90,6 @@ def _get_composite_term_separator_parts(term: UTerm | PTerm) -> tuple[str, list]
 def _valid_value_composite_term_with_separator(
     value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
 ) -> list[UniverseTermError | ProjectTermError]:
-    result = []
     separator, parts = _get_composite_term_separator_parts(term)
     required_indices = {i for i, p in enumerate(parts) if p.get("is_required", False)}
@@ -135,7 +148,9 @@ def _valid_value_composite_term_with_separator(
             for id in part["id"]:
                 part_copy = dict(part)
                 part_copy["id"] = id
-                resolved_term = _resolve_term(part_copy, universe_session, project_session)
+                resolved_term = _resolve_composite_term_part(part_copy, universe_session, project_session)
+                # resolved_term can't be a list of terms here.
+                resolved_term = cast(UTerm | PTerm, resolved_term)
                 errors = _valid_value(given_value, resolved_term, universe_session, project_session)
                 if not errors:
                     valid_for_this_part = True
@@ -150,44 +165,6 @@ def _valid_value_composite_term_with_separator(
     return [_create_term_error(value, term)]  # No valid combination found
-# TODO: support optionality of parts of composite.
-# It is backtrack possible for more than one missing parts.
-def _valid_value_composite_term_with_separator2(
-    value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
-) -> list[UniverseTermError | ProjectTermError]:
-    result = list()
-    separator, parts = _get_composite_term_separator_parts(term)
-    if separator in value:
-        splits = value.split(separator)
-        if len(splits) == len(parts):
-            for index in range(0, len(splits)):
-                given_value = splits[index]
-                if "id" not in parts[index].keys():
-                    terms = universe.get_all_terms_in_data_descriptor(parts[index]["type"], None)
-                    parts[index]["id"] = [term.id for term in terms]
-                if type(parts[index]["id"]) is str:
-                    parts[index]["id"] = [parts[index]["id"]]
-                errors_list = list()
-                for id in parts[index]["id"]:
-                    part_parts = dict(parts[index])
-                    part_parts["id"] = id
-                    resolved_term = _resolve_term(part_parts, universe_session, project_session)
-                    errors = _valid_value(given_value, resolved_term, universe_session, project_session)
-                    if len(errors) == 0:
-                        errors_list = errors
-                        break
-                    else:
-                        errors_list.extend(errors)
-                else:
-                    result.append(_create_term_error(value, term))
-        else:
-            result.append(_create_term_error(value, term))
-    else:
-        result.append(_create_term_error(value, term))
-    return result
 def _transform_to_pattern(term: UTerm | PTerm, universe_session: Session, project_session: Session) -> str:
     match term.kind:
         case TermKind.PLAIN:
@@ -201,8 +178,13 @@ def _transform_to_pattern(term: UTerm | PTerm, universe_session: Session, projec
             separator, parts = _get_composite_term_separator_parts(term)
             result = ""
             for part in parts:
-                resolved_term = _resolve_term(part, universe_session, project_session)
-                pattern = _transform_to_pattern(resolved_term, universe_session, project_session)
+                resolved_term = _resolve_composite_term_part(part, universe_session, project_session)
+                if isinstance(resolved_term, Sequence):
+                    pattern = ""
+                    for r_term in resolved_term:
+                        pattern += _transform_to_pattern(r_term, universe_session, project_session)
+                else:
+                    pattern = _transform_to_pattern(resolved_term, universe_session, project_session)
                 result = f"{result}{pattern}{separator}"
             result = result.rstrip(separator)
         case _:
@@ -530,7 +512,52 @@ def get_all_terms_in_collection(
 def _get_all_collections_in_project(session: Session) -> list[PCollection]:
     project = session.get(Project, constants.SQLITE_FIRST_PK)
     # Project can't be missing if session exists.
-    return project.collections  # type: ignore
+    try:
+        return project.collections  # type: ignore
+    except Exception as e:
+        # Enhanced error context for collection retrieval failures
+        import logging
+        logger = logging.getLogger(__name__)
+        logger.error(f"Failed to retrieve collections for project '{project.id}': {str(e)}")
+        # Use raw SQL to inspect collections without Pydantic validation
+        from sqlalchemy import text
+        try:
+            # Query raw data to identify problematic collections
+            raw_query = text("""
+                SELECT id, term_kind, data_descriptor_id
+                FROM pcollections
+                WHERE project_pk = :project_pk
+            """)
+            result = session.execute(raw_query, {"project_pk": project.pk})
+            problematic_collections = []
+            for row in result:
+                collection_id, term_kind_value, data_descriptor_id = row
+                # Only empty string is invalid - indicates ingestion couldn't determine termkind
+                if term_kind_value == '' or term_kind_value is None:
+                    problematic_collections.append((collection_id, term_kind_value, data_descriptor_id))
+                    msg = f"Collection '{collection_id}' has empty term_kind (data_descriptor: " + \
+                          f"{data_descriptor_id}) - CV ingestion failed to determine termkind"
+                    logger.error(msg)
+            if problematic_collections:
+                error_details = []
+                for col_id, _, data_desc in problematic_collections:
+                    error_details.append(f"  • Collection '{col_id}' (data_descriptor: {data_desc}): EMPTY termkind")
+                error_msg = (
+                    f"Found {len(problematic_collections)} collections with empty term_kind:\n" +
+                    "\n".join(error_details)
+                )
+                raise ValueError(error_msg) from e
+        except Exception as inner_e:
+            logger.error(f"Failed to analyze problematic collections using raw SQL: {inner_e}")
+        raise e
 def get_all_collections_in_project(project_id: str) -> list[str]:
@@ -547,10 +574,24 @@ def get_all_collections_in_project(project_id: str) -> list[str]:
     """
     result = list()
     if connection := _get_project_connection(project_id):
-        with connection.create_session() as session:
-            collections = _get_all_collections_in_project(session)
-            for collection in collections:
-                result.append(collection.id)
+        try:
+            with connection.create_session() as session:
+                collections = _get_all_collections_in_project(session)
+                for collection in collections:
+                    result.append(collection.id)
+        except Exception as e:
+            # Enhanced error context for project collection retrieval
+            import logging
+            logger = logging.getLogger(__name__)
+            logger.error(f"Failed to get collections for project '{project_id}': {str(e)}")
+            # Re-raise with enhanced context
+            raise ValueError(
+                f"Failed to retrieve collections for project '{project_id}'. "
+                f"This may be due to invalid termkind values in the database. "
+                f"Check the project database for collections with empty or invalid termkind values. "
+                f"Original error: {str(e)}"
+            ) from e
     return result

esgvoc 1.0.1__py3-none-any.whl → 1.1.1__py3-none-any.whl

Potentially problematic release.

esgvoc 1.0.1__py3-none-any.whl → 1.1.1__py3-none-any.whl