esgvoc 1.0.1__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of esgvoc might be problematic.

Files changed (41)
  1. esgvoc/__init__.py +1 -1
  2. esgvoc/api/__init__.py +0 -6
  3. esgvoc/api/data_descriptors/__init__.py +6 -0
  4. esgvoc/api/data_descriptors/archive.py +5 -0
  5. esgvoc/api/data_descriptors/citation_url.py +5 -0
  6. esgvoc/api/data_descriptors/experiment.py +2 -2
  7. esgvoc/api/data_descriptors/known_branded_variable.py +58 -5
  8. esgvoc/api/data_descriptors/regex.py +5 -0
  9. esgvoc/api/data_descriptors/vertical_label.py +2 -2
  10. esgvoc/api/project_specs.py +48 -130
  11. esgvoc/api/projects.py +104 -63
  12. esgvoc/apps/drs/generator.py +47 -42
  13. esgvoc/apps/drs/validator.py +22 -38
  14. esgvoc/apps/jsg/json_schema_generator.py +252 -136
  15. esgvoc/apps/jsg/templates/template.jinja +249 -0
  16. esgvoc/apps/test_cv/README.md +214 -0
  17. esgvoc/apps/test_cv/cv_tester.py +1368 -0
  18. esgvoc/apps/test_cv/example_usage.py +216 -0
  19. esgvoc/apps/vr/__init__.py +12 -0
  20. esgvoc/apps/vr/build_variable_registry.py +71 -0
  21. esgvoc/apps/vr/example_usage.py +60 -0
  22. esgvoc/apps/vr/vr_app.py +333 -0
  23. esgvoc/cli/config.py +671 -86
  24. esgvoc/cli/drs.py +39 -21
  25. esgvoc/cli/main.py +2 -0
  26. esgvoc/cli/test_cv.py +257 -0
  27. esgvoc/core/constants.py +10 -7
  28. esgvoc/core/data_handler.py +24 -22
  29. esgvoc/core/db/connection.py +7 -0
  30. esgvoc/core/db/project_ingestion.py +34 -9
  31. esgvoc/core/db/universe_ingestion.py +1 -2
  32. esgvoc/core/service/configuration/setting.py +192 -21
  33. esgvoc/core/service/data_merger.py +1 -1
  34. esgvoc/core/service/state.py +18 -2
  35. {esgvoc-1.0.1.dist-info → esgvoc-1.1.2.dist-info}/METADATA +3 -1
  36. {esgvoc-1.0.1.dist-info → esgvoc-1.1.2.dist-info}/RECORD +40 -29
  37. esgvoc/apps/jsg/cmip6_template.json +0 -74
  38. /esgvoc/apps/{py.typed → test_cv/__init__.py} +0 -0
  39. {esgvoc-1.0.1.dist-info → esgvoc-1.1.2.dist-info}/WHEEL +0 -0
  40. {esgvoc-1.0.1.dist-info → esgvoc-1.1.2.dist-info}/entry_points.txt +0 -0
  41. {esgvoc-1.0.1.dist-info → esgvoc-1.1.2.dist-info}/licenses/LICENSE.txt +0 -0
esgvoc/__init__.py CHANGED
@@ -1,3 +1,3 @@
  import esgvoc.core.logging_handler # noqa

- __version__ = "1.0.1"
+ __version__ = "1.1.2"
esgvoc/api/__init__.py CHANGED
@@ -1,8 +1,5 @@
  from esgvoc.api.project_specs import (
-     DrsCollection,
-     DrsConstant,
      DrsPart,
-     DrsPartKind,
      DrsSpecification,
      DrsType,
      ProjectSpecs,
@@ -51,10 +48,7 @@ from esgvoc.api.universe import (
  )

  __all__ = [
-     "DrsCollection",
-     "DrsConstant",
      "DrsPart",
-     "DrsPartKind",
      "DrsSpecification",
      "DrsType",
      "find_collections_in_project",
esgvoc/api/data_descriptors/__init__.py CHANGED
@@ -1,7 +1,9 @@
  from esgvoc.api.data_descriptors.activity import Activity
+ from esgvoc.api.data_descriptors.archive import Archive
  from esgvoc.api.data_descriptors.area_label import AreaLabel
  from esgvoc.api.data_descriptors.branded_suffix import BrandedSuffix
  from esgvoc.api.data_descriptors.branded_variable import BrandedVariable
+ from esgvoc.api.data_descriptors.citation_url import CitationUrl
  from esgvoc.api.data_descriptors.consortium import Consortium
  from esgvoc.api.data_descriptors.contact import Contact
  from esgvoc.api.data_descriptors.conventions import Convention
@@ -30,6 +32,7 @@ from esgvoc.api.data_descriptors.product import Product
  from esgvoc.api.data_descriptors.publication_status import PublicationStatus
  from esgvoc.api.data_descriptors.realisation_index import RealisationIndex
  from esgvoc.api.data_descriptors.realm import Realm
+ from esgvoc.api.data_descriptors.regex import Regex
  from esgvoc.api.data_descriptors.region import Region
  from esgvoc.api.data_descriptors.resolution import Resolution
  from esgvoc.api.data_descriptors.source import Source
@@ -90,4 +93,7 @@ DATA_DESCRIPTOR_CLASS_MAPPING: dict[str, type[DataDescriptor]] = {
      "region": Region,
      "member_id": MemberId,
      "obs_type": ObsType,  # obs4Mips
+     "regex": Regex,
+     "citation_url": CitationUrl,
+     "archive": Archive,
  }
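The three new descriptors become resolvable through `DATA_DESCRIPTOR_CLASS_MAPPING`. A small sketch of the kind of lookup this enables (keys and classes are the ones registered above; the printing is only illustrative):

```python
from esgvoc.api.data_descriptors import DATA_DESCRIPTOR_CLASS_MAPPING

# Resolve the descriptor classes registered in 1.1.2 by their ids.
for descriptor_id in ("archive", "citation_url", "regex"):
    descriptor_class = DATA_DESCRIPTOR_CLASS_MAPPING[descriptor_id]
    print(descriptor_id, "->", descriptor_class.__name__)
```
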
esgvoc/api/data_descriptors/archive.py ADDED
@@ -0,0 +1,5 @@
+ from esgvoc.api.data_descriptors.data_descriptor import PlainTermDataDescriptor
+
+
+ class Archive(PlainTermDataDescriptor):
+     pass
esgvoc/api/data_descriptors/citation_url.py ADDED
@@ -0,0 +1,5 @@
+ from esgvoc.api.data_descriptors.data_descriptor import PatternTermDataDescriptor
+
+
+ class CitationUrl(PatternTermDataDescriptor):
+     pass
esgvoc/api/data_descriptors/experiment.py CHANGED
@@ -20,8 +20,8 @@ class Experiment(PlainTermDataDescriptor):
      experiment: str
      required_model_components: list[str] | None
      additional_allowed_model_components: list[str] = Field(default_factory=list)
-     start_year: int | None
-     end_year: int | None
+     start_year: str | int | None
+     end_year: str | int | None
      min_number_yrs_per_sim: int | None
      parent_activity_id: list[str] | None
      parent_experiment_id: list[str] | None
esgvoc/api/data_descriptors/known_branded_variable.py CHANGED
@@ -1,7 +1,30 @@
+ from typing import Any, Dict, List, Optional
+
  from pydantic import Field

  from esgvoc.api.data_descriptors.data_descriptor import PlainTermDataDescriptor

+ #
+ # class KnownBrandedVariable(PlainTermDataDescriptor):
+ #     """
+ #     A climate-related quantity or measurement, including information about sampling.
+ #
+ #     The concept of a branded variable was introduced in CMIP7.
+ #     A branded variable is composed of two parts.
+ #     The first part is the root variable (see :py:class:`Variable`).
+ #     The second is the suffix (see :py:class:`BrandedSuffix`).
+ #
+ #     For further details on the development of branded variables,
+ #     see [this paper draft](https://docs.google.com/document/d/19jzecgymgiiEsTDzaaqeLP6pTvLT-NzCMaq-wu-QoOc/edit?pli=1&tab=t.0).
+ #     """
+ #
+ #     description: str
+ #     dimensions: list[str] = Field(default_factory=list)
+ #     cell_methods: str
+ #     variable: str
+ #     label: str
+ #
+

  class KnownBrandedVariable(PlainTermDataDescriptor):
      """
@@ -16,8 +39,38 @@ class KnownBrandedVariable(PlainTermDataDescriptor):
      see [this paper draft](https://docs.google.com/document/d/19jzecgymgiiEsTDzaaqeLP6pTvLT-NzCMaq-wu-QoOc/edit?pli=1&tab=t.0).
      """

-     description: str
-     dimensions: list[str] = Field(default_factory=list)
-     cell_methods: str
-     variable: str
-     label: str
+     # # ESGVoc required fields
+     # id: str = Field(description="Unique identifier, e.g., 'ta_tavg-p19-hxy-air'")
+     # type: str = Field(default="branded_variable", description="ESGVoc type identifier")
+     # drs_name: str = Field(description="DRS name, same as id")
+     # => already in PlainTermDataDescriptor
+
+     # CF Standard Name context (flattened from hierarchy)
+     cf_standard_name: str = Field(description="CF standard name, e.g., 'air_temperature'")
+     cf_units: str = Field(description="CF standard units, e.g., 'K'")
+     cf_sn_status: str = Field(description="CF standard name status, e.g., 'approved'")
+
+     # Variable Root context (flattened from hierarchy)
+     variable_root_name: str = Field(description="Variable root name, e.g., 'ta'")
+     var_def_qualifier: str = Field(default="", description="Variable definition qualifier")
+     branding_suffix_name: str = Field(description="Branding suffix, e.g., 'tavg-p19-hxy-air'")
+
+     # Variable metadata
+     description: str = Field(description="Human-readable description")
+     dimensions: List[str] = Field(description="NetCDF dimensions")
+     cell_methods: str = Field(default="", description="CF cell_methods attribute")
+     cell_measures: str = Field(default="", description="CF cell_measures attribute")
+     history: str = Field(default="", description="Processing history")
+     realm: str = Field(description="Earth system realm, e.g., 'atmos'")
+
+     # Label components (embedded, not references)
+     temporal_label: str = Field(description="Temporal label, e.g., 'tavg'")
+     vertical_label: str = Field(description="Vertical label, e.g., 'p19'")
+     horizontal_label: str = Field(description="Horizontal label, e.g., 'hxy'")
+     area_label: str = Field(description="Area label, e.g., 'air'")
+
+     # Status
+     bn_status: str = Field(description="Branded variable status, e.g., 'accepted'")
+
+     # Additional required fields from specifications
+     positive_direction: str = Field(default="", description="Positive direction for the variable")
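To make the flattened shape concrete, here is a hedged construction sketch. The field names come from the diff above; the values echo the "e.g." hints in the Field descriptions, the `description` and `dimensions` values are purely illustrative, and the `id`/`type`/`drs_name` arguments assume the fields that the comment says are inherited from `PlainTermDataDescriptor`:

```python
from esgvoc.api.data_descriptors.known_branded_variable import KnownBrandedVariable

# Illustrative instance; values follow the "e.g." hints in the Field descriptions.
bv = KnownBrandedVariable(
    id="ta_tavg-p19-hxy-air",        # assumed to come from PlainTermDataDescriptor
    type="branded_variable",         # assumed inherited
    drs_name="ta_tavg-p19-hxy-air",  # assumed inherited
    cf_standard_name="air_temperature",
    cf_units="K",
    cf_sn_status="approved",
    variable_root_name="ta",
    branding_suffix_name="tavg-p19-hxy-air",
    description="Air temperature (illustrative description)",
    dimensions=["longitude", "latitude", "plev19", "time"],
    realm="atmos",
    temporal_label="tavg",
    vertical_label="p19",
    horizontal_label="hxy",
    area_label="air",
    bn_status="accepted",
)
print(bv.variable_root_name, bv.branding_suffix_name)
```
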
esgvoc/api/data_descriptors/regex.py ADDED
@@ -0,0 +1,5 @@
+ from esgvoc.api.data_descriptors.data_descriptor import PatternTermDataDescriptor
+
+
+ class Regex(PatternTermDataDescriptor):
+     pass
esgvoc/api/data_descriptors/vertical_label.py CHANGED
@@ -1,7 +1,7 @@
- from esgvoc.api.data_descriptors.data_descriptor import PatternTermDataDescriptor
+ from esgvoc.api.data_descriptors.data_descriptor import PlainTermDataDescriptor


- class VerticalLabel(PatternTermDataDescriptor):
+ class VerticalLabel(PlainTermDataDescriptor):
      """
      Vertical label.

esgvoc/api/project_specs.py CHANGED
@@ -1,7 +1,6 @@
  from enum import Enum
- from typing import Annotated, Any, Literal, Optional, Protocol

- from pydantic import BaseModel, ConfigDict, Field
+ from pydantic import BaseModel, ConfigDict


  class DrsType(str, Enum):
@@ -17,49 +16,18 @@ class DrsType(str, Enum):
      """The DRS dataset id specification type."""


- class DrsPartKind(str, Enum):
-     """
-     The kinds of DRS part (constant and collection).
-     """
-
-     CONSTANT = "constant"
-     """The constant part type."""
-     COLLECTION = "collection"
-     """The collection part type."""
-
-
- class DrsConstant(BaseModel):
-     """
-     A constant part of a DRS specification (e.g., cmip5).
-     """
-
-     value: str
-     """The value of the a constant part."""
-     kind: Literal[DrsPartKind.CONSTANT] = DrsPartKind.CONSTANT
-     """The DRS part kind."""
-
-     def __str__(self) -> str:
-         return self.value
-
+ class DrsPart(BaseModel):
+     """A fragment of a DRS specification"""

- class DrsCollection(BaseModel):
-     """
-     A collection part of a DRS specification (e.g., institution_id for CMIP6).
-     """
-
-     collection_id: str
+     source_collection: str
      """The collection id."""
+     source_collection_term: str | None = None
+     "Specifies a specific term in the collection."
      is_required: bool
      """Whether the collection is required for the DRS specification or not."""
-     kind: Literal[DrsPartKind.COLLECTION] = DrsPartKind.COLLECTION
-     """The DRS part kind."""

      def __str__(self) -> str:
-         return self.collection_id
-
-
- DrsPart = Annotated[DrsConstant | DrsCollection, Field(discriminator="kind")]
- """A fragment of a DRS specification"""
+         return self.source_collection


  class DrsSpecification(BaseModel):
@@ -69,6 +37,8 @@ class DrsSpecification(BaseModel):

      type: DrsType
      """The type of the specification."""
+     regex: str
+     """General pattern for simples checks"""
      separator: str
      """The textual separator string or character."""
      properties: dict | None = None
@@ -77,109 +47,56 @@ class DrsSpecification(BaseModel):
      """The parts of the DRS specification."""


- class GlobalAttributeValueType(str, Enum):
+ class CatalogProperty(BaseModel):
      """
-     The types of global attribute values.
-     """
-
-     STRING = "string"
-     """String value type."""
-     INTEGER = "integer"
-     """Integer value type."""
-     FLOAT = "float"
-     """Float value type."""
-
-
- class GlobalAttributeVisitor(Protocol):
-     """
-     Specifications for a global attribute visitor.
-     """
-     def visit_base_attribute(self,
-                              attribute_name: str,
-                              attribute: "GlobalAttributeSpecBase") -> Any:
-         """Visit a base global attribute."""
-         pass
-
-     def visit_specific_attribute(self,
-                                  attribute_name: str,
-                                  attribute: "GlobalAttributeSpecSpecific") -> Any:
-         """Visit a specific global attribute."""
-         pass
-
-
- class GlobalAttributeSpecBase(BaseModel):
-     """
-     Specification for a global attribute.
+     A dataset property described in a catalog.
      """

      source_collection: str
-     """the source_collection to get the term from"""
-     value_type: GlobalAttributeValueType
-     """The expected value type."""
-
-     def accept(self, attribute_name: str, visitor: GlobalAttributeVisitor) -> Any:
-         return visitor.visit_base_attribute(attribute_name, self)
-
-
- class GlobalAttributeSpecSpecific(GlobalAttributeSpecBase):
-     """
-     Specification for a global attribute.
-     with a specific key
-     """
-
-     specific_key: str
-     """If the validation is for the value of a specific key, for instance description or ui-label """
+     "The project collection that originated the property."
+     catalog_field_value_type: str
+     "The type of the field value."
+     is_required: bool
+     "Specifies if the property must be present in the dataset properties."
+     source_collection_term: str | None = None
+     "Specifies a specific term in the collection."
+     catalog_field_name: str | None = None
+     "The name of the collection referenced in the catalog."
+     source_collection_key: str | None = None
+     "Specifies a key other than drs_name in the collection."

-     def accept(self, attribute_name: str, visitor: GlobalAttributeVisitor) -> Any:
-         """
-         Accept a global attribute visitor.

-         :param attribute_name: The attribute name.
-         :param visitor: The global attribute visitor.
-         :type visitor: GlobalAttributeVisitor
-         :return: Depending on the visitor.
-         :rtype: Any
-         """
-         return visitor.visit_specific_attribute(attribute_name, self)
+ class CatalogExtension(BaseModel):
+     name: str
+     """The name of the extension"""
+     version: str
+     """The version of the extension"""


- GlobalAttributeSpec = GlobalAttributeSpecSpecific | GlobalAttributeSpecBase
+ class CatalogProperties(BaseModel):
+     name: str
+     """The name of the catalog system."""
+     url_template: str
+     """The URI template of the catalog system."""
+     extensions: list[CatalogExtension]
+     """The extensions of the catalog."""


- class GlobalAttributeSpecs(BaseModel):
+ class CatalogSpecification(BaseModel):
      """
-     Container for global attribute specifications.
+     A catalog specifications.
      """

-     specs: dict[str, GlobalAttributeSpec] = Field(default_factory=dict)
-     """The global attributes specifications dictionary."""
-
-     def __str__(self) -> str:
-         """Return all keys when printing."""
-         return str(list(self.specs.keys()))
-
-     def __repr__(self) -> str:
-         """Return all keys when using repr."""
-         return f"GlobalAttributeSpecs(keys={list(self.specs.keys())})"
-
-     # Dictionary-like access methods
-     def __getitem__(self, key: str) -> GlobalAttributeSpec:
-         return self.specs[key]
-
-     def __setitem__(self, key: str, value: GlobalAttributeSpec) -> None:
-         self.specs[key] = value
-
-     def __contains__(self, key: str) -> bool:
-         return key in self.specs
-
-     def keys(self):
-         return self.specs.keys()
+     version: str
+     """The version of the catalog."""

-     def values(self):
-         return self.specs.values()
+     catalog_properties: CatalogProperties
+     """The properties of the catalog."""

-     def items(self):
-         return self.specs.items()
+     dataset_properties: list[CatalogProperty]
+     "The properties of the dataset described in a catalog."
+     file_properties: list[CatalogProperty]
+     "The properties of the files described in a catalog."


  class ProjectSpecs(BaseModel):
@@ -191,8 +108,9 @@ class ProjectSpecs(BaseModel):
      """The project id."""
      description: str
      """The description of the project."""
-     drs_specs: list[DrsSpecification]
+     drs_specs: dict[DrsType, DrsSpecification]
      """The DRS specifications of the project (directory, file name and dataset id)."""
-     global_attributes_specs: Optional[GlobalAttributeSpecs] = None
-     """The global attributes specifications of the project."""
+     # TODO: release = None when all projects have catalog_specs.yaml.
+     catalog_specs: CatalogSpecification | None = None
+     """The catalog specifications of the project."""
      model_config = ConfigDict(extra="allow")
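Taken together, these changes replace the constant/collection discriminated union with a single `DrsPart` model and swap the global-attribute specs for catalog specs. A hedged sketch of the new shapes in use (the collection id and the iteration are illustrative; `DrsType` member names are not shown in this diff, so none are assumed):

```python
from esgvoc.api.project_specs import DrsPart, ProjectSpecs

# A DRS part now references a project collection directly.
part = DrsPart(source_collection="institution_id", is_required=True)
assert str(part) == "institution_id"

# drs_specs is now a mapping keyed by DrsType instead of a plain list,
# so specifications can be iterated (or looked up) by type.
def describe(specs: ProjectSpecs) -> None:
    for drs_type, spec in specs.drs_specs.items():
        print(drs_type, spec.separator, [str(p) for p in spec.parts])
```
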
esgvoc/api/projects.py CHANGED
@@ -1,6 +1,6 @@
  import itertools
  import re
- from typing import Iterable, Sequence
+ from typing import Iterable, Sequence, cast

  from sqlalchemy import text
  from sqlmodel import Session, and_, col, select
@@ -49,22 +49,36 @@ def _get_project_session_with_exception(project_id: str) -> Session:
      raise EsgvocNotFoundError(f"unable to find project '{project_id}'")


- def _resolve_term(composite_term_part: dict, universe_session: Session, project_session: Session) -> UTerm | PTerm:
-     # First find the term in the universe than in the current project
-     term_id = composite_term_part[constants.TERM_ID_JSON_KEY]
-     term_type = composite_term_part[constants.TERM_TYPE_JSON_KEY]
-     uterm = universe._get_term_in_data_descriptor(
-         data_descriptor_id=term_type, term_id=term_id, session=universe_session
-     )
-     if uterm:
-         return uterm
-     else:
-         pterm = _get_term_in_collection(collection_id=term_type, term_id=term_id, session=project_session)
-         if pterm:
-             return pterm
+ def _resolve_composite_term_part(composite_term_part: dict,
+                                  universe_session: Session,
+                                  project_session: Session) -> UTerm | PTerm | Sequence[UTerm | PTerm]:
+     if constants.TERM_ID_JSON_KEY in composite_term_part:
+         # First find the term in the universe than in the current project
+         term_id = composite_term_part[constants.TERM_ID_JSON_KEY]
+         term_type = composite_term_part[constants.TERM_TYPE_JSON_KEY]
+         uterm = universe._get_term_in_data_descriptor(data_descriptor_id=term_type,
+                                                       term_id=term_id, session=universe_session)
+         if uterm:
+             return uterm
+         else:
+             pterm = _get_term_in_collection(collection_id=term_type, term_id=term_id, session=project_session)
+             if pterm:
+                 return pterm
+             else:
+                 msg = f"unable to find the term '{term_id}' in '{term_type}'"
+                 raise EsgvocNotFoundError(msg)
      else:
-         msg = f"unable to find the term '{term_id}' in '{term_type}'"
-         raise EsgvocNotFoundError(msg)
+         term_type = composite_term_part[constants.TERM_TYPE_JSON_KEY]
+         data_descriptor = universe._get_data_descriptor_in_universe(term_type, universe_session)
+         if data_descriptor is not None:
+             return data_descriptor.terms
+         else:
+             collection = _get_collection_in_project(term_type, project_session)
+             if collection is not None:
+                 return collection.terms
+             else:
+                 msg = f"unable to find the terms of '{term_type}'"
+                 raise EsgvocNotFoundError(msg)


  def _get_composite_term_separator_parts(term: UTerm | PTerm) -> tuple[str, list]:
@@ -76,7 +90,6 @@ def _get_composite_term_separator_parts(term: UTerm | PTerm) -> tuple[str, list]
  def _valid_value_composite_term_with_separator(
      value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
  ) -> list[UniverseTermError | ProjectTermError]:
-     result = []
      separator, parts = _get_composite_term_separator_parts(term)
      required_indices = {i for i, p in enumerate(parts) if p.get("is_required", False)}

@@ -135,7 +148,9 @@
      for id in part["id"]:
          part_copy = dict(part)
          part_copy["id"] = id
-         resolved_term = _resolve_term(part_copy, universe_session, project_session)
+         resolved_term = _resolve_composite_term_part(part_copy, universe_session, project_session)
+         # resolved_term can't be a list of terms here.
+         resolved_term = cast(UTerm | PTerm, resolved_term)
          errors = _valid_value(given_value, resolved_term, universe_session, project_session)
          if not errors:
              valid_for_this_part = True
@@ -150,44 +165,6 @@
      return [_create_term_error(value, term)]  # No valid combination found


- # TODO: support optionality of parts of composite.
- # It is backtrack possible for more than one missing parts.
- def _valid_value_composite_term_with_separator2(
-     value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
- ) -> list[UniverseTermError | ProjectTermError]:
-     result = list()
-     separator, parts = _get_composite_term_separator_parts(term)
-     if separator in value:
-         splits = value.split(separator)
-         if len(splits) == len(parts):
-             for index in range(0, len(splits)):
-                 given_value = splits[index]
-                 if "id" not in parts[index].keys():
-                     terms = universe.get_all_terms_in_data_descriptor(parts[index]["type"], None)
-                     parts[index]["id"] = [term.id for term in terms]
-                 if type(parts[index]["id"]) is str:
-                     parts[index]["id"] = [parts[index]["id"]]
-
-                 errors_list = list()
-                 for id in parts[index]["id"]:
-                     part_parts = dict(parts[index])
-                     part_parts["id"] = id
-                     resolved_term = _resolve_term(part_parts, universe_session, project_session)
-                     errors = _valid_value(given_value, resolved_term, universe_session, project_session)
-                     if len(errors) == 0:
-                         errors_list = errors
-                         break
-                     else:
-                         errors_list.extend(errors)
-                 else:
-                     result.append(_create_term_error(value, term))
-         else:
-             result.append(_create_term_error(value, term))
-     else:
-         result.append(_create_term_error(value, term))
-     return result
-
-
  def _transform_to_pattern(term: UTerm | PTerm, universe_session: Session, project_session: Session) -> str:
      match term.kind:
          case TermKind.PLAIN:
@@ -201,8 +178,13 @@ _transform_to_pattern(term: UTerm | PTerm, universe_session: Session, projec
              separator, parts = _get_composite_term_separator_parts(term)
              result = ""
              for part in parts:
-                 resolved_term = _resolve_term(part, universe_session, project_session)
-                 pattern = _transform_to_pattern(resolved_term, universe_session, project_session)
+                 resolved_term = _resolve_composite_term_part(part, universe_session, project_session)
+                 if isinstance(resolved_term, Sequence):
+                     pattern = ""
+                     for r_term in resolved_term:
+                         pattern += _transform_to_pattern(r_term, universe_session, project_session)
+                 else:
+                     pattern = _transform_to_pattern(resolved_term, universe_session, project_session)
                  result = f"{result}{pattern}{separator}"
              result = result.rstrip(separator)
          case _:
@@ -530,7 +512,52 @@ def get_all_terms_in_collection(
  def _get_all_collections_in_project(session: Session) -> list[PCollection]:
      project = session.get(Project, constants.SQLITE_FIRST_PK)
      # Project can't be missing if session exists.
-     return project.collections  # type: ignore
+     try:
+         return project.collections  # type: ignore
+     except Exception as e:
+         # Enhanced error context for collection retrieval failures
+         import logging
+         logger = logging.getLogger(__name__)
+         logger.error(f"Failed to retrieve collections for project '{project.id}': {str(e)}")
+
+         # Use raw SQL to inspect collections without Pydantic validation
+         from sqlalchemy import text
+         try:
+             # Query raw data to identify problematic collections
+             raw_query = text("""
+                 SELECT id, term_kind, data_descriptor_id
+                 FROM pcollections
+                 WHERE project_pk = :project_pk
+             """)
+             result = session.execute(raw_query, {"project_pk": project.pk})
+
+             problematic_collections = []
+
+             for row in result:
+                 collection_id, term_kind_value, data_descriptor_id = row
+
+                 # Only empty string is invalid - indicates ingestion couldn't determine termkind
+                 if term_kind_value == '' or term_kind_value is None:
+                     problematic_collections.append((collection_id, term_kind_value, data_descriptor_id))
+                     msg = f"Collection '{collection_id}' has empty term_kind (data_descriptor: " + \
+                           f"{data_descriptor_id}) - CV ingestion failed to determine termkind"
+                     logger.error(msg)
+
+             if problematic_collections:
+                 error_details = []
+                 for col_id, _, data_desc in problematic_collections:
+                     error_details.append(f"  • Collection '{col_id}' (data_descriptor: {data_desc}): EMPTY termkind")
+
+                 error_msg = (
+                     f"Found {len(problematic_collections)} collections with empty term_kind:\n" +
+                     "\n".join(error_details)
+                 )
+                 raise ValueError(error_msg) from e
+
+         except Exception as inner_e:
+             logger.error(f"Failed to analyze problematic collections using raw SQL: {inner_e}")
+
+         raise e


  def get_all_collections_in_project(project_id: str) -> list[str]:
@@ -547,10 +574,24 @@ def get_all_collections_in_project(project_id: str) -> list[str]:
      """
      result = list()
      if connection := _get_project_connection(project_id):
-         with connection.create_session() as session:
-             collections = _get_all_collections_in_project(session)
-             for collection in collections:
-                 result.append(collection.id)
+         try:
+             with connection.create_session() as session:
+                 collections = _get_all_collections_in_project(session)
+                 for collection in collections:
+                     result.append(collection.id)
+         except Exception as e:
+             # Enhanced error context for project collection retrieval
+             import logging
+             logger = logging.getLogger(__name__)
+             logger.error(f"Failed to get collections for project '{project_id}': {str(e)}")
+
+             # Re-raise with enhanced context
+             raise ValueError(
+                 f"Failed to retrieve collections for project '{project_id}'. "
+                 f"This may be due to invalid termkind values in the database. "
+                 f"Check the project database for collections with empty or invalid termkind values. "
+                 f"Original error: {str(e)}"
+             ) from e
      return result

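The practical upshot for callers is that collection-retrieval failures now surface as a `ValueError` carrying a hint about termkind problems instead of a bare validation error. A hedged usage sketch (the project id is illustrative):

```python
from esgvoc.api.projects import get_all_collections_in_project

# "cmip6" is an illustrative project id; any configured project behaves the same way.
try:
    collection_ids = get_all_collections_in_project("cmip6")
    print(collection_ids)
except ValueError as error:
    # Raised when collections cannot be read, e.g. empty or invalid termkind
    # values left behind by a failed CV ingestion; the original error is chained.
    print(f"Could not read collections: {error}")
```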