esgvoc 1.0.1__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of esgvoc might be problematic. Click here for more details.
- esgvoc/__init__.py +1 -1
- esgvoc/api/__init__.py +0 -6
- esgvoc/api/data_descriptors/__init__.py +6 -0
- esgvoc/api/data_descriptors/archive.py +5 -0
- esgvoc/api/data_descriptors/citation_url.py +5 -0
- esgvoc/api/data_descriptors/experiment.py +2 -2
- esgvoc/api/data_descriptors/known_branded_variable.py +58 -5
- esgvoc/api/data_descriptors/regex.py +5 -0
- esgvoc/api/data_descriptors/vertical_label.py +2 -2
- esgvoc/api/project_specs.py +48 -130
- esgvoc/api/projects.py +104 -63
- esgvoc/apps/drs/generator.py +47 -42
- esgvoc/apps/drs/validator.py +22 -38
- esgvoc/apps/jsg/json_schema_generator.py +252 -136
- esgvoc/apps/jsg/templates/template.jinja +249 -0
- esgvoc/apps/test_cv/README.md +214 -0
- esgvoc/apps/test_cv/cv_tester.py +1368 -0
- esgvoc/apps/test_cv/example_usage.py +216 -0
- esgvoc/apps/vr/__init__.py +12 -0
- esgvoc/apps/vr/build_variable_registry.py +71 -0
- esgvoc/apps/vr/example_usage.py +60 -0
- esgvoc/apps/vr/vr_app.py +333 -0
- esgvoc/cli/config.py +671 -86
- esgvoc/cli/drs.py +39 -21
- esgvoc/cli/main.py +2 -0
- esgvoc/cli/test_cv.py +257 -0
- esgvoc/core/constants.py +10 -7
- esgvoc/core/data_handler.py +24 -22
- esgvoc/core/db/connection.py +7 -0
- esgvoc/core/db/project_ingestion.py +34 -9
- esgvoc/core/db/universe_ingestion.py +1 -2
- esgvoc/core/service/configuration/setting.py +192 -21
- esgvoc/core/service/data_merger.py +1 -1
- esgvoc/core/service/state.py +18 -2
- {esgvoc-1.0.1.dist-info → esgvoc-1.1.1.dist-info}/METADATA +2 -1
- {esgvoc-1.0.1.dist-info → esgvoc-1.1.1.dist-info}/RECORD +40 -29
- esgvoc/apps/jsg/cmip6_template.json +0 -74
- /esgvoc/apps/{py.typed → test_cv/__init__.py} +0 -0
- {esgvoc-1.0.1.dist-info → esgvoc-1.1.1.dist-info}/WHEEL +0 -0
- {esgvoc-1.0.1.dist-info → esgvoc-1.1.1.dist-info}/entry_points.txt +0 -0
- {esgvoc-1.0.1.dist-info → esgvoc-1.1.1.dist-info}/licenses/LICENSE.txt +0 -0
esgvoc/__init__.py
CHANGED
esgvoc/api/__init__.py
CHANGED
|
@@ -1,8 +1,5 @@
|
|
|
1
1
|
from esgvoc.api.project_specs import (
|
|
2
|
-
DrsCollection,
|
|
3
|
-
DrsConstant,
|
|
4
2
|
DrsPart,
|
|
5
|
-
DrsPartKind,
|
|
6
3
|
DrsSpecification,
|
|
7
4
|
DrsType,
|
|
8
5
|
ProjectSpecs,
|
|
@@ -51,10 +48,7 @@ from esgvoc.api.universe import (
|
|
|
51
48
|
)
|
|
52
49
|
|
|
53
50
|
__all__ = [
|
|
54
|
-
"DrsCollection",
|
|
55
|
-
"DrsConstant",
|
|
56
51
|
"DrsPart",
|
|
57
|
-
"DrsPartKind",
|
|
58
52
|
"DrsSpecification",
|
|
59
53
|
"DrsType",
|
|
60
54
|
"find_collections_in_project",
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
from esgvoc.api.data_descriptors.activity import Activity
|
|
2
|
+
from esgvoc.api.data_descriptors.archive import Archive
|
|
2
3
|
from esgvoc.api.data_descriptors.area_label import AreaLabel
|
|
3
4
|
from esgvoc.api.data_descriptors.branded_suffix import BrandedSuffix
|
|
4
5
|
from esgvoc.api.data_descriptors.branded_variable import BrandedVariable
|
|
6
|
+
from esgvoc.api.data_descriptors.citation_url import CitationUrl
|
|
5
7
|
from esgvoc.api.data_descriptors.consortium import Consortium
|
|
6
8
|
from esgvoc.api.data_descriptors.contact import Contact
|
|
7
9
|
from esgvoc.api.data_descriptors.conventions import Convention
|
|
@@ -30,6 +32,7 @@ from esgvoc.api.data_descriptors.product import Product
|
|
|
30
32
|
from esgvoc.api.data_descriptors.publication_status import PublicationStatus
|
|
31
33
|
from esgvoc.api.data_descriptors.realisation_index import RealisationIndex
|
|
32
34
|
from esgvoc.api.data_descriptors.realm import Realm
|
|
35
|
+
from esgvoc.api.data_descriptors.regex import Regex
|
|
33
36
|
from esgvoc.api.data_descriptors.region import Region
|
|
34
37
|
from esgvoc.api.data_descriptors.resolution import Resolution
|
|
35
38
|
from esgvoc.api.data_descriptors.source import Source
|
|
@@ -90,4 +93,7 @@ DATA_DESCRIPTOR_CLASS_MAPPING: dict[str, type[DataDescriptor]] = {
|
|
|
90
93
|
"region": Region,
|
|
91
94
|
"member_id": MemberId,
|
|
92
95
|
"obs_type": ObsType, # obs4Mips
|
|
96
|
+
"regex": Regex,
|
|
97
|
+
"citation_url": CitationUrl,
|
|
98
|
+
"archive": Archive,
|
|
93
99
|
}
|
|
@@ -20,8 +20,8 @@ class Experiment(PlainTermDataDescriptor):
|
|
|
20
20
|
experiment: str
|
|
21
21
|
required_model_components: list[str] | None
|
|
22
22
|
additional_allowed_model_components: list[str] = Field(default_factory=list)
|
|
23
|
-
start_year: int | None
|
|
24
|
-
end_year: int | None
|
|
23
|
+
start_year: str | int | None
|
|
24
|
+
end_year: str | int | None
|
|
25
25
|
min_number_yrs_per_sim: int | None
|
|
26
26
|
parent_activity_id: list[str] | None
|
|
27
27
|
parent_experiment_id: list[str] | None
|
|
@@ -1,7 +1,30 @@
|
|
|
1
|
+
from typing import Any, Dict, List, Optional
|
|
2
|
+
|
|
1
3
|
from pydantic import Field
|
|
2
4
|
|
|
3
5
|
from esgvoc.api.data_descriptors.data_descriptor import PlainTermDataDescriptor
|
|
4
6
|
|
|
7
|
+
#
|
|
8
|
+
# class KnownBrandedVariable(PlainTermDataDescriptor):
|
|
9
|
+
# """
|
|
10
|
+
# A climate-related quantity or measurement, including information about sampling.
|
|
11
|
+
#
|
|
12
|
+
# The concept of a branded variable was introduced in CMIP7.
|
|
13
|
+
# A branded variable is composed of two parts.
|
|
14
|
+
# The first part is the root variable (see :py:class:`Variable`).
|
|
15
|
+
# The second is the suffix (see :py:class:`BrandedSuffix`).
|
|
16
|
+
#
|
|
17
|
+
# For further details on the development of branded variables,
|
|
18
|
+
# see [this paper draft](https://docs.google.com/document/d/19jzecgymgiiEsTDzaaqeLP6pTvLT-NzCMaq-wu-QoOc/edit?pli=1&tab=t.0).
|
|
19
|
+
# """
|
|
20
|
+
#
|
|
21
|
+
# description: str
|
|
22
|
+
# dimensions: list[str] = Field(default_factory=list)
|
|
23
|
+
# cell_methods: str
|
|
24
|
+
# variable: str
|
|
25
|
+
# label: str
|
|
26
|
+
#
|
|
27
|
+
|
|
5
28
|
|
|
6
29
|
class KnownBrandedVariable(PlainTermDataDescriptor):
|
|
7
30
|
"""
|
|
@@ -16,8 +39,38 @@ class KnownBrandedVariable(PlainTermDataDescriptor):
|
|
|
16
39
|
see [this paper draft](https://docs.google.com/document/d/19jzecgymgiiEsTDzaaqeLP6pTvLT-NzCMaq-wu-QoOc/edit?pli=1&tab=t.0).
|
|
17
40
|
"""
|
|
18
41
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
42
|
+
# # ESGVoc required fields
|
|
43
|
+
# id: str = Field(description="Unique identifier, e.g., 'ta_tavg-p19-hxy-air'")
|
|
44
|
+
# type: str = Field(default="branded_variable", description="ESGVoc type identifier")
|
|
45
|
+
# drs_name: str = Field(description="DRS name, same as id")
|
|
46
|
+
# => already in PlainTermDataDescriptor
|
|
47
|
+
|
|
48
|
+
# CF Standard Name context (flattened from hierarchy)
|
|
49
|
+
cf_standard_name: str = Field(description="CF standard name, e.g., 'air_temperature'")
|
|
50
|
+
cf_units: str = Field(description="CF standard units, e.g., 'K'")
|
|
51
|
+
cf_sn_status: str = Field(description="CF standard name status, e.g., 'approved'")
|
|
52
|
+
|
|
53
|
+
# Variable Root context (flattened from hierarchy)
|
|
54
|
+
variable_root_name: str = Field(description="Variable root name, e.g., 'ta'")
|
|
55
|
+
var_def_qualifier: str = Field(default="", description="Variable definition qualifier")
|
|
56
|
+
branding_suffix_name: str = Field(description="Branding suffix, e.g., 'tavg-p19-hxy-air'")
|
|
57
|
+
|
|
58
|
+
# Variable metadata
|
|
59
|
+
description: str = Field(description="Human-readable description")
|
|
60
|
+
dimensions: List[str] = Field(description="NetCDF dimensions")
|
|
61
|
+
cell_methods: str = Field(default="", description="CF cell_methods attribute")
|
|
62
|
+
cell_measures: str = Field(default="", description="CF cell_measures attribute")
|
|
63
|
+
history: str = Field(default="", description="Processing history")
|
|
64
|
+
realm: str = Field(description="Earth system realm, e.g., 'atmos'")
|
|
65
|
+
|
|
66
|
+
# Label components (embedded, not references)
|
|
67
|
+
temporal_label: str = Field(description="Temporal label, e.g., 'tavg'")
|
|
68
|
+
vertical_label: str = Field(description="Vertical label, e.g., 'p19'")
|
|
69
|
+
horizontal_label: str = Field(description="Horizontal label, e.g., 'hxy'")
|
|
70
|
+
area_label: str = Field(description="Area label, e.g., 'air'")
|
|
71
|
+
|
|
72
|
+
# Status
|
|
73
|
+
bn_status: str = Field(description="Branded variable status, e.g., 'accepted'")
|
|
74
|
+
|
|
75
|
+
# Additional required fields from specifications
|
|
76
|
+
positive_direction: str = Field(default="", description="Positive direction for the variable")
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
from esgvoc.api.data_descriptors.data_descriptor import
|
|
1
|
+
from esgvoc.api.data_descriptors.data_descriptor import PlainTermDataDescriptor
|
|
2
2
|
|
|
3
3
|
|
|
4
|
-
class VerticalLabel(
|
|
4
|
+
class VerticalLabel(PlainTermDataDescriptor):
|
|
5
5
|
"""
|
|
6
6
|
Vertical label.
|
|
7
7
|
|
esgvoc/api/project_specs.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
from enum import Enum
|
|
2
|
-
from typing import Annotated, Any, Literal, Optional, Protocol
|
|
3
2
|
|
|
4
|
-
from pydantic import BaseModel, ConfigDict
|
|
3
|
+
from pydantic import BaseModel, ConfigDict
|
|
5
4
|
|
|
6
5
|
|
|
7
6
|
class DrsType(str, Enum):
|
|
@@ -17,49 +16,18 @@ class DrsType(str, Enum):
|
|
|
17
16
|
"""The DRS dataset id specification type."""
|
|
18
17
|
|
|
19
18
|
|
|
20
|
-
class
|
|
21
|
-
"""
|
|
22
|
-
The kinds of DRS part (constant and collection).
|
|
23
|
-
"""
|
|
24
|
-
|
|
25
|
-
CONSTANT = "constant"
|
|
26
|
-
"""The constant part type."""
|
|
27
|
-
COLLECTION = "collection"
|
|
28
|
-
"""The collection part type."""
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
class DrsConstant(BaseModel):
|
|
32
|
-
"""
|
|
33
|
-
A constant part of a DRS specification (e.g., cmip5).
|
|
34
|
-
"""
|
|
35
|
-
|
|
36
|
-
value: str
|
|
37
|
-
"""The value of the a constant part."""
|
|
38
|
-
kind: Literal[DrsPartKind.CONSTANT] = DrsPartKind.CONSTANT
|
|
39
|
-
"""The DRS part kind."""
|
|
40
|
-
|
|
41
|
-
def __str__(self) -> str:
|
|
42
|
-
return self.value
|
|
43
|
-
|
|
19
|
+
class DrsPart(BaseModel):
|
|
20
|
+
"""A fragment of a DRS specification"""
|
|
44
21
|
|
|
45
|
-
|
|
46
|
-
"""
|
|
47
|
-
A collection part of a DRS specification (e.g., institution_id for CMIP6).
|
|
48
|
-
"""
|
|
49
|
-
|
|
50
|
-
collection_id: str
|
|
22
|
+
source_collection: str
|
|
51
23
|
"""The collection id."""
|
|
24
|
+
source_collection_term: str | None = None
|
|
25
|
+
"Specifies a specific term in the collection."
|
|
52
26
|
is_required: bool
|
|
53
27
|
"""Whether the collection is required for the DRS specification or not."""
|
|
54
|
-
kind: Literal[DrsPartKind.COLLECTION] = DrsPartKind.COLLECTION
|
|
55
|
-
"""The DRS part kind."""
|
|
56
28
|
|
|
57
29
|
def __str__(self) -> str:
|
|
58
|
-
return self.
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
DrsPart = Annotated[DrsConstant | DrsCollection, Field(discriminator="kind")]
|
|
62
|
-
"""A fragment of a DRS specification"""
|
|
30
|
+
return self.source_collection
|
|
63
31
|
|
|
64
32
|
|
|
65
33
|
class DrsSpecification(BaseModel):
|
|
@@ -69,6 +37,8 @@ class DrsSpecification(BaseModel):
|
|
|
69
37
|
|
|
70
38
|
type: DrsType
|
|
71
39
|
"""The type of the specification."""
|
|
40
|
+
regex: str
|
|
41
|
+
"""General pattern for simple checks"""
|
|
72
42
|
separator: str
|
|
73
43
|
"""The textual separator string or character."""
|
|
74
44
|
properties: dict | None = None
|
|
@@ -77,109 +47,56 @@ class DrsSpecification(BaseModel):
|
|
|
77
47
|
"""The parts of the DRS specification."""
|
|
78
48
|
|
|
79
49
|
|
|
80
|
-
class
|
|
50
|
+
class CatalogProperty(BaseModel):
|
|
81
51
|
"""
|
|
82
|
-
|
|
83
|
-
"""
|
|
84
|
-
|
|
85
|
-
STRING = "string"
|
|
86
|
-
"""String value type."""
|
|
87
|
-
INTEGER = "integer"
|
|
88
|
-
"""Integer value type."""
|
|
89
|
-
FLOAT = "float"
|
|
90
|
-
"""Float value type."""
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
class GlobalAttributeVisitor(Protocol):
|
|
94
|
-
"""
|
|
95
|
-
Specifications for a global attribute visitor.
|
|
96
|
-
"""
|
|
97
|
-
def visit_base_attribute(self,
|
|
98
|
-
attribute_name: str,
|
|
99
|
-
attribute: "GlobalAttributeSpecBase") -> Any:
|
|
100
|
-
"""Visit a base global attribute."""
|
|
101
|
-
pass
|
|
102
|
-
|
|
103
|
-
def visit_specific_attribute(self,
|
|
104
|
-
attribute_name: str,
|
|
105
|
-
attribute: "GlobalAttributeSpecSpecific") -> Any:
|
|
106
|
-
"""Visit a specific global attribute."""
|
|
107
|
-
pass
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
class GlobalAttributeSpecBase(BaseModel):
|
|
111
|
-
"""
|
|
112
|
-
Specification for a global attribute.
|
|
52
|
+
A dataset property described in a catalog.
|
|
113
53
|
"""
|
|
114
54
|
|
|
115
55
|
source_collection: str
|
|
116
|
-
"
|
|
117
|
-
|
|
118
|
-
"
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
with a specific key
|
|
128
|
-
"""
|
|
129
|
-
|
|
130
|
-
specific_key: str
|
|
131
|
-
"""If the validation is for the value of a specific key, for instance description or ui-label """
|
|
56
|
+
"The project collection that originated the property."
|
|
57
|
+
catalog_field_value_type: str
|
|
58
|
+
"The type of the field value."
|
|
59
|
+
is_required: bool
|
|
60
|
+
"Specifies if the property must be present in the dataset properties."
|
|
61
|
+
source_collection_term: str | None = None
|
|
62
|
+
"Specifies a specific term in the collection."
|
|
63
|
+
catalog_field_name: str | None = None
|
|
64
|
+
"The name of the collection referenced in the catalog."
|
|
65
|
+
source_collection_key: str | None = None
|
|
66
|
+
"Specifies a key other than drs_name in the collection."
|
|
132
67
|
|
|
133
|
-
def accept(self, attribute_name: str, visitor: GlobalAttributeVisitor) -> Any:
|
|
134
|
-
"""
|
|
135
|
-
Accept a global attribute visitor.
|
|
136
68
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
"""
|
|
143
|
-
return visitor.visit_specific_attribute(attribute_name, self)
|
|
69
|
+
class CatalogExtension(BaseModel):
|
|
70
|
+
name: str
|
|
71
|
+
"""The name of the extension"""
|
|
72
|
+
version: str
|
|
73
|
+
"""The version of the extension"""
|
|
144
74
|
|
|
145
75
|
|
|
146
|
-
|
|
76
|
+
class CatalogProperties(BaseModel):
|
|
77
|
+
name: str
|
|
78
|
+
"""The name of the catalog system."""
|
|
79
|
+
url_template: str
|
|
80
|
+
"""The URI template of the catalog system."""
|
|
81
|
+
extensions: list[CatalogExtension]
|
|
82
|
+
"""The extensions of the catalog."""
|
|
147
83
|
|
|
148
84
|
|
|
149
|
-
class
|
|
85
|
+
class CatalogSpecification(BaseModel):
|
|
150
86
|
"""
|
|
151
|
-
|
|
87
|
+
A catalog specification.
|
|
152
88
|
"""
|
|
153
89
|
|
|
154
|
-
|
|
155
|
-
"""The
|
|
156
|
-
|
|
157
|
-
def __str__(self) -> str:
|
|
158
|
-
"""Return all keys when printing."""
|
|
159
|
-
return str(list(self.specs.keys()))
|
|
160
|
-
|
|
161
|
-
def __repr__(self) -> str:
|
|
162
|
-
"""Return all keys when using repr."""
|
|
163
|
-
return f"GlobalAttributeSpecs(keys={list(self.specs.keys())})"
|
|
164
|
-
|
|
165
|
-
# Dictionary-like access methods
|
|
166
|
-
def __getitem__(self, key: str) -> GlobalAttributeSpec:
|
|
167
|
-
return self.specs[key]
|
|
168
|
-
|
|
169
|
-
def __setitem__(self, key: str, value: GlobalAttributeSpec) -> None:
|
|
170
|
-
self.specs[key] = value
|
|
171
|
-
|
|
172
|
-
def __contains__(self, key: str) -> bool:
|
|
173
|
-
return key in self.specs
|
|
174
|
-
|
|
175
|
-
def keys(self):
|
|
176
|
-
return self.specs.keys()
|
|
90
|
+
version: str
|
|
91
|
+
"""The version of the catalog."""
|
|
177
92
|
|
|
178
|
-
|
|
179
|
-
|
|
93
|
+
catalog_properties: CatalogProperties
|
|
94
|
+
"""The properties of the catalog."""
|
|
180
95
|
|
|
181
|
-
|
|
182
|
-
|
|
96
|
+
dataset_properties: list[CatalogProperty]
|
|
97
|
+
"The properties of the dataset described in a catalog."
|
|
98
|
+
file_properties: list[CatalogProperty]
|
|
99
|
+
"The properties of the files described in a catalog."
|
|
183
100
|
|
|
184
101
|
|
|
185
102
|
class ProjectSpecs(BaseModel):
|
|
@@ -191,8 +108,9 @@ class ProjectSpecs(BaseModel):
|
|
|
191
108
|
"""The project id."""
|
|
192
109
|
description: str
|
|
193
110
|
"""The description of the project."""
|
|
194
|
-
drs_specs:
|
|
111
|
+
drs_specs: dict[DrsType, DrsSpecification]
|
|
195
112
|
"""The DRS specifications of the project (directory, file name and dataset id)."""
|
|
196
|
-
|
|
197
|
-
|
|
113
|
+
# TODO: release = None when all projects have catalog_specs.yaml.
|
|
114
|
+
catalog_specs: CatalogSpecification | None = None
|
|
115
|
+
"""The catalog specifications of the project."""
|
|
198
116
|
model_config = ConfigDict(extra="allow")
|
esgvoc/api/projects.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import itertools
|
|
2
2
|
import re
|
|
3
|
-
from typing import Iterable, Sequence
|
|
3
|
+
from typing import Iterable, Sequence, cast
|
|
4
4
|
|
|
5
5
|
from sqlalchemy import text
|
|
6
6
|
from sqlmodel import Session, and_, col, select
|
|
@@ -49,22 +49,36 @@ def _get_project_session_with_exception(project_id: str) -> Session:
|
|
|
49
49
|
raise EsgvocNotFoundError(f"unable to find project '{project_id}'")
|
|
50
50
|
|
|
51
51
|
|
|
52
|
-
def
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
52
|
+
def _resolve_composite_term_part(composite_term_part: dict,
|
|
53
|
+
universe_session: Session,
|
|
54
|
+
project_session: Session) -> UTerm | PTerm | Sequence[UTerm | PTerm]:
|
|
55
|
+
if constants.TERM_ID_JSON_KEY in composite_term_part:
|
|
56
|
+
# First find the term in the universe, then in the current project
|
|
57
|
+
term_id = composite_term_part[constants.TERM_ID_JSON_KEY]
|
|
58
|
+
term_type = composite_term_part[constants.TERM_TYPE_JSON_KEY]
|
|
59
|
+
uterm = universe._get_term_in_data_descriptor(data_descriptor_id=term_type,
|
|
60
|
+
term_id=term_id, session=universe_session)
|
|
61
|
+
if uterm:
|
|
62
|
+
return uterm
|
|
63
|
+
else:
|
|
64
|
+
pterm = _get_term_in_collection(collection_id=term_type, term_id=term_id, session=project_session)
|
|
65
|
+
if pterm:
|
|
66
|
+
return pterm
|
|
67
|
+
else:
|
|
68
|
+
msg = f"unable to find the term '{term_id}' in '{term_type}'"
|
|
69
|
+
raise EsgvocNotFoundError(msg)
|
|
65
70
|
else:
|
|
66
|
-
|
|
67
|
-
|
|
71
|
+
term_type = composite_term_part[constants.TERM_TYPE_JSON_KEY]
|
|
72
|
+
data_descriptor = universe._get_data_descriptor_in_universe(term_type, universe_session)
|
|
73
|
+
if data_descriptor is not None:
|
|
74
|
+
return data_descriptor.terms
|
|
75
|
+
else:
|
|
76
|
+
collection = _get_collection_in_project(term_type, project_session)
|
|
77
|
+
if collection is not None:
|
|
78
|
+
return collection.terms
|
|
79
|
+
else:
|
|
80
|
+
msg = f"unable to find the terms of '{term_type}'"
|
|
81
|
+
raise EsgvocNotFoundError(msg)
|
|
68
82
|
|
|
69
83
|
|
|
70
84
|
def _get_composite_term_separator_parts(term: UTerm | PTerm) -> tuple[str, list]:
|
|
@@ -76,7 +90,6 @@ def _get_composite_term_separator_parts(term: UTerm | PTerm) -> tuple[str, list]
|
|
|
76
90
|
def _valid_value_composite_term_with_separator(
|
|
77
91
|
value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
|
|
78
92
|
) -> list[UniverseTermError | ProjectTermError]:
|
|
79
|
-
result = []
|
|
80
93
|
separator, parts = _get_composite_term_separator_parts(term)
|
|
81
94
|
required_indices = {i for i, p in enumerate(parts) if p.get("is_required", False)}
|
|
82
95
|
|
|
@@ -135,7 +148,9 @@ def _valid_value_composite_term_with_separator(
|
|
|
135
148
|
for id in part["id"]:
|
|
136
149
|
part_copy = dict(part)
|
|
137
150
|
part_copy["id"] = id
|
|
138
|
-
resolved_term =
|
|
151
|
+
resolved_term = _resolve_composite_term_part(part_copy, universe_session, project_session)
|
|
152
|
+
# resolved_term can't be a list of terms here.
|
|
153
|
+
resolved_term = cast(UTerm | PTerm, resolved_term)
|
|
139
154
|
errors = _valid_value(given_value, resolved_term, universe_session, project_session)
|
|
140
155
|
if not errors:
|
|
141
156
|
valid_for_this_part = True
|
|
@@ -150,44 +165,6 @@ def _valid_value_composite_term_with_separator(
|
|
|
150
165
|
return [_create_term_error(value, term)] # No valid combination found
|
|
151
166
|
|
|
152
167
|
|
|
153
|
-
# TODO: support optionality of parts of composite.
|
|
154
|
-
# It is backtrack possible for more than one missing parts.
|
|
155
|
-
def _valid_value_composite_term_with_separator2(
|
|
156
|
-
value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
|
|
157
|
-
) -> list[UniverseTermError | ProjectTermError]:
|
|
158
|
-
result = list()
|
|
159
|
-
separator, parts = _get_composite_term_separator_parts(term)
|
|
160
|
-
if separator in value:
|
|
161
|
-
splits = value.split(separator)
|
|
162
|
-
if len(splits) == len(parts):
|
|
163
|
-
for index in range(0, len(splits)):
|
|
164
|
-
given_value = splits[index]
|
|
165
|
-
if "id" not in parts[index].keys():
|
|
166
|
-
terms = universe.get_all_terms_in_data_descriptor(parts[index]["type"], None)
|
|
167
|
-
parts[index]["id"] = [term.id for term in terms]
|
|
168
|
-
if type(parts[index]["id"]) is str:
|
|
169
|
-
parts[index]["id"] = [parts[index]["id"]]
|
|
170
|
-
|
|
171
|
-
errors_list = list()
|
|
172
|
-
for id in parts[index]["id"]:
|
|
173
|
-
part_parts = dict(parts[index])
|
|
174
|
-
part_parts["id"] = id
|
|
175
|
-
resolved_term = _resolve_term(part_parts, universe_session, project_session)
|
|
176
|
-
errors = _valid_value(given_value, resolved_term, universe_session, project_session)
|
|
177
|
-
if len(errors) == 0:
|
|
178
|
-
errors_list = errors
|
|
179
|
-
break
|
|
180
|
-
else:
|
|
181
|
-
errors_list.extend(errors)
|
|
182
|
-
else:
|
|
183
|
-
result.append(_create_term_error(value, term))
|
|
184
|
-
else:
|
|
185
|
-
result.append(_create_term_error(value, term))
|
|
186
|
-
else:
|
|
187
|
-
result.append(_create_term_error(value, term))
|
|
188
|
-
return result
|
|
189
|
-
|
|
190
|
-
|
|
191
168
|
def _transform_to_pattern(term: UTerm | PTerm, universe_session: Session, project_session: Session) -> str:
|
|
192
169
|
match term.kind:
|
|
193
170
|
case TermKind.PLAIN:
|
|
@@ -201,8 +178,13 @@ def _transform_to_pattern(term: UTerm | PTerm, universe_session: Session, projec
|
|
|
201
178
|
separator, parts = _get_composite_term_separator_parts(term)
|
|
202
179
|
result = ""
|
|
203
180
|
for part in parts:
|
|
204
|
-
resolved_term =
|
|
205
|
-
|
|
181
|
+
resolved_term = _resolve_composite_term_part(part, universe_session, project_session)
|
|
182
|
+
if isinstance(resolved_term, Sequence):
|
|
183
|
+
pattern = ""
|
|
184
|
+
for r_term in resolved_term:
|
|
185
|
+
pattern += _transform_to_pattern(r_term, universe_session, project_session)
|
|
186
|
+
else:
|
|
187
|
+
pattern = _transform_to_pattern(resolved_term, universe_session, project_session)
|
|
206
188
|
result = f"{result}{pattern}{separator}"
|
|
207
189
|
result = result.rstrip(separator)
|
|
208
190
|
case _:
|
|
@@ -530,7 +512,52 @@ def get_all_terms_in_collection(
|
|
|
530
512
|
def _get_all_collections_in_project(session: Session) -> list[PCollection]:
|
|
531
513
|
project = session.get(Project, constants.SQLITE_FIRST_PK)
|
|
532
514
|
# Project can't be missing if session exists.
|
|
533
|
-
|
|
515
|
+
try:
|
|
516
|
+
return project.collections # type: ignore
|
|
517
|
+
except Exception as e:
|
|
518
|
+
# Enhanced error context for collection retrieval failures
|
|
519
|
+
import logging
|
|
520
|
+
logger = logging.getLogger(__name__)
|
|
521
|
+
logger.error(f"Failed to retrieve collections for project '{project.id}': {str(e)}")
|
|
522
|
+
|
|
523
|
+
# Use raw SQL to inspect collections without Pydantic validation
|
|
524
|
+
from sqlalchemy import text
|
|
525
|
+
try:
|
|
526
|
+
# Query raw data to identify problematic collections
|
|
527
|
+
raw_query = text("""
|
|
528
|
+
SELECT id, term_kind, data_descriptor_id
|
|
529
|
+
FROM pcollections
|
|
530
|
+
WHERE project_pk = :project_pk
|
|
531
|
+
""")
|
|
532
|
+
result = session.execute(raw_query, {"project_pk": project.pk})
|
|
533
|
+
|
|
534
|
+
problematic_collections = []
|
|
535
|
+
|
|
536
|
+
for row in result:
|
|
537
|
+
collection_id, term_kind_value, data_descriptor_id = row
|
|
538
|
+
|
|
539
|
+
# An empty string or NULL is invalid - indicates ingestion couldn't determine termkind
|
|
540
|
+
if term_kind_value == '' or term_kind_value is None:
|
|
541
|
+
problematic_collections.append((collection_id, term_kind_value, data_descriptor_id))
|
|
542
|
+
msg = f"Collection '{collection_id}' has empty term_kind (data_descriptor: " + \
|
|
543
|
+
f"{data_descriptor_id}) - CV ingestion failed to determine termkind"
|
|
544
|
+
logger.error(msg)
|
|
545
|
+
|
|
546
|
+
if problematic_collections:
|
|
547
|
+
error_details = []
|
|
548
|
+
for col_id, _, data_desc in problematic_collections:
|
|
549
|
+
error_details.append(f" • Collection '{col_id}' (data_descriptor: {data_desc}): EMPTY termkind")
|
|
550
|
+
|
|
551
|
+
error_msg = (
|
|
552
|
+
f"Found {len(problematic_collections)} collections with empty term_kind:\n" +
|
|
553
|
+
"\n".join(error_details)
|
|
554
|
+
)
|
|
555
|
+
raise ValueError(error_msg) from e
|
|
556
|
+
|
|
557
|
+
except Exception as inner_e:
|
|
558
|
+
logger.error(f"Failed to analyze problematic collections using raw SQL: {inner_e}")
|
|
559
|
+
|
|
560
|
+
raise e
|
|
534
561
|
|
|
535
562
|
|
|
536
563
|
def get_all_collections_in_project(project_id: str) -> list[str]:
|
|
@@ -547,10 +574,24 @@ def get_all_collections_in_project(project_id: str) -> list[str]:
|
|
|
547
574
|
"""
|
|
548
575
|
result = list()
|
|
549
576
|
if connection := _get_project_connection(project_id):
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
577
|
+
try:
|
|
578
|
+
with connection.create_session() as session:
|
|
579
|
+
collections = _get_all_collections_in_project(session)
|
|
580
|
+
for collection in collections:
|
|
581
|
+
result.append(collection.id)
|
|
582
|
+
except Exception as e:
|
|
583
|
+
# Enhanced error context for project collection retrieval
|
|
584
|
+
import logging
|
|
585
|
+
logger = logging.getLogger(__name__)
|
|
586
|
+
logger.error(f"Failed to get collections for project '{project_id}': {str(e)}")
|
|
587
|
+
|
|
588
|
+
# Re-raise with enhanced context
|
|
589
|
+
raise ValueError(
|
|
590
|
+
f"Failed to retrieve collections for project '{project_id}'. "
|
|
591
|
+
f"This may be due to invalid termkind values in the database. "
|
|
592
|
+
f"Check the project database for collections with empty or invalid termkind values. "
|
|
593
|
+
f"Original error: {str(e)}"
|
|
594
|
+
) from e
|
|
554
595
|
return result
|
|
555
596
|
|
|
556
597
|
|