esgvoc 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- esgvoc/__init__.py +3 -0
- esgvoc/api/__init__.py +91 -0
- esgvoc/api/data_descriptors/EMD_models/__init__.py +66 -0
- esgvoc/api/data_descriptors/EMD_models/arrangement.py +21 -0
- esgvoc/api/data_descriptors/EMD_models/calendar.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/cell_variable_type.py +20 -0
- esgvoc/api/data_descriptors/EMD_models/component_type.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/coordinate.py +52 -0
- esgvoc/api/data_descriptors/EMD_models/grid_mapping.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/grid_region.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/grid_type.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_computational_grid.py +56 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_grid_cells.py +230 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_subgrid.py +41 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_units.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/model.py +139 -0
- esgvoc/api/data_descriptors/EMD_models/model_component.py +115 -0
- esgvoc/api/data_descriptors/EMD_models/reference.py +61 -0
- esgvoc/api/data_descriptors/EMD_models/resolution.py +48 -0
- esgvoc/api/data_descriptors/EMD_models/temporal_refinement.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/truncation_method.py +17 -0
- esgvoc/api/data_descriptors/EMD_models/vertical_computational_grid.py +91 -0
- esgvoc/api/data_descriptors/EMD_models/vertical_coordinate.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/vertical_units.py +19 -0
- esgvoc/api/data_descriptors/__init__.py +159 -0
- esgvoc/api/data_descriptors/activity.py +72 -0
- esgvoc/api/data_descriptors/archive.py +5 -0
- esgvoc/api/data_descriptors/area_label.py +30 -0
- esgvoc/api/data_descriptors/branded_suffix.py +30 -0
- esgvoc/api/data_descriptors/branded_variable.py +21 -0
- esgvoc/api/data_descriptors/citation_url.py +5 -0
- esgvoc/api/data_descriptors/contact.py +5 -0
- esgvoc/api/data_descriptors/conventions.py +28 -0
- esgvoc/api/data_descriptors/creation_date.py +18 -0
- esgvoc/api/data_descriptors/data_descriptor.py +127 -0
- esgvoc/api/data_descriptors/data_specs_version.py +25 -0
- esgvoc/api/data_descriptors/date.py +5 -0
- esgvoc/api/data_descriptors/directory_date.py +22 -0
- esgvoc/api/data_descriptors/drs_specs.py +38 -0
- esgvoc/api/data_descriptors/experiment.py +215 -0
- esgvoc/api/data_descriptors/forcing_index.py +21 -0
- esgvoc/api/data_descriptors/frequency.py +48 -0
- esgvoc/api/data_descriptors/further_info_url.py +5 -0
- esgvoc/api/data_descriptors/grid.py +43 -0
- esgvoc/api/data_descriptors/horizontal_label.py +20 -0
- esgvoc/api/data_descriptors/initialization_index.py +27 -0
- esgvoc/api/data_descriptors/institution.py +80 -0
- esgvoc/api/data_descriptors/known_branded_variable.py +75 -0
- esgvoc/api/data_descriptors/license.py +31 -0
- esgvoc/api/data_descriptors/member_id.py +9 -0
- esgvoc/api/data_descriptors/mip_era.py +26 -0
- esgvoc/api/data_descriptors/model_component.py +32 -0
- esgvoc/api/data_descriptors/models_test/models.py +17 -0
- esgvoc/api/data_descriptors/nominal_resolution.py +50 -0
- esgvoc/api/data_descriptors/obs_type.py +5 -0
- esgvoc/api/data_descriptors/organisation.py +22 -0
- esgvoc/api/data_descriptors/physics_index.py +21 -0
- esgvoc/api/data_descriptors/product.py +16 -0
- esgvoc/api/data_descriptors/publication_status.py +5 -0
- esgvoc/api/data_descriptors/realization_index.py +24 -0
- esgvoc/api/data_descriptors/realm.py +16 -0
- esgvoc/api/data_descriptors/regex.py +5 -0
- esgvoc/api/data_descriptors/region.py +35 -0
- esgvoc/api/data_descriptors/resolution.py +7 -0
- esgvoc/api/data_descriptors/source.py +120 -0
- esgvoc/api/data_descriptors/source_type.py +5 -0
- esgvoc/api/data_descriptors/sub_experiment.py +5 -0
- esgvoc/api/data_descriptors/table.py +28 -0
- esgvoc/api/data_descriptors/temporal_label.py +20 -0
- esgvoc/api/data_descriptors/time_range.py +17 -0
- esgvoc/api/data_descriptors/title.py +5 -0
- esgvoc/api/data_descriptors/tracking_id.py +67 -0
- esgvoc/api/data_descriptors/variable.py +56 -0
- esgvoc/api/data_descriptors/variant_label.py +25 -0
- esgvoc/api/data_descriptors/vertical_label.py +20 -0
- esgvoc/api/project_specs.py +143 -0
- esgvoc/api/projects.py +1253 -0
- esgvoc/api/py.typed +0 -0
- esgvoc/api/pydantic_handler.py +146 -0
- esgvoc/api/report.py +127 -0
- esgvoc/api/search.py +171 -0
- esgvoc/api/universe.py +434 -0
- esgvoc/apps/__init__.py +6 -0
- esgvoc/apps/cmor_tables/__init__.py +7 -0
- esgvoc/apps/cmor_tables/cvs_table.py +948 -0
- esgvoc/apps/drs/__init__.py +0 -0
- esgvoc/apps/drs/constants.py +2 -0
- esgvoc/apps/drs/generator.py +429 -0
- esgvoc/apps/drs/report.py +540 -0
- esgvoc/apps/drs/validator.py +312 -0
- esgvoc/apps/ga/__init__.py +104 -0
- esgvoc/apps/ga/example_usage.py +315 -0
- esgvoc/apps/ga/models/__init__.py +47 -0
- esgvoc/apps/ga/models/netcdf_header.py +306 -0
- esgvoc/apps/ga/models/validator.py +491 -0
- esgvoc/apps/ga/test_ga.py +161 -0
- esgvoc/apps/ga/validator.py +277 -0
- esgvoc/apps/jsg/json_schema_generator.py +341 -0
- esgvoc/apps/jsg/templates/template.jinja +241 -0
- esgvoc/apps/test_cv/README.md +214 -0
- esgvoc/apps/test_cv/__init__.py +0 -0
- esgvoc/apps/test_cv/cv_tester.py +1611 -0
- esgvoc/apps/test_cv/example_usage.py +216 -0
- esgvoc/apps/vr/__init__.py +12 -0
- esgvoc/apps/vr/build_variable_registry.py +71 -0
- esgvoc/apps/vr/example_usage.py +60 -0
- esgvoc/apps/vr/vr_app.py +333 -0
- esgvoc/cli/clean.py +304 -0
- esgvoc/cli/cmor.py +46 -0
- esgvoc/cli/config.py +1300 -0
- esgvoc/cli/drs.py +267 -0
- esgvoc/cli/find.py +138 -0
- esgvoc/cli/get.py +155 -0
- esgvoc/cli/install.py +41 -0
- esgvoc/cli/main.py +60 -0
- esgvoc/cli/offline.py +269 -0
- esgvoc/cli/status.py +79 -0
- esgvoc/cli/test_cv.py +258 -0
- esgvoc/cli/valid.py +147 -0
- esgvoc/core/constants.py +17 -0
- esgvoc/core/convert.py +0 -0
- esgvoc/core/data_handler.py +206 -0
- esgvoc/core/db/__init__.py +3 -0
- esgvoc/core/db/connection.py +40 -0
- esgvoc/core/db/models/mixins.py +25 -0
- esgvoc/core/db/models/project.py +102 -0
- esgvoc/core/db/models/universe.py +98 -0
- esgvoc/core/db/project_ingestion.py +231 -0
- esgvoc/core/db/universe_ingestion.py +172 -0
- esgvoc/core/exceptions.py +33 -0
- esgvoc/core/logging_handler.py +26 -0
- esgvoc/core/repo_fetcher.py +345 -0
- esgvoc/core/service/__init__.py +41 -0
- esgvoc/core/service/configuration/config_manager.py +196 -0
- esgvoc/core/service/configuration/setting.py +363 -0
- esgvoc/core/service/data_merger.py +634 -0
- esgvoc/core/service/esg_voc.py +77 -0
- esgvoc/core/service/resolver_config.py +56 -0
- esgvoc/core/service/state.py +324 -0
- esgvoc/core/service/string_heuristics.py +98 -0
- esgvoc/core/service/term_cache.py +108 -0
- esgvoc/core/service/uri_resolver.py +133 -0
- esgvoc-2.0.2.dist-info/METADATA +82 -0
- esgvoc-2.0.2.dist-info/RECORD +147 -0
- esgvoc-2.0.2.dist-info/WHEEL +4 -0
- esgvoc-2.0.2.dist-info/entry_points.txt +2 -0
- esgvoc-2.0.2.dist-info/licenses/LICENSE.txt +519 -0
esgvoc/cli/valid.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
|
|
2
|
+
import re
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
import typer
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
from rich.table import Table
|
|
8
|
+
|
|
9
|
+
from esgvoc.api.projects import valid_term, valid_term_in_all_projects, valid_term_in_collection, valid_term_in_project
|
|
10
|
+
|
|
11
|
+
app = typer.Typer()
console = Console()


@app.command()
def valid(
    strings_targets: List[str] = typer.Argument(
        ...,
        help=(
            "Pairs of strings to validate against a key in the form '<StringToValidate> <Project:Collection:Term>'.\n"
            "Multiple pairs can be provided. The key '<Project:Collection:Term>' consists of three parts:\n"
            "- 'Project' (optional)\n"
            "- 'Collection' (optional)\n"
            "- 'Term' (optional)\n"
            "Only the ':' separators are mandatory. For example:\n"
            " - 'my_string ::'\n"
            " - 'my_string Project::'\n"
            " - 'my_string Project:Collection:'\n"
            " - 'my_string Project:Collection:Term'\n"
            "The function validates based on the provided parts."
        )
    ),
    verbose: bool = typer.Option(False, "-v", "--verbose", help="Provide detailed validation results")
):
    """
    Validates one or more strings against specified Project:Collection:Term configurations.\n
    \n
    Depending on the provided key structure, the function performs different validation operations:\n
    - If all are None (e.g., "::"), validates the term across all projects (`valid_term_in_all_projects`).\n
    - If Term is None (e.g., "Project:Collection:"), validates the term in the specified collection (`valid_term_in_collection`).\n
    - If Term and Collection are None (e.g., "Project::"), validates the term in the specified project (`valid_term_in_project`).\n
    - If all are specified (e.g., "Project:Collection:Term"), validates the term exactly (`valid_term`).\n
    \n
    Parameters:\n
    \tstrings_targets (List[str]): A list of validation pairs, where each pair consists of:\n
    \t\t- A string to validate.\n
    \t\t- A key in the form '<Project:Collection:Term>'.\n
    Usage :\n
    \tValid one:\n
    \tesgvocab valid IPSL cmip6plus:institution_id:ipsl\n
    \tesgvocab valid IPSL cmip6plus:institution_id:\n
    \tesgvocab valid IPSL cmip6plus::\n
    \tesgvocab valid IPSL ::\n
    \n
    \tInvalid one:\n
    \tesgvocab valid IPSL_invalid cmip6plus:institution_id:ipsl\n
    \tesgvocab valid IPSL cmip6plus:institution_id:isl <= term can't be found\n
    \tesgvocab valid IPSL cmip6plus:institutin_id:ispl <= collection can't be found\n
    \tesgvocab valid IPSL cmip6pls:institution_id:ispl <= project can't be found\n
    \n
    \tMultiple validation for all known projects: \n
    \tesgvocab valid IPSL :: IPS :: \n
    \t\tresult will be [True, False]\n
    \n
    \tesgvocab valid --verbose IPS :: IPSL ::\n
    \tresult will be \n
    \t\t┏━━━━━━━━┳━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n
    \t\t┃ String ┃ Key ┃ Result     ┃ Errors                      ┃\n
    \t\t┡━━━━━━━━╇━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n
    \t\t│ IPS    │ ::  │ ❌ Invalid │ did not found matching term │\n
    \t\t│ IPSL   │ ::  │ ✅ Valid   │ None                        │\n
    \t\t└────────┴─────┴────────────┴─────────────────────────────┘\n
    Returns:\n
    \tList[bool]: Validation results for each pair in the input.\n
    """
    results = []
    detailed_results = []

    # Arguments must come in (string, key) pairs; the original code raised an
    # unhandled IndexError on an odd number of arguments.
    if len(strings_targets) % 2 != 0:
        console.print("[red]Arguments must come in pairs: <string> <Project:Collection:Term>[/red]")
        raise typer.Exit(code=1)

    # Combine string and target into pairs.
    pairs = [strings_targets[i] + " " + strings_targets[i + 1] for i in range(0, len(strings_targets), 2)]

    # Validate each string against each target.
    for validation in pairs:
        match = re.match(r"(.+)\s+([^:]*):([^:]*):([^:]*)", validation)
        if not match:
            console.print(f"[red]Invalid input format: {validation}[/red]")
            results.append(False)
            detailed_results.append({"validation": validation, "errors": ["Invalid input format"]})
            continue

        string_to_validate, project, collection, term = match.groups()
        exception_message = None
        try:
            # Dispatch on which parts of the key were provided.
            if project and collection and term:
                validation_result = valid_term(string_to_validate, project, collection, term)
            elif project and collection:
                validation_result = valid_term_in_collection(string_to_validate, project, collection)
            elif project:
                validation_result = valid_term_in_project(string_to_validate, project)
            else:
                validation_result = valid_term_in_all_projects(string_to_validate)
        except Exception as e:
            validation_result = False
            exception_message = repr(e)

        # Exactly one entry is appended to results AND detailed_results per
        # pair, whatever the failure mode (the original implementation could
        # leave the two lists out of sync on report failures and exceptions).
        if validation_result:
            results.append(True)
            detailed_results.append({"validation": validation, "errors": []})
        else:
            results.append(False)
            if exception_message is not None:
                errors = [exception_message]
            elif validation_result == []:
                # Search-based validation found no matching term.
                errors = ["did not found matching term"]
            elif project and collection and term:
                # Full-key validation returns a report object carrying errors.
                errors = [str(error) for error in validation_result.errors]
            else:
                errors = ["validation failed"]
            detailed_results.append({"validation": validation, "errors": errors})

    # Output results.
    if verbose:
        table = Table(title="Validation Results")
        table.add_column("String", style="cyan")
        table.add_column("Key", style="magenta")
        table.add_column("Result", style="green" if all(results) else "red")
        table.add_column("Errors", style="red")

        for detail in detailed_results:
            validation = detail["validation"]
            validation_parts = validation.split()
            string = validation_parts[0]
            key = validation_parts[1] if len(validation_parts) > 1 else "::"
            result = "✅ Valid" if detail["errors"] == [] else "❌ Invalid"
            errors = "\n".join(detail["errors"]) if detail["errors"] else "None"
            table.add_row(string, key, result, errors)

        console.print(table)
    else:
        console.print(results)

    return results
|
esgvoc/core/constants.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Separator used when composing directory and file names.
DIRNAME_AND_FILENAME_SEPARATOR = "_"
# Well-known specification file names looked up in CV repositories.
PROJECT_SPECS_FILENAME = "project_specs.yaml"
DRS_SPECS_FILENAME = "drs_specs.yaml"
CATALOG_SPECS_FILENAME = "catalog_specs.yaml"
ATTRIBUTES_SPECS_FILENAME = "attr_specs.yaml"
# JSON keys read from project/term documents.
PROJECT_ID_JSON_KEY = "project_id"
# JSON-LD context file name and keys.
CONTEXT_FILENAME = "000_context.jsonld"
CONTEXT_JSON_KEY = "@context"
TERM_ID_JSON_KEY = "id"
# Keys describing composite terms (terms built from other terms).
COMPOSITE_PARTS_JSON_KEY = "parts"
COMPOSITE_SEPARATOR_JSON_KEY = "separator"
COMPOSITE_REQUIRED_KEY = "is_required"
# Keys for pattern (regex) terms and term typing.
PATTERN_JSON_KEY = "regex"
TERM_TYPE_JSON_KEY = "type"
DRS_SPECS_JSON_KEY = "drs_name"
# SQLite assigns 1 to the first auto-increment primary key.
SQLITE_FIRST_PK = 1
DATA_DESCRIPTOR_JSON_KEY = "@base"
|
esgvoc/core/convert.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
from functools import cached_property
|
|
5
|
+
from typing import Any, Optional, Dict
|
|
6
|
+
import requests
|
|
7
|
+
from pyld import jsonld
|
|
8
|
+
from pydantic import BaseModel, model_validator, ConfigDict
|
|
9
|
+
|
|
10
|
+
# Configure logging
|
|
11
|
+
_LOGGER = logging.getLogger(__name__)


def unified_document_loader(uri: str) -> Dict:
    """Load a JSON document from a local file or a remote URI.

    Returns the parsed JSON content, or an empty dict when a remote fetch
    fails (best-effort behavior relied upon by callers).
    """
    if uri.startswith(("http://", "https://")):
        # SECURITY NOTE(review): verify=False disables TLS certificate
        # verification — kept for backward compatibility, but this should be
        # re-enabled (or made configurable) for untrusted networks.
        # A timeout is set so a stalled server cannot hang the process forever.
        response = requests.get(uri, headers={"accept": "application/json"},
                                verify=False, timeout=30)
        if response.status_code == 200:
            return response.json()
        else:
            _LOGGER.error(f"Failed to fetch remote document: {response.status_code} - {response.text}")
            return {}
    else:
        # Local file: JSON documents are expected to be UTF-8 encoded.
        with open(uri, "r", encoding="utf-8") as f:
            return json.load(f)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class JsonLdResource(BaseModel):
    """A pydantic model wrapping a JSON-LD document identified by a URI.

    Provides lazy (cached) access to the raw JSON, its '@context', and its
    expanded and normalized forms via pyld.
    """

    # URI of the JSON-LD document (local path or http(s) URL).
    uri: str
    # Optional local base directory, resolved to an absolute path by the validator.
    local_path: Optional[str] = None

    model_config = ConfigDict(arbitrary_types_allowed=True)

    @model_validator(mode="before")
    @classmethod
    def set_local_path(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Set the local path to an absolute path if provided."""
        local_path = values.get("local_path")
        if local_path:
            values["local_path"] = os.path.abspath(local_path) + "/"
        # NOTE(review): this installs a *global* pyld document loader as a side
        # effect of model validation — every JsonLdResource construction
        # re-registers it.
        jsonld.set_document_loader(
            lambda uri, options: {
                "contextUrl": None,  # No special context URL
                "documentUrl": uri,  # The document's actual URL
                # The parsed JSON-LD document
                "document": unified_document_loader(uri),
            }
        )
        return values

    @cached_property
    def json_dict(self) -> Dict:
        """Fetch the original JSON data (cached after the first access)."""
        _LOGGER.debug(f"Fetching JSON data from {self.uri}")
        return unified_document_loader(self.uri)

    def _preprocess_nested_contexts(self, data: dict, context: dict) -> dict:
        """
        Pre-process data to resolve @base in nested @context definitions.
        This works around pyld's limitation with scoped contexts.

        Args:
            data: The JSON-LD data to preprocess
            context: The @context dictionary

        Returns:
            Preprocessed data with resolved nested contexts
        """
        if not isinstance(data, dict):
            return data

        result = {}

        for key, value in data.items():
            # The context itself is copied through untouched.
            if key == "@context":
                result[key] = value
                continue

            # Check if this term has a nested @context with @base
            term_def = context.get(key, {})
            if isinstance(term_def, dict) and "@context" in term_def:
                nested_context = term_def["@context"]
                base_url = nested_context.get("@base", "")

                # If the value is a string and we have a @base, prepend it
                if isinstance(value, str) and base_url and term_def.get("@type") == "@id":
                    # Don't prepend if it's already an absolute URL
                    if not value.startswith("http://") and not value.startswith("https://"):
                        # Return as {"@id": "full_url"} to preserve @id semantics
                        result[key] = {"@id": base_url + value}
                    else:
                        result[key] = {"@id": value}
                elif isinstance(value, list):
                    # Process each item in the list
                    result[key] = []
                    for item in value:
                        if isinstance(item, dict):
                            result[key].append(self._preprocess_nested_contexts(item, context))
                        elif isinstance(item, str) and base_url and term_def.get("@type") == "@id":
                            # Convert string items to {"@id": "..."} when @type is @id
                            if not item.startswith("http://") and not item.startswith("https://"):
                                result[key].append({"@id": base_url + item})
                            else:
                                result[key].append({"@id": item})
                        else:
                            result[key].append(item)
                elif isinstance(value, dict):
                    result[key] = self._preprocess_nested_contexts(value, context)
                else:
                    result[key] = value
            elif isinstance(value, list):
                # Process each item in the list
                result[key] = []
                for item in value:
                    if isinstance(item, dict):
                        result[key].append(self._preprocess_nested_contexts(item, context))
                    else:
                        result[key].append(item)
            elif isinstance(value, dict):
                result[key] = self._preprocess_nested_contexts(value, context)
            else:
                result[key] = value

        return result

    @cached_property
    def expanded(self) -> Any:
        """Expand the JSON-LD data with preprocessing for nested contexts."""
        _LOGGER.debug(f"Expanding JSON-LD data for {self.uri}")

        # Get the data and context
        data = self.json_dict

        # Get the context - it should already be the inner dictionary
        context_dict = self.context
        if isinstance(context_dict, dict) and "@context" in context_dict:
            context_dict = context_dict["@context"]

        # Preprocess to handle nested contexts with @base
        preprocessed = self._preprocess_nested_contexts(data, context_dict)

        # Add the context back if it was in the original data
        if "@context" in data:
            preprocessed["@context"] = data["@context"]

        # Expand the preprocessed data
        return jsonld.expand(preprocessed, options={"base": self.uri})

    @cached_property
    def context(self) -> Dict:
        """Fetch and return the JSON content of the '@context'.

        The context reference is resolved relative to this document's URI.
        """
        context_data = JsonLdResource(uri="/".join(self.uri.split("/")[:-1]) + "/" + self.json_dict["@context"])
        # Works only in relative path declaration

        context_value = context_data.json_dict
        if isinstance(context_value, str):
            # It's a URI, fetch it
            _LOGGER.info(f"Fetching context from URI: {context_value}")
            return unified_document_loader(context_value)
        elif isinstance(context_value, dict):
            # Embedded context
            _LOGGER.info("Using embedded context.")
            return context_value
        else:
            _LOGGER.warning("No valid '@context' found.")
            return {}

    @cached_property
    def normalized(self) -> str:
        """Normalize the JSON-LD data."""
        _LOGGER.info(f"Normalizing JSON-LD data for {self.uri}")
        # NOTE(review): the URI (not the parsed document) is passed to
        # jsonld.normalize, relying on pyld's document loader to dereference
        # it — confirm this is intended rather than passing self.json_dict.
        return jsonld.normalize(self.uri, options={"algorithm": "URDNA2015", "format": "application/n-quads"})

    def _extract_model_key(self, uri: str) -> Optional[str]:
        """Extract a model key (second-to-last path segment) from the URI."""
        parts = uri.strip("/").split("/")
        if len(parts) >= 2:
            return parts[-2]
        return None

    @property
    def info(self) -> str:
        """Return a detailed summary of the data (raw, expanded, normalized)."""
        res = f"{'#' * 100}\n"
        res += f"### {self.uri.split('/')[-1]} ###\n"
        # NOTE(review): the JSON version is dumped twice (here and below) —
        # likely an accidental duplicate, kept for byte-identical output.
        res += f"JSON Version:\n {json.dumps(self.json_dict, indent=2)}\n"
        res += f"URI: {self.uri}\n"
        res += f"JSON Version:\n {json.dumps(self.json_dict, indent=2)}\n"
        res += f"Expanded Version:\n {json.dumps(self.expanded, indent=2)}\n"
        res += f"Normalized Version:\n {self.normalized}\n"
        return res
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
# Manual smoke-test entry point: prints the full info dump of one term file.
if __name__ == "__main__":
    # For Universe
    # online
    # d = Data(uri = "https://espri-mod.github.io/mip-cmor-tables/activity/cmip.json")
    # print(d.info)
    # offline
    # print(Data(uri = ".cache/repos/mip-cmor-tables/activity/cmip.json").info)
    # for Project
    # d = Data(uri = "https://espri-mod.github.io/CMIP6Plus_CVs/activity_id/cmip.json")
    # print(d.info)
    # offline
    print(JsonLdResource(uri=".cache/repos/CMIP6Plus_CVs/activity_id/cmip.json").info)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import yaml
|
|
5
|
+
from sqlalchemy import Engine
|
|
6
|
+
from sqlmodel import Session, create_engine
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DBConnection:
    """Thin wrapper binding a SQLModel/SQLAlchemy engine to a SQLite file."""

    SQLITE_URL_PREFIX = 'sqlite://'

    def __init__(self, db_file_path: Path, echo: bool = False) -> None:
        """Open (or create) the SQLite database at *db_file_path*.

        :param db_file_path: path of the SQLite file.
        :param echo: when True, the engine logs every emitted SQL statement.
        """
        url = f'{DBConnection.SQLITE_URL_PREFIX}/{db_file_path}'
        self.engine = create_engine(url, echo=echo)
        self.name = db_file_path.stem
        self.file_path = db_file_path.absolute()

    def set_echo(self, echo: bool) -> None:
        """Toggle SQL statement logging on the underlying engine."""
        self.engine.echo = echo

    def get_engine(self) -> Engine:
        """Return the underlying SQLAlchemy engine."""
        return self.engine

    def create_session(self) -> Session:
        """Open a new SQLModel session bound to this connection's engine."""
        return Session(self.engine)

    def get_name(self) -> str | None:
        """Return the database name (file stem of the SQLite file)."""
        return self.name

    def get_file_path(self) -> Path:
        """Return the absolute path of the SQLite file."""
        return self.file_path
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def read_json_file(json_file_path: Path) -> dict:
    """Parse the JSON file at *json_file_path* and return its content."""
    raw_content = json_file_path.read_text()
    return json.loads(raw_content)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def read_yaml_file(yaml_file_path: Path) -> dict:
    """Safely parse the YAML file at *yaml_file_path* and return its content."""
    with open(yaml_file_path, 'r') as stream:
        return yaml.safe_load(stream)
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
|
|
3
|
+
from sqlmodel import Field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class TermKind(Enum):
    """
    The kinds of term.
    """
    PLAIN = "plain"
    """End written term."""  # NOTE(review): possibly means "hand-written term" — confirm
    PATTERN = "pattern"
    """Regex based terms"""
    COMPOSITE = "composite"
    """Term composed of terms."""
    MIXED = 'mixed'
    """To be defined."""
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class PkMixin:
    """Mixin adding an auto-increment integer primary key column."""
    pk: int | None = Field(default=None, primary_key=True)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class IdMixin:
    """Mixin adding an indexed string identifier column."""
    id: str = Field(index=True)
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import sqlalchemy as sa
|
|
5
|
+
from sqlalchemy import text
|
|
6
|
+
from sqlalchemy.dialects.sqlite import JSON
|
|
7
|
+
from sqlmodel import Column, Field, Relationship, SQLModel
|
|
8
|
+
|
|
9
|
+
import esgvoc.core.db.connection as db
|
|
10
|
+
from esgvoc.core.db.models.mixins import IdMixin, PkMixin, TermKind
|
|
11
|
+
from esgvoc.core.exceptions import EsgvocDbError
|
|
12
|
+
|
|
13
|
+
_LOGGER = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Project(SQLModel, PkMixin, IdMixin, table=True):
    """A CV project (e.g. CMIP6Plus), owning a set of collections."""
    __tablename__ = "projects"
    # Raw project specification stored as a JSON column.
    specs: dict = Field(sa_column=sa.Column(JSON))
    # Git commit hash of the CV repository this row was ingested from.
    git_hash: str
    collections: list["PCollection"] = Relationship(back_populates="project")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class PCollection(SQLModel, PkMixin, IdMixin, table=True):
    """A project collection, mapping to a universe data descriptor."""
    __tablename__ = "pcollections"
    # Id of the universe data descriptor this collection is based on.
    data_descriptor_id: str = Field(index=True)
    # JSON-LD context of the collection.
    context: dict = Field(sa_column=sa.Column(JSON))
    project_pk: int | None = Field(default=None, foreign_key="projects.pk")
    project: Project = Relationship(back_populates="collections")
    terms: list["PTerm"] = Relationship(back_populates="collection")
    # Kind shared by the terms of the collection (plain, pattern, composite, mixed).
    term_kind: TermKind = Field(sa_column=Column(sa.Enum(TermKind)))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Well, the following instructions are not data duplication. It is more building an index.
|
|
34
|
+
# Read: https://sqlite.org/fts5.html
|
|
35
|
+
class PCollectionFTS5(SQLModel, PkMixin, IdMixin, table=True):
    """SQLModel mapping of the FTS5 full-text index over pcollections.

    Not a data duplication: the virtual table is created by raw SQL in
    project_create_db with content=pcollections (an external-content index).
    """
    __tablename__ = "pcollections_fts5"
    data_descriptor_id: str
    context: dict = Field(sa_column=sa.Column(JSON))
    project_pk: int | None = Field(default=None, foreign_key="projects.pk")
    term_kind: TermKind = Field(sa_column=Column(sa.Enum(TermKind)))
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class PTerm(SQLModel, PkMixin, IdMixin, table=True):
    """A term belonging to a project collection."""
    __tablename__ = "pterms"
    # Raw term specification stored as a JSON column.
    specs: dict = Field(sa_column=sa.Column(JSON))
    kind: TermKind = Field(sa_column=Column(sa.Enum(TermKind)))
    collection_pk: int | None = Field(default=None, foreign_key="pcollections.pk")
    collection: PCollection = Relationship(back_populates="terms")
    # Expression index on the 'drs_name' key inside the specs JSON column,
    # speeding up DRS name lookups.
    __table_args__ = (sa.Index("drs_name_index", specs.sa_column["drs_name"]), )  # type: ignore
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Well, the following instructions are not data duplication. It is more building an index.
|
|
53
|
+
# Read: https://sqlite.org/fts5.html
|
|
54
|
+
class PTermFTS5(SQLModel, PkMixin, IdMixin, table=True):
    """SQLModel mapping of the FTS5 full-text index over pterms.

    Not a data duplication: the virtual table is created by raw SQL in
    project_create_db with content=pterms (an external-content index).
    """
    __tablename__ = "pterms_fts5"
    specs: dict = Field(sa_column=sa.Column(JSON))
    kind: TermKind = Field(sa_column=Column(sa.Enum(TermKind)))
    collection_pk: int | None = Field(default=None, foreign_key="pcollections.pk")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def project_create_db(db_file_path: Path):
    """Create the project SQLite database, its tables and FTS5 indexes.

    :param db_file_path: path of the SQLite file to create.
    :raises EsgvocDbError: when the file, the tables or an FTS5 virtual
        table cannot be created.
    """
    try:
        connection = db.DBConnection(db_file_path)
    except Exception as e:
        msg = f'unable to create SQlite file at {db_file_path}'
        _LOGGER.fatal(msg)
        raise EsgvocDbError(msg) from e
    try:
        # Do not include pterms_fts5 table: it is build from a raw SQL query.
        tables_to_be_created = [SQLModel.metadata.tables['projects'],
                                SQLModel.metadata.tables['pcollections'],
                                SQLModel.metadata.tables['pterms']]
        SQLModel.metadata.create_all(connection.get_engine(), tables=tables_to_be_created)
    except Exception as e:
        msg = f'unable to create tables in SQLite database at {db_file_path}'
        _LOGGER.fatal(msg)
        raise EsgvocDbError(msg) from e
    # FTS5 virtual tables must be created through raw SQL (external-content
    # indexes over the regular tables). The two creations share one helper.
    _project_create_fts5_table(
        connection, db_file_path, 'pterms_fts5',
        'CREATE VIRTUAL TABLE IF NOT EXISTS pterms_fts5 USING '
        'fts5(pk, id, specs, kind, collection_pk, content=pterms, content_rowid=pk, prefix=3);')
    _project_create_fts5_table(
        connection, db_file_path, 'pcollections_fts5',
        'CREATE VIRTUAL TABLE IF NOT EXISTS pcollections_fts5 USING '
        'fts5(pk, id, data_descriptor_id, context, project_pk, '
        'term_kind, content=pcollections, content_rowid=pk, prefix=3);')


def _project_create_fts5_table(connection, db_file_path: Path, table_name: str, sql_query: str) -> None:
    """Execute the raw DDL creating one FTS5 virtual table; wrap failures in EsgvocDbError."""
    try:
        with connection.create_session() as session:
            session.exec(text(sql_query))  # type: ignore
            session.commit()
    except Exception as e:
        msg = f'unable to create table {table_name} for {db_file_path}'
        _LOGGER.fatal(msg)
        raise EsgvocDbError(msg) from e


if __name__ == "__main__":
    pass
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import sqlalchemy as sa
|
|
5
|
+
from sqlalchemy import text
|
|
6
|
+
from sqlalchemy.dialects.sqlite import JSON
|
|
7
|
+
from sqlmodel import Column, Field, Relationship, SQLModel
|
|
8
|
+
|
|
9
|
+
import esgvoc.core.db.connection as db
|
|
10
|
+
from esgvoc.core.db.models.mixins import IdMixin, PkMixin, TermKind
|
|
11
|
+
from esgvoc.core.exceptions import EsgvocDbError
|
|
12
|
+
|
|
13
|
+
_LOGGER = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Universe(SQLModel, PkMixin, table=True):
    """The CV universe, owning all data descriptors."""
    __tablename__ = "universes"
    # Git commit hash of the universe repository this row was ingested from.
    git_hash: str
    data_descriptors: list["UDataDescriptor"] = Relationship(back_populates="universe")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class UDataDescriptor(SQLModel, PkMixin, IdMixin, table=True):
    """A universe data descriptor (a category of terms)."""
    __tablename__ = "udata_descriptors"
    # JSON-LD context of the data descriptor.
    context: dict = Field(sa_column=sa.Column(JSON))
    universe_pk: int | None = Field(default=None, foreign_key="universes.pk")
    universe: Universe = Relationship(back_populates="data_descriptors")
    terms: list["UTerm"] = Relationship(back_populates="data_descriptor")
    # Kind shared by the terms of this descriptor (plain, pattern, composite, mixed).
    term_kind: TermKind = Field(sa_column=Column(sa.Enum(TermKind)))
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Well, the following instructions are not data duplication. It is more building an index.
|
|
32
|
+
# Read: https://sqlite.org/fts5.html
|
|
33
|
+
class UDataDescriptorFTS5(SQLModel, PkMixin, IdMixin, table=True):
    """SQLModel mapping of the FTS5 full-text index over udata_descriptors.

    Not a data duplication: the virtual table is created by raw SQL in
    universe_create_db with content=udata_descriptors (an external-content index).
    """
    __tablename__ = "udata_descriptors_fts5"
    context: dict = Field(sa_column=sa.Column(JSON))
    universe_pk: int | None = Field(default=None, foreign_key="universes.pk")
    term_kind: TermKind = Field(sa_column=Column(sa.Enum(TermKind)))
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class UTerm(SQLModel, PkMixin, IdMixin, table=True):
    """A term of the universe, owned by a data descriptor."""
    __tablename__ = "uterms"
    # Raw term specification stored as a JSON column.
    specs: dict = Field(sa_column=sa.Column(JSON))
    kind: TermKind = Field(sa_column=Column(sa.Enum(TermKind)))
    data_descriptor_pk: int | None = Field(default=None, foreign_key="udata_descriptors.pk")
    data_descriptor: UDataDescriptor = Relationship(back_populates="terms")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# Well, the following instructions are not data duplication. It is more building an index.
|
|
49
|
+
# Read: https://sqlite.org/fts5.html
|
|
50
|
+
class UTermFTS5(SQLModel, PkMixin, IdMixin, table=True):
    """SQLModel mapping of the FTS5 full-text index over uterms.

    Not a data duplication: the virtual table is created by raw SQL in
    universe_create_db with content=uterms (an external-content index).
    """
    __tablename__ = "uterms_fts5"
    specs: dict = Field(sa_column=sa.Column(JSON))
    kind: TermKind = Field(sa_column=Column(sa.Enum(TermKind)))
    data_descriptor_pk: int | None = Field(default=None, foreign_key="udata_descriptors.pk")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def universe_create_db(db_file_path: Path) -> None:
    """Create the universe SQLite database, its tables and FTS5 indexes.

    :param db_file_path: path of the SQLite file to create.
    :raises EsgvocDbError: when the file, the tables or an FTS5 virtual
        table cannot be created.
    """
    try:
        connection = db.DBConnection(db_file_path)
    except Exception as e:
        msg = f'unable to create SQLite file at {db_file_path}'
        _LOGGER.fatal(msg)
        raise EsgvocDbError(msg) from e
    try:
        # Avoid creating project tables.
        tables_to_be_created = [SQLModel.metadata.tables['uterms'],
                                SQLModel.metadata.tables['udata_descriptors'],
                                SQLModel.metadata.tables['universes']]
        SQLModel.metadata.create_all(connection.get_engine(), tables=tables_to_be_created)
    except Exception as e:
        msg = f'unable to create tables in SQLite database at {db_file_path}'
        _LOGGER.fatal(msg)
        raise EsgvocDbError(msg) from e
    # FTS5 virtual tables must be created through raw SQL (external-content
    # indexes over the regular tables). The two creations share one helper.
    _universe_create_fts5_table(
        connection, db_file_path, 'uterms_fts5',
        'CREATE VIRTUAL TABLE IF NOT EXISTS uterms_fts5 USING '
        'fts5(pk, id, specs, kind, data_descriptor_pk, content=uterms, content_rowid=pk, prefix=3);')
    _universe_create_fts5_table(
        connection, db_file_path, 'udata_descriptors_fts5',
        'CREATE VIRTUAL TABLE IF NOT EXISTS udata_descriptors_fts5 USING '
        'fts5(pk, id, universe_pk, context, '
        'term_kind, content=udata_descriptors, content_rowid=pk, prefix=3);')


def _universe_create_fts5_table(connection, db_file_path: Path, table_name: str, sql_query: str) -> None:
    """Execute the raw DDL creating one FTS5 virtual table; wrap failures in EsgvocDbError."""
    try:
        with connection.create_session() as session:
            session.exec(text(sql_query))  # type: ignore
            session.commit()
    except Exception as e:
        msg = f'unable to create table {table_name} for {db_file_path}'
        _LOGGER.fatal(msg)
        raise EsgvocDbError(msg) from e


if __name__ == "__main__":
    pass
|