esgvoc 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- esgvoc/__init__.py +3 -0
- esgvoc/api/__init__.py +91 -0
- esgvoc/api/data_descriptors/EMD_models/__init__.py +66 -0
- esgvoc/api/data_descriptors/EMD_models/arrangement.py +21 -0
- esgvoc/api/data_descriptors/EMD_models/calendar.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/cell_variable_type.py +20 -0
- esgvoc/api/data_descriptors/EMD_models/component_type.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/coordinate.py +52 -0
- esgvoc/api/data_descriptors/EMD_models/grid_mapping.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/grid_region.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/grid_type.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_computational_grid.py +56 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_grid_cells.py +230 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_subgrid.py +41 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_units.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/model.py +139 -0
- esgvoc/api/data_descriptors/EMD_models/model_component.py +115 -0
- esgvoc/api/data_descriptors/EMD_models/reference.py +61 -0
- esgvoc/api/data_descriptors/EMD_models/resolution.py +48 -0
- esgvoc/api/data_descriptors/EMD_models/temporal_refinement.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/truncation_method.py +17 -0
- esgvoc/api/data_descriptors/EMD_models/vertical_computational_grid.py +91 -0
- esgvoc/api/data_descriptors/EMD_models/vertical_coordinate.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/vertical_units.py +19 -0
- esgvoc/api/data_descriptors/__init__.py +159 -0
- esgvoc/api/data_descriptors/activity.py +72 -0
- esgvoc/api/data_descriptors/archive.py +5 -0
- esgvoc/api/data_descriptors/area_label.py +30 -0
- esgvoc/api/data_descriptors/branded_suffix.py +30 -0
- esgvoc/api/data_descriptors/branded_variable.py +21 -0
- esgvoc/api/data_descriptors/citation_url.py +5 -0
- esgvoc/api/data_descriptors/contact.py +5 -0
- esgvoc/api/data_descriptors/conventions.py +28 -0
- esgvoc/api/data_descriptors/creation_date.py +18 -0
- esgvoc/api/data_descriptors/data_descriptor.py +127 -0
- esgvoc/api/data_descriptors/data_specs_version.py +25 -0
- esgvoc/api/data_descriptors/date.py +5 -0
- esgvoc/api/data_descriptors/directory_date.py +22 -0
- esgvoc/api/data_descriptors/drs_specs.py +38 -0
- esgvoc/api/data_descriptors/experiment.py +215 -0
- esgvoc/api/data_descriptors/forcing_index.py +21 -0
- esgvoc/api/data_descriptors/frequency.py +48 -0
- esgvoc/api/data_descriptors/further_info_url.py +5 -0
- esgvoc/api/data_descriptors/grid.py +43 -0
- esgvoc/api/data_descriptors/horizontal_label.py +20 -0
- esgvoc/api/data_descriptors/initialization_index.py +27 -0
- esgvoc/api/data_descriptors/institution.py +80 -0
- esgvoc/api/data_descriptors/known_branded_variable.py +75 -0
- esgvoc/api/data_descriptors/license.py +31 -0
- esgvoc/api/data_descriptors/member_id.py +9 -0
- esgvoc/api/data_descriptors/mip_era.py +26 -0
- esgvoc/api/data_descriptors/model_component.py +32 -0
- esgvoc/api/data_descriptors/models_test/models.py +17 -0
- esgvoc/api/data_descriptors/nominal_resolution.py +50 -0
- esgvoc/api/data_descriptors/obs_type.py +5 -0
- esgvoc/api/data_descriptors/organisation.py +22 -0
- esgvoc/api/data_descriptors/physics_index.py +21 -0
- esgvoc/api/data_descriptors/product.py +16 -0
- esgvoc/api/data_descriptors/publication_status.py +5 -0
- esgvoc/api/data_descriptors/realization_index.py +24 -0
- esgvoc/api/data_descriptors/realm.py +16 -0
- esgvoc/api/data_descriptors/regex.py +5 -0
- esgvoc/api/data_descriptors/region.py +35 -0
- esgvoc/api/data_descriptors/resolution.py +7 -0
- esgvoc/api/data_descriptors/source.py +120 -0
- esgvoc/api/data_descriptors/source_type.py +5 -0
- esgvoc/api/data_descriptors/sub_experiment.py +5 -0
- esgvoc/api/data_descriptors/table.py +28 -0
- esgvoc/api/data_descriptors/temporal_label.py +20 -0
- esgvoc/api/data_descriptors/time_range.py +17 -0
- esgvoc/api/data_descriptors/title.py +5 -0
- esgvoc/api/data_descriptors/tracking_id.py +67 -0
- esgvoc/api/data_descriptors/variable.py +56 -0
- esgvoc/api/data_descriptors/variant_label.py +25 -0
- esgvoc/api/data_descriptors/vertical_label.py +20 -0
- esgvoc/api/project_specs.py +143 -0
- esgvoc/api/projects.py +1253 -0
- esgvoc/api/py.typed +0 -0
- esgvoc/api/pydantic_handler.py +146 -0
- esgvoc/api/report.py +127 -0
- esgvoc/api/search.py +171 -0
- esgvoc/api/universe.py +434 -0
- esgvoc/apps/__init__.py +6 -0
- esgvoc/apps/cmor_tables/__init__.py +7 -0
- esgvoc/apps/cmor_tables/cvs_table.py +948 -0
- esgvoc/apps/drs/__init__.py +0 -0
- esgvoc/apps/drs/constants.py +2 -0
- esgvoc/apps/drs/generator.py +429 -0
- esgvoc/apps/drs/report.py +540 -0
- esgvoc/apps/drs/validator.py +312 -0
- esgvoc/apps/ga/__init__.py +104 -0
- esgvoc/apps/ga/example_usage.py +315 -0
- esgvoc/apps/ga/models/__init__.py +47 -0
- esgvoc/apps/ga/models/netcdf_header.py +306 -0
- esgvoc/apps/ga/models/validator.py +491 -0
- esgvoc/apps/ga/test_ga.py +161 -0
- esgvoc/apps/ga/validator.py +277 -0
- esgvoc/apps/jsg/json_schema_generator.py +341 -0
- esgvoc/apps/jsg/templates/template.jinja +241 -0
- esgvoc/apps/test_cv/README.md +214 -0
- esgvoc/apps/test_cv/__init__.py +0 -0
- esgvoc/apps/test_cv/cv_tester.py +1611 -0
- esgvoc/apps/test_cv/example_usage.py +216 -0
- esgvoc/apps/vr/__init__.py +12 -0
- esgvoc/apps/vr/build_variable_registry.py +71 -0
- esgvoc/apps/vr/example_usage.py +60 -0
- esgvoc/apps/vr/vr_app.py +333 -0
- esgvoc/cli/clean.py +304 -0
- esgvoc/cli/cmor.py +46 -0
- esgvoc/cli/config.py +1300 -0
- esgvoc/cli/drs.py +267 -0
- esgvoc/cli/find.py +138 -0
- esgvoc/cli/get.py +155 -0
- esgvoc/cli/install.py +41 -0
- esgvoc/cli/main.py +60 -0
- esgvoc/cli/offline.py +269 -0
- esgvoc/cli/status.py +79 -0
- esgvoc/cli/test_cv.py +258 -0
- esgvoc/cli/valid.py +147 -0
- esgvoc/core/constants.py +17 -0
- esgvoc/core/convert.py +0 -0
- esgvoc/core/data_handler.py +206 -0
- esgvoc/core/db/__init__.py +3 -0
- esgvoc/core/db/connection.py +40 -0
- esgvoc/core/db/models/mixins.py +25 -0
- esgvoc/core/db/models/project.py +102 -0
- esgvoc/core/db/models/universe.py +98 -0
- esgvoc/core/db/project_ingestion.py +231 -0
- esgvoc/core/db/universe_ingestion.py +172 -0
- esgvoc/core/exceptions.py +33 -0
- esgvoc/core/logging_handler.py +26 -0
- esgvoc/core/repo_fetcher.py +345 -0
- esgvoc/core/service/__init__.py +41 -0
- esgvoc/core/service/configuration/config_manager.py +196 -0
- esgvoc/core/service/configuration/setting.py +363 -0
- esgvoc/core/service/data_merger.py +634 -0
- esgvoc/core/service/esg_voc.py +77 -0
- esgvoc/core/service/resolver_config.py +56 -0
- esgvoc/core/service/state.py +324 -0
- esgvoc/core/service/string_heuristics.py +98 -0
- esgvoc/core/service/term_cache.py +108 -0
- esgvoc/core/service/uri_resolver.py +133 -0
- esgvoc-2.0.2.dist-info/METADATA +82 -0
- esgvoc-2.0.2.dist-info/RECORD +147 -0
- esgvoc-2.0.2.dist-info/WHEEL +4 -0
- esgvoc-2.0.2.dist-info/entry_points.txt +2 -0
- esgvoc-2.0.2.dist-info/licenses/LICENSE.txt +519 -0
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import traceback
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
from sqlalchemy import text
|
|
7
|
+
|
|
8
|
+
import esgvoc.core.constants
|
|
9
|
+
import esgvoc.core.db.connection as db
|
|
10
|
+
import esgvoc.core.service as service
|
|
11
|
+
from esgvoc.core.data_handler import JsonLdResource
|
|
12
|
+
from esgvoc.core.db.connection import DBConnection, read_json_file, read_yaml_file
|
|
13
|
+
from esgvoc.core.db.models.mixins import TermKind
|
|
14
|
+
from esgvoc.core.db.models.project import PCollection, Project, PTerm
|
|
15
|
+
from esgvoc.core.exceptions import EsgvocDbError
|
|
16
|
+
from esgvoc.core.service.data_merger import DataMerger
|
|
17
|
+
|
|
18
|
+
_LOGGER = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def infer_term_kind(json_specs: dict) -> TermKind:
    """Deduce the kind of a term from the keys present in its JSON specs.

    A spec containing the pattern key is a PATTERN term, one containing the
    composite-parts key is a COMPOSITE term; everything else is PLAIN.
    """
    constants = esgvoc.core.constants
    if constants.PATTERN_JSON_KEY in json_specs:
        return TermKind.PATTERN
    if constants.COMPOSITE_PARTS_JSON_KEY in json_specs:
        return TermKind.COMPOSITE
    return TermKind.PLAIN
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def ingest_metadata_project(connection: DBConnection, git_hash):
    """Persist a bare Project row holding only metadata.

    The project id is derived from the stem of the connection's database file
    name; specs are left empty (they are filled by a full ingestion).
    """
    project_id = str(connection.file_path.stem)
    with connection.create_session() as session:
        session.add(Project(id=project_id, git_hash=git_hash, specs={}))
        session.commit()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def get_data_descriptor_id_from_context(collection_context: dict) -> str:
    """Extract the data descriptor id referenced by a collection context.

    The JSON-LD context maps the data descriptor key to a URL; the id is the
    last path segment of that URL.
    """
    context = collection_context[esgvoc.core.constants.CONTEXT_JSON_KEY]
    data_descriptor_url = context[esgvoc.core.constants.DATA_DESCRIPTOR_JSON_KEY]
    return Path(data_descriptor_url).name
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def instantiate_project_term(
    universe_term_json_specs: dict, project_term_json_specs_update: dict, pydantic_class: type[BaseModel]
) -> dict:
    """Build the specs of a project term by overriding a universe term.

    The universe specs are validated through the given pydantic class, a deep
    copy is taken with the project-level overrides applied, and the result is
    dumped back to a plain dict.
    """
    base_term = pydantic_class(**universe_term_json_specs)
    overridden_term = base_term.model_copy(update=project_term_json_specs_update, deep=True)
    return overridden_term.model_dump()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def ingest_collection(collection_dir_path: Path, project: Project, project_db_session) -> None:
    """Ingest every term of one project collection into the project database.

    Reads the collection's JSON-LD context, creates a ``PCollection`` row,
    then merges and resolves each ``*.json`` term file in the directory and
    adds one ``PTerm`` row per term. Failures on individual terms are logged
    and skipped; rows are added to the session but NOT committed here (the
    caller commits).

    :param collection_dir_path: directory holding the collection context and term files
    :param project: owning Project row, already attached to the session
    :param project_db_session: open project DB session
    :raises EsgvocDbError: if the collection context file cannot be read
    """
    collection_id = collection_dir_path.name
    collection_context_file_path = collection_dir_path.joinpath(esgvoc.core.constants.CONTEXT_FILENAME)
    try:
        collection_context = read_json_file(collection_context_file_path)
        data_descriptor_id = get_data_descriptor_id_from_context(collection_context)
    except Exception as e:
        msg = f"unable to read project context file {collection_context_file_path}"
        _LOGGER.fatal(msg)
        raise EsgvocDbError(msg) from e
    # [KEEP]
    # term_kind is left empty for now: it is only known once a term is added
    # (hypothesis: all terms of a collection have the same kind).  # noqa E116
    collection = PCollection(
        id=collection_id,
        context=collection_context,
        project=project,
        data_descriptor_id=data_descriptor_id,
        term_kind="",
    )
    term_kind_collection = None

    for term_file_path in collection_dir_path.iterdir():
        _LOGGER.debug(f"found term path : {term_file_path}")
        # Only plain .json files are terms; the .jsonld context is skipped.
        if term_file_path.is_file() and term_file_path.suffix == ".json":
            try:
                # Map both universe and project URLs to their local paths
                locally_avail = {
                    "https://esgvoc.ipsl.fr/resource/universe": service.current_state.universe.local_path,
                    f"https://esgvoc.ipsl.fr/resource/{project.id}": str(collection_dir_path.parent),
                }
                merger = DataMerger(
                    data=JsonLdResource(uri=str(term_file_path)),
                    locally_available=locally_avail,
                    allowed_base_uris={
                        "https://esgvoc.ipsl.fr/resource/universe",
                        f"https://esgvoc.ipsl.fr/resource/{project.id}",
                    },
                )
                merged_data = merger.merge_linked_json()[-1]
                # Resolve all nested @id references using merged context
                # IMPORTANT: Use universe path for context because:
                # 1. Universe context defines the data structure and esgvoc_resolve_modes
                # 2. Project terms are typically lightweight references to universe terms
                # 3. Even when overriding, the type definition (and resolve modes) live in universe
                json_specs = merger.resolve_merged_ids(
                    merged_data, context_base_path=service.current_state.universe.local_path
                )

                term_kind = infer_term_kind(json_specs)
                term_id = json_specs["id"]

                # First successfully parsed term fixes the collection's kind.
                if term_kind_collection is None:
                    term_kind_collection = term_kind

            except Exception as e:
                _LOGGER.error(
                    f"❌ INGESTION FAILURE - Term skipped\n"
                    f"  File: {term_file_path}\n"
                    f"  Collection: {collection_id}\n"
                    f"  Project: {project.id}\n"
                    f"  Error Type: {type(e).__name__}\n"
                    f"  Error Message: {str(e)}\n"
                    f"  Full Traceback:\n{traceback.format_exc()}"
                )
                continue
            try:
                term = PTerm(
                    id=term_id,
                    specs=json_specs,
                    collection=collection,
                    kind=term_kind,
                )
                project_db_session.add(term)
            except Exception as e:
                _LOGGER.error(
                    f"❌ DATABASE INSERTION FAILURE\n"
                    f"  Term ID: {term_id}\n"
                    f"  Collection: {collection_id}\n"
                    f"  Project: {project.id}\n"
                    f"  Error Type: {type(e).__name__}\n"
                    f"  Error Message: {str(e)}\n"
                    f"  Full Traceback:\n{traceback.format_exc()}"
                )
                continue
    # Report ingestion results for this collection
    json_file_count = len([f for f in collection_dir_path.glob("*.json")])
    ingested_term_count = len([t for t in collection.terms])
    _LOGGER.info(
        f"Collection '{collection_id}' in project '{project.id}': "
        f"{ingested_term_count}/{json_file_count} terms ingested"
    )
    if ingested_term_count < json_file_count:
        _LOGGER.warning(
            f"⚠️ {json_file_count - ingested_term_count} term(s) failed to ingest "
            f"in collection '{collection_id}'. See error messages above."
        )
    if term_kind_collection is not None:
        collection.term_kind = term_kind_collection
    else:
        # If no terms were found, default to PLAIN
        _LOGGER.warning(
            f"TermKind was not auto-detected for collection '{collection_id}' in project '{project.id}'. "
            f"No terms were successfully ingested. Defaulting to PLAIN."
        )
        collection.term_kind = TermKind.PLAIN
    project_db_session.add(collection)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def ingest_project(project_dir_path: Path, project_db_file_path: Path, git_hash: str):
    """Ingest a whole project repository into its SQLite database.

    Reads the project/DRS/catalog/attribute spec files, creates the Project
    row, ingests every collection directory (those containing a
    ``000_context.jsonld``), then populates the FTS5 full-text index tables.

    :param project_dir_path: root directory of the project repository
    :param project_db_file_path: path of the target project SQLite file
    :param git_hash: git hash of the repository state being ingested
    :raises EsgvocDbError: on spec-file read errors, collection ingestion
        errors, or FTS5 index population errors
    """
    try:
        project_connection = db.DBConnection(project_db_file_path)
    except Exception as e:
        msg = f"unable to read project SQLite file at {project_db_file_path}"
        _LOGGER.fatal(msg)
        raise EsgvocDbError(msg) from e

    with project_connection.create_session() as project_db_session:
        project_specs_file_path = project_dir_path.joinpath(esgvoc.core.constants.PROJECT_SPECS_FILENAME)

        # Optional companion spec files; merged into project_specs when present.
        drs_specs_file_path = project_dir_path.joinpath(esgvoc.core.constants.DRS_SPECS_FILENAME)
        catalog_specs_file_path = project_dir_path.joinpath(esgvoc.core.constants.CATALOG_SPECS_FILENAME)
        attr_specs_file_path = project_dir_path.joinpath(esgvoc.core.constants.ATTRIBUTES_SPECS_FILENAME)
        try:
            raw_project_specs = read_yaml_file(project_specs_file_path)
            project_id = raw_project_specs[esgvoc.core.constants.PROJECT_ID_JSON_KEY]
            project_specs = raw_project_specs
            if drs_specs_file_path.exists():
                raw_drs_specs = read_yaml_file(drs_specs_file_path)
                project_specs["drs_specs"] = raw_drs_specs
            if catalog_specs_file_path.exists():
                raw_catalog_specs = read_yaml_file(catalog_specs_file_path)
                project_specs["catalog_specs"] = raw_catalog_specs
            if attr_specs_file_path.exists():
                raw_attr_specs = read_yaml_file(attr_specs_file_path)
                project_specs["attr_specs"] = raw_attr_specs
        except Exception as e:
            msg = f"unable to read specs files in {project_dir_path}"
            _LOGGER.fatal(msg)
            raise EsgvocDbError(msg) from e

        project = Project(id=project_id, specs=project_specs, git_hash=git_hash)
        project_db_session.add(project)

        for collection_dir_path in project_dir_path.iterdir():
            # TODO maybe put that in settings
            # A directory is a collection iff it carries a 000_context.jsonld.
            if collection_dir_path.is_dir() and (collection_dir_path / "000_context.jsonld").exists():
                _LOGGER.debug(f"found collection dir : {collection_dir_path}")
                try:
                    ingest_collection(collection_dir_path, project, project_db_session)
                except Exception as e:
                    msg = f"unexpected error while ingesting collection {collection_dir_path}"
                    _LOGGER.fatal(msg)
                    raise EsgvocDbError(msg) from e
        project_db_session.commit()

        # Well, the following instructions are not data duplication. It is more building an index.
        # Read: https://sqlite.org/fts5.html
        try:
            sql_query = (
                "INSERT INTO pterms_fts5(pk, id, specs, kind, collection_pk) "  # noqa: S608
                + "SELECT pk, id, specs, kind, collection_pk FROM pterms;"
            )
            project_db_session.exec(text(sql_query))  # type: ignore
        except Exception as e:
            msg = f"unable to insert rows into pterms_fts5 table for {project_db_file_path}"
            _LOGGER.fatal(msg)
            raise EsgvocDbError(msg) from e
        project_db_session.commit()
        try:
            sql_query = (
                "INSERT INTO pcollections_fts5(pk, id, data_descriptor_id, context, "  # noqa: S608
                + "project_pk, term_kind) SELECT pk, id, data_descriptor_id, context, "
                + "project_pk, term_kind FROM pcollections;"
            )
            project_db_session.exec(text(sql_query))  # type: ignore
        except Exception as e:
            msg = f"unable to insert rows into pcollections_fts5 table for {project_db_file_path}"
            _LOGGER.fatal(msg)
            raise EsgvocDbError(msg) from e
        project_db_session.commit()
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import traceback
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from sqlalchemy import text
|
|
6
|
+
from sqlmodel import Session, select
|
|
7
|
+
|
|
8
|
+
import esgvoc.core.constants
|
|
9
|
+
import esgvoc.core.db.connection as db
|
|
10
|
+
import esgvoc.core.service as service
|
|
11
|
+
from esgvoc.core.data_handler import JsonLdResource
|
|
12
|
+
from esgvoc.core.db.connection import read_json_file
|
|
13
|
+
from esgvoc.core.db.models.mixins import TermKind
|
|
14
|
+
from esgvoc.core.db.models.universe import UDataDescriptor, Universe, UTerm, universe_create_db
|
|
15
|
+
from esgvoc.core.exceptions import EsgvocDbError
|
|
16
|
+
from esgvoc.core.service.data_merger import DataMerger
|
|
17
|
+
|
|
18
|
+
_LOGGER = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def infer_term_kind(json_specs: dict) -> TermKind:
    """Classify a term by inspecting its JSON specs.

    Pattern terms carry the pattern key, composite terms carry the
    composite-parts key; every other term is plain.
    """
    marker_to_kind = (
        (esgvoc.core.constants.PATTERN_JSON_KEY, TermKind.PATTERN),
        (esgvoc.core.constants.COMPOSITE_PARTS_JSON_KEY, TermKind.COMPOSITE),
    )
    for marker_key, kind in marker_to_kind:
        if marker_key in json_specs:
            return kind
    return TermKind.PLAIN
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def ingest_universe(universe_repo_dir_path: Path, universe_db_file_path: Path) -> None:
    """Ingest the whole universe repository into the universe SQLite database.

    Every directory carrying a ``000_context.jsonld`` is treated as a data
    descriptor and ingested; afterwards the FTS5 full-text index tables are
    populated from the freshly inserted rows.

    :param universe_repo_dir_path: root directory of the universe repository
    :param universe_db_file_path: path of the target universe SQLite file
    :raises IOError: if the SQLite file cannot be opened
    :raises EsgvocDbError: on data descriptor ingestion or FTS5 errors
    """
    try:
        connection = db.DBConnection(universe_db_file_path)
    except Exception as e:
        msg = f"Unable to read universe SQLite file at {universe_db_file_path}. Abort."
        _LOGGER.fatal(msg)
        raise IOError(msg) from e

    for data_descriptor_dir_path in universe_repo_dir_path.iterdir():
        if (
            data_descriptor_dir_path.is_dir() and (data_descriptor_dir_path / "000_context.jsonld").exists()
        ):  # TODO may be put that in setting
            try:
                ingest_data_descriptor(data_descriptor_dir_path, connection)
            except Exception as e:
                msg = f"unexpected error while processing data descriptor {data_descriptor_dir_path}"
                _LOGGER.fatal(msg)
                raise EsgvocDbError(msg) from e

    with connection.create_session() as session:
        # Well, the following instructions are not data duplication. It is more building an index.
        # Read: https://sqlite.org/fts5.html
        try:
            sql_query = (
                "INSERT INTO uterms_fts5(pk, id, specs, kind, data_descriptor_pk) "
                + "SELECT pk, id, specs, kind, data_descriptor_pk FROM uterms;"
            )  # noqa: S608
            session.exec(text(sql_query))  # type: ignore
        except Exception as e:
            msg = f"unable to insert rows into uterms_fts5 table for {universe_db_file_path}"
            _LOGGER.fatal(msg)
            raise EsgvocDbError(msg) from e
        session.commit()
        try:
            sql_query = (
                "INSERT INTO udata_descriptors_fts5(pk, id, universe_pk, context, term_kind) "
                + "SELECT pk, id, universe_pk, context, term_kind FROM udata_descriptors;"
            )  # noqa: S608
            session.exec(text(sql_query))  # type: ignore
        except Exception as e:
            msg = f"unable to insert rows into udata_descriptors_fts5 table for {universe_db_file_path}"
            _LOGGER.fatal(msg)
            raise EsgvocDbError(msg) from e
        session.commit()
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def ingest_metadata_universe(connection, git_hash):
    """Persist a Universe row recording only the repository git hash."""
    with connection.create_session() as session:
        session.add(Universe(git_hash=git_hash))
        session.commit()
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def ingest_data_descriptor(data_descriptor_path: Path, connection: db.DBConnection) -> None:
    """Ingest one universe data descriptor directory into the database.

    Reads the descriptor's JSON-LD context (skipping the whole descriptor on
    failure), creates a ``UDataDescriptor`` row, then merges/resolves each
    ``*.json`` term file and adds one ``UTerm`` row per term. The session is
    committed at the end of this function.

    :param data_descriptor_path: directory holding the context and term files
    :param connection: open connection to the universe database
    """
    data_descriptor_id = data_descriptor_path.name
    context_file_path = data_descriptor_path.joinpath(esgvoc.core.constants.CONTEXT_FILENAME)
    try:
        context = read_json_file(context_file_path)
    except Exception as e:
        msg = f"Unable to read the context file {context_file_path} of data descriptor \
{data_descriptor_id}. Skip.\n{str(e)}"
        _LOGGER.warning(msg)
        return

    with connection.create_session() as session:
        # We ll know it only when we ll add a term (hypothesis all term have the same kind in a data_descriptor)
        data_descriptor = UDataDescriptor(id=data_descriptor_id, context=context, term_kind="")
        term_kind_dd = None

        _LOGGER.debug(f"add data_descriptor : {data_descriptor_id}")
        for term_file_path in data_descriptor_path.iterdir():
            _LOGGER.debug(f"found term path : {term_file_path}, {term_file_path.suffix}")
            # Only plain .json files are terms; the .jsonld context is skipped.
            if term_file_path.is_file() and term_file_path.suffix == ".json":
                try:
                    locally_available = {
                        "https://esgvoc.ipsl.fr/resource/universe": service.current_state.universe.local_path
                    }

                    merger = DataMerger(
                        data=JsonLdResource(uri=str(term_file_path)),
                        locally_available=locally_available,
                        allowed_base_uris={"https://esgvoc.ipsl.fr/resource/universe"},
                    )
                    merged_data = merger.merge_linked_json()[-1]
                    # Resolve all nested @id references to full objects
                    # Use resolve_merged_ids to properly handle merged data with correct context
                    json_specs = merger.resolve_merged_ids(
                        merged_data,
                        context_base_path=service.current_state.universe.local_path
                    )

                    term_kind = infer_term_kind(json_specs)
                    term_id = json_specs["id"]

                    # First successfully parsed term fixes the descriptor's kind.
                    if term_kind_dd is None:
                        term_kind_dd = term_kind
                except Exception as e:
                    _LOGGER.error(
                        f"❌ UNIVERSE INGESTION FAILURE - Term skipped\n"
                        f"  File: {term_file_path}\n"
                        f"  Descriptor: {data_descriptor_id}\n"
                        f"  Error Type: {type(e).__name__}\n"
                        f"  Error Message: {str(e)}\n"
                        f"  Full Traceback:\n{traceback.format_exc()}"
                    )
                    continue
                if term_id and json_specs and data_descriptor and term_kind:
                    _LOGGER.debug(f"adding {term_id}")
                    term = UTerm(
                        id=term_id,
                        specs=json_specs,
                        data_descriptor=data_descriptor,
                        kind=term_kind,
                    )

                    session.add(term)
        if term_kind_dd is not None:
            data_descriptor.term_kind = term_kind_dd
        else:
            # If no terms were found, default to PLAIN
            _LOGGER.warning(
                f"TermKind was not auto-detected for data descriptor '{data_descriptor_id}'. "
                f"No terms were successfully ingested. Defaulting to PLAIN."
            )
            data_descriptor.term_kind = TermKind.PLAIN
        session.add(data_descriptor)
        session.commit()
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def get_universe_term(data_descriptor_id: str, term_id: str, universe_db_session: Session) -> tuple[TermKind, dict]:
    """Fetch one universe term and return its kind and its specs.

    Exactly one row must match; ``one()`` raises otherwise.
    """
    query = (
        select(UTerm)
        .join(UDataDescriptor)
        .where(UDataDescriptor.id == data_descriptor_id, UTerm.id == term_id)
    )
    matching_term = universe_db_session.exec(query).one()
    return matching_term.kind, matching_term.specs
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
if __name__ == "__main__":
    # Ad-hoc driver: build and populate a universe database in the local
    # .cache tree, two directories above the current working directory.
    import os

    repo_root = Path(str(os.getcwd())).parent.parent
    print(repo_root)
    universe_db_path = repo_root / Path(".cache/dbs/universe.sqlite")
    universe_create_db(universe_db_path)
    ingest_universe(repo_root / Path(".cache/repos/mip-cmor-tables"), universe_db_path)
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
class EsgvocException(Exception):
    """Base class of all ESGVOC errors."""


class EsgvocNotFoundError(EsgvocException):
    """Raised when a requested resource cannot be found."""


class EsgvocValueError(EsgvocException):
    """Raised when a value is invalid."""


class EsgvocDbError(EsgvocException):
    """Raised for database management errors."""


class EsgvocNotImplementedError(EsgvocException):
    """Raised when a feature is not implemented."""
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import logging.config

# dictConfig schema (version 1) for the "esgvoc" logger hierarchy: a single
# StreamHandler with a timestamped format, level ERROR, records kept out of
# the root logger (propagate=False). Existing loggers are left enabled.
LOGGING_CONFIG = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "esgvoc_formatter": {"format": "%(asctime)s [%(levelname)s] %(name)s: %(message)s"},
    },
    "handlers": {
        "esgvoc_stdout": {"class": "logging.StreamHandler", "formatter": "esgvoc_formatter"},
    },
    "loggers": {
        "esgvoc": {"handlers": ["esgvoc_stdout"], "level": "ERROR", "propagate": False},
    },
}

# Applied at import time so every esgvoc module inherits this configuration.
logging.config.dictConfig(LOGGING_CONFIG)
|