esgvoc 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- esgvoc/__init__.py +3 -0
- esgvoc/api/__init__.py +91 -0
- esgvoc/api/data_descriptors/EMD_models/__init__.py +66 -0
- esgvoc/api/data_descriptors/EMD_models/arrangement.py +21 -0
- esgvoc/api/data_descriptors/EMD_models/calendar.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/cell_variable_type.py +20 -0
- esgvoc/api/data_descriptors/EMD_models/component_type.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/coordinate.py +52 -0
- esgvoc/api/data_descriptors/EMD_models/grid_mapping.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/grid_region.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/grid_type.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_computational_grid.py +56 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_grid_cells.py +230 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_subgrid.py +41 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_units.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/model.py +139 -0
- esgvoc/api/data_descriptors/EMD_models/model_component.py +115 -0
- esgvoc/api/data_descriptors/EMD_models/reference.py +61 -0
- esgvoc/api/data_descriptors/EMD_models/resolution.py +48 -0
- esgvoc/api/data_descriptors/EMD_models/temporal_refinement.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/truncation_method.py +17 -0
- esgvoc/api/data_descriptors/EMD_models/vertical_computational_grid.py +91 -0
- esgvoc/api/data_descriptors/EMD_models/vertical_coordinate.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/vertical_units.py +19 -0
- esgvoc/api/data_descriptors/__init__.py +159 -0
- esgvoc/api/data_descriptors/activity.py +72 -0
- esgvoc/api/data_descriptors/archive.py +5 -0
- esgvoc/api/data_descriptors/area_label.py +30 -0
- esgvoc/api/data_descriptors/branded_suffix.py +30 -0
- esgvoc/api/data_descriptors/branded_variable.py +21 -0
- esgvoc/api/data_descriptors/citation_url.py +5 -0
- esgvoc/api/data_descriptors/contact.py +5 -0
- esgvoc/api/data_descriptors/conventions.py +28 -0
- esgvoc/api/data_descriptors/creation_date.py +18 -0
- esgvoc/api/data_descriptors/data_descriptor.py +127 -0
- esgvoc/api/data_descriptors/data_specs_version.py +25 -0
- esgvoc/api/data_descriptors/date.py +5 -0
- esgvoc/api/data_descriptors/directory_date.py +22 -0
- esgvoc/api/data_descriptors/drs_specs.py +38 -0
- esgvoc/api/data_descriptors/experiment.py +215 -0
- esgvoc/api/data_descriptors/forcing_index.py +21 -0
- esgvoc/api/data_descriptors/frequency.py +48 -0
- esgvoc/api/data_descriptors/further_info_url.py +5 -0
- esgvoc/api/data_descriptors/grid.py +43 -0
- esgvoc/api/data_descriptors/horizontal_label.py +20 -0
- esgvoc/api/data_descriptors/initialization_index.py +27 -0
- esgvoc/api/data_descriptors/institution.py +80 -0
- esgvoc/api/data_descriptors/known_branded_variable.py +75 -0
- esgvoc/api/data_descriptors/license.py +31 -0
- esgvoc/api/data_descriptors/member_id.py +9 -0
- esgvoc/api/data_descriptors/mip_era.py +26 -0
- esgvoc/api/data_descriptors/model_component.py +32 -0
- esgvoc/api/data_descriptors/models_test/models.py +17 -0
- esgvoc/api/data_descriptors/nominal_resolution.py +50 -0
- esgvoc/api/data_descriptors/obs_type.py +5 -0
- esgvoc/api/data_descriptors/organisation.py +22 -0
- esgvoc/api/data_descriptors/physics_index.py +21 -0
- esgvoc/api/data_descriptors/product.py +16 -0
- esgvoc/api/data_descriptors/publication_status.py +5 -0
- esgvoc/api/data_descriptors/realization_index.py +24 -0
- esgvoc/api/data_descriptors/realm.py +16 -0
- esgvoc/api/data_descriptors/regex.py +5 -0
- esgvoc/api/data_descriptors/region.py +35 -0
- esgvoc/api/data_descriptors/resolution.py +7 -0
- esgvoc/api/data_descriptors/source.py +120 -0
- esgvoc/api/data_descriptors/source_type.py +5 -0
- esgvoc/api/data_descriptors/sub_experiment.py +5 -0
- esgvoc/api/data_descriptors/table.py +28 -0
- esgvoc/api/data_descriptors/temporal_label.py +20 -0
- esgvoc/api/data_descriptors/time_range.py +17 -0
- esgvoc/api/data_descriptors/title.py +5 -0
- esgvoc/api/data_descriptors/tracking_id.py +67 -0
- esgvoc/api/data_descriptors/variable.py +56 -0
- esgvoc/api/data_descriptors/variant_label.py +25 -0
- esgvoc/api/data_descriptors/vertical_label.py +20 -0
- esgvoc/api/project_specs.py +143 -0
- esgvoc/api/projects.py +1253 -0
- esgvoc/api/py.typed +0 -0
- esgvoc/api/pydantic_handler.py +146 -0
- esgvoc/api/report.py +127 -0
- esgvoc/api/search.py +171 -0
- esgvoc/api/universe.py +434 -0
- esgvoc/apps/__init__.py +6 -0
- esgvoc/apps/cmor_tables/__init__.py +7 -0
- esgvoc/apps/cmor_tables/cvs_table.py +948 -0
- esgvoc/apps/drs/__init__.py +0 -0
- esgvoc/apps/drs/constants.py +2 -0
- esgvoc/apps/drs/generator.py +429 -0
- esgvoc/apps/drs/report.py +540 -0
- esgvoc/apps/drs/validator.py +312 -0
- esgvoc/apps/ga/__init__.py +104 -0
- esgvoc/apps/ga/example_usage.py +315 -0
- esgvoc/apps/ga/models/__init__.py +47 -0
- esgvoc/apps/ga/models/netcdf_header.py +306 -0
- esgvoc/apps/ga/models/validator.py +491 -0
- esgvoc/apps/ga/test_ga.py +161 -0
- esgvoc/apps/ga/validator.py +277 -0
- esgvoc/apps/jsg/json_schema_generator.py +341 -0
- esgvoc/apps/jsg/templates/template.jinja +241 -0
- esgvoc/apps/test_cv/README.md +214 -0
- esgvoc/apps/test_cv/__init__.py +0 -0
- esgvoc/apps/test_cv/cv_tester.py +1611 -0
- esgvoc/apps/test_cv/example_usage.py +216 -0
- esgvoc/apps/vr/__init__.py +12 -0
- esgvoc/apps/vr/build_variable_registry.py +71 -0
- esgvoc/apps/vr/example_usage.py +60 -0
- esgvoc/apps/vr/vr_app.py +333 -0
- esgvoc/cli/clean.py +304 -0
- esgvoc/cli/cmor.py +46 -0
- esgvoc/cli/config.py +1300 -0
- esgvoc/cli/drs.py +267 -0
- esgvoc/cli/find.py +138 -0
- esgvoc/cli/get.py +155 -0
- esgvoc/cli/install.py +41 -0
- esgvoc/cli/main.py +60 -0
- esgvoc/cli/offline.py +269 -0
- esgvoc/cli/status.py +79 -0
- esgvoc/cli/test_cv.py +258 -0
- esgvoc/cli/valid.py +147 -0
- esgvoc/core/constants.py +17 -0
- esgvoc/core/convert.py +0 -0
- esgvoc/core/data_handler.py +206 -0
- esgvoc/core/db/__init__.py +3 -0
- esgvoc/core/db/connection.py +40 -0
- esgvoc/core/db/models/mixins.py +25 -0
- esgvoc/core/db/models/project.py +102 -0
- esgvoc/core/db/models/universe.py +98 -0
- esgvoc/core/db/project_ingestion.py +231 -0
- esgvoc/core/db/universe_ingestion.py +172 -0
- esgvoc/core/exceptions.py +33 -0
- esgvoc/core/logging_handler.py +26 -0
- esgvoc/core/repo_fetcher.py +345 -0
- esgvoc/core/service/__init__.py +41 -0
- esgvoc/core/service/configuration/config_manager.py +196 -0
- esgvoc/core/service/configuration/setting.py +363 -0
- esgvoc/core/service/data_merger.py +634 -0
- esgvoc/core/service/esg_voc.py +77 -0
- esgvoc/core/service/resolver_config.py +56 -0
- esgvoc/core/service/state.py +324 -0
- esgvoc/core/service/string_heuristics.py +98 -0
- esgvoc/core/service/term_cache.py +108 -0
- esgvoc/core/service/uri_resolver.py +133 -0
- esgvoc-2.0.2.dist-info/METADATA +82 -0
- esgvoc-2.0.2.dist-info/RECORD +147 -0
- esgvoc-2.0.2.dist-info/WHEEL +4 -0
- esgvoc-2.0.2.dist-info/entry_points.txt +2 -0
- esgvoc-2.0.2.dist-info/licenses/LICENSE.txt +519 -0
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from rich.logging import RichHandler
|
|
4
|
+
from rich.console import Console
|
|
5
|
+
import shutil
|
|
6
|
+
import esgvoc.core.service as service
|
|
7
|
+
|
|
8
|
+
# Module-level logger routed through Rich so CLI users get colorized
# output and pretty tracebacks.
_LOGGER = logging.getLogger(__name__)

rich_handler = RichHandler(rich_tracebacks=True)
_LOGGER.addHandler(rich_handler)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def reset_init_repo():
    """Remove the universe and every project local repository clone, then
    refresh the state summary so cached status reflects the deletion."""
    settings = service.service_settings

    universe_path = settings.universe.local_path
    if universe_path and os.path.exists(universe_path):
        shutil.rmtree(universe_path)

    for project in settings.projects.values():
        project_path = project.local_path
        if project_path and os.path.exists(project_path):
            shutil.rmtree(project_path)

    service.state_service.get_state_summary()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def reset_init_db():
    """Delete the universe and every project cache database file, then
    refresh the state summary so cached status reflects the deletion."""
    settings = service.service_settings

    universe_db = settings.universe.db_path
    if universe_db and os.path.exists(universe_db):
        os.remove(universe_db)

    for project in settings.projects.values():
        project_db = project.db_path
        if project_db and os.path.exists(project_db):
            os.remove(project_db)

    service.state_service.get_state_summary()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def reset_init_all():
    """Wipe everything: cache databases first, then the cloned repositories."""
    reset_init_db()
    reset_init_repo()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def display(table):
    """Print *table* on a recording Rich console fixed at 200 columns wide."""
    Console(record=True, width=200).print(table)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def install():
    """Synchronize the universe and all project repositories and databases."""
    service.state_service.synchronize_all()
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
if __name__ == "__main__":
    # Manual smoke-test harness: the scenarios below are defined as functions
    # so only the one invoked at the bottom actually runs.

    def Nothing():  # IT WORKS — full reset then sync everything
        reset_init_all()
        display(service.state_service.table())
        service.state_service.universe.sync()
        display(service.state_service.table())
        for _, proj in service.state_service.projects.items():
            proj.sync()
        display(service.state_service.table())

    def OnlyLocal():  # IT ALSO WORKS — rebuild DBs from local clones only
        reset_init_db()
        # Disable GitHub access so sync() can only use the local repositories.
        service.state_service.universe.github_access = False
        for _, proj in service.state_service.projects.items():
            proj.github_access = False
        display(service.state_service.table())

        service.state_service.universe.sync()
        display(service.state_service.table())
        for _, proj in service.state_service.projects.items():
            proj.sync()
        display(service.state_service.table())

    # TODO Some other test to do to be complete:
    # Change the settings ... for now .. let say nobody change the settings !

    OnlyLocal()
    # service.state_service.synchronize_all()
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Configuration for JSON-LD reference resolution behavior."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import List
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class ResolverConfig:
    """
    Configuration for controlling JSON-LD ID reference resolution behavior.

    This class provides fine-grained control over how the DataMerger resolves
    nested @id references, including depth limits, string filtering, and
    file resolution strategies.

    Raises:
        ValueError: from ``__post_init__`` if any numeric option is below 1.
    """

    # Recursion control
    max_depth: int = 5
    """Maximum recursion depth when resolving nested references"""

    # String filtering for primitive resolution
    max_string_length: int = 100
    """Maximum length for strings to be considered as ID references"""

    exclude_patterns: List[str] = field(default_factory=lambda: [" ", ".", "http", "/", "@"])
    """Patterns that disqualify a string from being resolved as an ID reference"""

    # File resolution strategies
    fallback_dirs: List[str] = field(default_factory=lambda: ["horizontal_grid", "vertical_grid", "grid"])
    """Alternative directories to search when a term file is not found"""

    min_path_parts: int = 3
    """Minimum number of path components required for alternate directory search"""

    # Network and I/O
    verify_ssl: bool = True
    """Whether to verify SSL certificates when fetching remote resources"""

    enable_caching: bool = True
    """Whether to cache fetched terms to improve performance"""

    cache_size: int = 128
    """Maximum number of terms to cache (when caching is enabled)"""

    # Logging and debugging
    log_depth_warnings: bool = True
    """Whether to log warnings when max_depth is exceeded"""

    def __post_init__(self):
        """Validate configuration values (all numeric options must be >= 1)."""
        if self.max_depth < 1:
            raise ValueError("max_depth must be at least 1")
        if self.max_string_length < 1:
            raise ValueError("max_string_length must be at least 1")
        # Previously unvalidated: a non-positive value would silently disable
        # alternate-directory search in a confusing way.
        if self.min_path_parts < 1:
            raise ValueError("min_path_parts must be at least 1")
        if self.cache_size < 1:
            raise ValueError("cache_size must be at least 1")
|
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
from rich.table import Table
|
|
7
|
+
from sqlalchemy.exc import NoResultFound
|
|
8
|
+
from sqlmodel import select
|
|
9
|
+
|
|
10
|
+
from esgvoc.core.db.connection import DBConnection
|
|
11
|
+
from esgvoc.core.db.models.project import Project
|
|
12
|
+
from esgvoc.core.db.models.universe import Universe
|
|
13
|
+
from esgvoc.core.repo_fetcher import RepoFetcher
|
|
14
|
+
from esgvoc.core.service.configuration.setting import ProjectSettings, ServiceSettings, UniverseSettings
|
|
15
|
+
|
|
16
|
+
# Module-level logger; handlers/levels are configured by the application.
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class BaseState:
    """Track and synchronize the three copies of a vocabulary: the remote
    GitHub repository, the local clone, and the cache database.

    Subclasses set ``db_sqlmodel`` (Universe or Project) so version lookups
    know which table carries the ingested git hash.
    """

    def __init__(
        self, github_repo: str, branch: str = "main", local_path: Optional[str] = None, db_path: Optional[str] = None, offline_mode: bool = False
    ):
        # Imported here (not at module top) to avoid a circular import with
        # esgvoc.core.service, which itself builds state objects.
        from esgvoc.core.service import config_manager

        self.base_dir = config_manager.data_config_dir  # needed for repofetcher

        self.github_repo: str = github_repo
        self.branch: str = branch
        self.offline_mode: bool = offline_mode
        # github_access: False if we dont have internet and some other cases.
        # In offline mode, disable github access from the start.
        self.github_access: bool = not offline_mode
        self.github_version: str | None = None

        self.local_path: str | None = local_path
        self.local_access: bool = True  # False if we dont have cloned the remote repo yet
        self.local_version: str | None = None

        self.db_path: str | None = db_path
        self.db_access: bool = True  # False if we cant access the db for some reason
        self.db_version: str | None = None

        self.rf = RepoFetcher(local_path=str(self.base_dir), offline_mode=offline_mode)
        self.db_connection: DBConnection | None = None
        self.db_sqlmodel: Universe | Project | None = None

    def fetch_version_local(self):
        """Read the current commit of the local clone into ``local_version``.

        Sets ``local_access`` according to success; no-op when there is no
        configured local path.
        """
        if self.local_path:
            try:
                self.local_version = self.rf.get_local_repo_version(self.local_path, self.branch)
                logger.debug(f"Local repo commit: {self.local_version}")
                self.local_access = True
            except Exception as e:
                logger.exception(f"Failed to fetch local repo version: {e}")
                self.local_access = False

    def fetch_version_remote(self):
        """Query GitHub for the latest commit on ``branch`` into ``github_version``.

        Skipped entirely in offline mode. Any failure clears ``github_access``.
        """
        if self.offline_mode:
            logger.debug("Skipping remote version fetch due to offline mode")
            self.github_access = False
            return

        if self.github_repo:
            owner = None
            repo = None
            try:
                owner, repo = self.github_repo.removeprefix("https://github.com/").split("/")
                self.github_version = self.rf.get_github_version(owner, repo, self.branch)
                self.github_access = True
                logger.debug(f"Latest GitHub commit: {self.github_version}")
            except IndexError as e:
                # NOTE(review): a malformed repo URL makes the tuple unpack
                # above raise ValueError, not IndexError, so this branch looks
                # unreachable (and `e` is unused) — confirm the intent.
                self.github_access = False
            except Exception as e:
                logger.exception(
                    f"Failed to fetch GitHub version: {e} ,for {self.github_repo},owner : {owner}, repo : {repo},branch : {self.branch}"
                )
                self.github_access = False

        # A missing version means the remote is effectively unreachable.
        if self.github_version is None:
            self.github_access = False

    def connect_db(self):
        """Open a connection to the cache database if its file exists.

        Clears ``db_access`` when the file is missing; no-op without a db_path.
        """
        if self.db_path:
            if not os.path.exists(self.db_path):
                self.db_access = False
            else:
                self.db_connection = DBConnection(db_file_path=Path(self.db_path))

    def fetch_version_db(self):
        """Read the ingested git hash from the cache database into ``db_version``.

        Requires ``connect_db()`` to have been called when the file exists —
        otherwise ``db_connection`` is None and the generic handler logs it.
        """
        if self.db_path:
            if not os.path.exists(self.db_path):
                self.db_version = None
                self.db_access = False
            else:
                try:
                    with self.db_connection.create_session() as session:
                        self.db_version = session.exec(select(self.db_sqlmodel.git_hash)).one()
                        self.db_access = True
                except NoResultFound:
                    # DB exists but was never filled with metadata.
                    logger.debug(f"Unable to find git_hash in {self.db_path}")
                except Exception as e:
                    logger.debug(f"Unable to find git_has in {self.db_path} cause {e}")

        else:
            self.db_version = None
            self.db_access = False

    def fetch_versions(self):
        """Refresh all three version fields (remote, local, db)."""
        # Remote fetch is attempted only while github_access is still True;
        # local and db versions are always refreshed.
        if self.github_access:
            self.fetch_version_remote()
        self.fetch_version_local()
        self.fetch_version_db()

    def check_sync_status(self):
        """Refresh versions and report pairwise sync flags.

        Returns a dict with the three versions (or None) and three booleans:
        github_local_sync, local_db_sync, github_db_sync. A pair is reported
        as False (never None) when either side is inaccessible or unknown.
        """
        self.fetch_versions()
        return {
            "github": self.github_version if self.github_version else None,
            "local": self.local_version if self.local_version else None,
            "db": self.db_version if self.db_version else None,
            "github_local_sync": self.github_version == self.local_version
            if self.github_access and self.github_version and self.local_version
            else False,
            "local_db_sync": self.local_version == self.db_version
            if self.local_access and self.local_version
            else False,
            "github_db_sync": self.github_version == self.db_version
            if self.github_access and self.github_version
            else False,
        }

    def clone_remote(self, force_clean=False):
        """Clone (or re-clone) the GitHub repository into ``local_path``.

        Args:
            force_clean: when True, delete the existing local clone first so
                the clone starts from a pristine state.
        """
        if self.offline_mode:
            logger.warning("Cannot clone remote repository in offline mode")
            return

        # If force_clean is True or if local repo exists and we're handling divergence,
        # remove the existing local repository to ensure clean state
        if force_clean and self.local_path and os.path.exists(self.local_path):
            print(f"Removing existing local repository: {self.local_path}")
            import shutil
            shutil.rmtree(self.local_path)

        owner, repo = self.github_repo.removeprefix("https://github.com/").split("/")
        # TODO add destination "local_path" in clone_repo, done in a wierd way Improve that:
        self.rf.clone_repository(owner, repo, self.branch, self.local_path)
        self.fetch_version_local()

    def build_db(self):
        """(Re)build the cache database from the local repository clone.

        Deletes any existing db file, creates the schema matching
        ``db_sqlmodel`` (Universe or Project), ingests the local clone, then
        refreshes ``db_version``.
        """
        # Local imports avoid circular dependencies with the db packages.
        from esgvoc.core.db.models.project import project_create_db
        from esgvoc.core.db.models.universe import universe_create_db
        from esgvoc.core.db.project_ingestion import ingest_project
        from esgvoc.core.db.universe_ingestion import ingest_metadata_universe, ingest_universe

        if self.db_path:
            if os.path.exists(self.db_path):
                os.remove(self.db_path)
            else:
                # First build: make sure the parent directory exists.
                os.makedirs(Path(self.db_path).parent, exist_ok=True)

            if self.db_sqlmodel == Universe:  # Ugly
                print("Building Universe DB from ", self.local_path)
                universe_create_db(Path(self.db_path))
                self.db_connection = DBConnection(db_file_path=Path(self.db_path))

                ingest_metadata_universe(self.db_connection, self.local_version)
                print("Filling Universe DB")
                if self.local_path:
                    ingest_universe(Path(self.local_path), Path(self.db_path))

            elif self.db_sqlmodel == Project:
                print("Building Project DB from ", self.local_path)
                project_create_db(Path(self.db_path))
                print("Filling project DB")
                if self.local_path and self.local_version:
                    ingest_project(Path(self.local_path), Path(self.db_path), self.local_version)
        self.fetch_version_db()

    def sync(self):
        """Bring local clone and cache database up to date.

        Offline: rebuild the db from the local clone when needed. Online:
        clone/re-clone from GitHub as needed (remote wins on divergence),
        then rebuild the db.

        Returns:
            bool: True if the database was rebuilt.
        """
        summary = self.check_sync_status()
        updated = False

        if self.offline_mode:
            print("Running in offline mode - only using local repositories and databases")
            if self.local_access:
                # NOTE(review): check_sync_status() returns False, never None,
                # for these keys, so the `is not None` guard is always True —
                # confirm whether tri-state values were intended here.
                if not summary["local_db_sync"] and summary["local_db_sync"] is not None:
                    self.build_db()
                    updated = True
                else:
                    print("Cache db is uptodate from local repository")
            elif not self.db_access:  # it can happen if the db is created but not filled
                if self.local_path and os.path.exists(self.local_path):
                    self.build_db()
                    updated = True
                else:
                    print(f"No local repository found at {self.local_path} - cannot sync in offline mode")
            else:
                print("Nothing to sync in offline mode - local repository and database are up to date")
            return updated

        # Online sync logic with offline-to-online transition detection
        # NOTE(review): the `is None` comparisons below can never be True with
        # the current check_sync_status() (which returns False, not None), so
        # this first branch appears unreachable — confirm intent.
        if (
            self.github_access
            and summary["github_db_sync"] is None
            and summary["local_db_sync"] is None
            and summary["github_local_sync"] is None
        ):
            self.clone_remote()
            self.build_db()
            updated = True
        elif self.github_access and not summary["github_db_sync"]:
            if not summary["local_db_sync"] and summary["local_db_sync"] is not None:
                self.clone_remote()
                self.build_db()
                updated = True
            elif not summary["github_local_sync"]:
                # Critical fix: when local and remote diverge in online mode,
                # prioritize remote truth by completely removing local repo and re-cloning
                print(f"Local and remote repositories have diverged (local: {summary['local'][:8] if summary['local'] else 'N/A'}, remote: {summary['github'][:8] if summary['github'] else 'N/A'})")
                print("Prioritizing remote repository truth - removing local repository and re-cloning from GitHub...")
                self.clone_remote(force_clean=True)
                self.build_db()
                updated = True
            else:  # can be simply build in root and clone if neccessary
                self.build_db()
                updated = True
        elif self.local_access:
            if not summary["local_db_sync"] and summary["local_db_sync"] is not None:
                self.build_db()
                updated = True
            else:
                print("Cache db is uptodate from local repository")
        elif not self.db_access:  # it can happen if the db is created but not filled
            self.build_db()
            updated = True
        else:
            print("Nothing to install, everything up to date")
            print("Try 'esgvoc status' for more details")
        return updated
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
class StateUniverse(BaseState):
    """State tracker for the universe vocabulary repository."""

    def __init__(self, settings: UniverseSettings):
        # Resolve relative settings paths to absolute ones before handing
        # everything to BaseState.
        overrides = {
            "local_path": settings.get_absolute_local_path(),
            "db_path": settings.get_absolute_db_path(),
        }
        super().__init__(**{**settings.model_dump(), **overrides})
        self.db_sqlmodel = Universe
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
class StateProject(BaseState):
    """State tracker for a single project vocabulary repository."""

    def __init__(self, settings: ProjectSettings):
        dumped = settings.model_dump()
        # project_name is not a BaseState argument; keep it on the instance.
        self.project_name = dumped.pop("project_name")
        dumped.update(
            local_path=settings.get_absolute_local_path(),
            db_path=settings.get_absolute_db_path(),
        )
        super().__init__(**dumped)
        self.db_sqlmodel = Project
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
class StateService:
    """Aggregate the universe state and all project states, and synchronize
    them together."""

    def __init__(self, service_settings: ServiceSettings):
        self.universe = StateUniverse(service_settings.universe)
        self.projects = {name: StateProject(proj) for name, proj in service_settings.projects.items()}
        self.connect_db()

    def get_state_summary(self):
        """Return sync-status dicts for the universe and every project."""
        universe_status = self.universe.check_sync_status()
        project_statuses = {name: proj.check_sync_status() for name, proj in self.projects.items()}
        return {"universe": universe_status, "projects": project_statuses}

    def fetch_versions(self):
        """Refresh remote/local/db versions for the universe and all projects."""
        self.universe.fetch_versions()
        for _, proj_state in self.projects.items():
            proj_state.fetch_versions()

    def connect_db(self):
        """(Re)open database connections for the universe and all projects."""
        self.universe.connect_db()
        for _, proj_state in self.projects.items():
            proj_state.connect_db()

    def synchronize_all(self):
        """Sync the universe first, then every project.

        A project db is force-rebuilt when the universe changed but the
        project itself reported no update (cross-references may be stale).
        """
        print("sync universe")
        if self.universe.offline_mode:
            print("Universe is in offline mode")
        universe_updated = self.universe.sync()
        print("sync projects")
        for project_name, project in self.projects.items():
            if project.offline_mode:
                print(f"Project {project_name} is in offline mode")
            project_updated = project.sync()
            # Rebuild the project db if the universe moved but this project
            # did not, so resolved universe terms stay consistent.
            if universe_updated and not project_updated:
                project.build_db()
        self.connect_db()

    def table(self):
        """Build a Rich table summarizing repos, paths, and versions."""
        table = Table(show_header=False, show_lines=True)
        table.add_row("", "Remote github repo", "Local repository", "Cache Database")
        table.add_row("Universe path", self.universe.github_repo, self.universe.local_path, self.universe.db_path)
        table.add_row("Version", self.universe.github_version, self.universe.local_version, self.universe.db_version)
        for proj_name, proj in self.projects.items():
            # table.add_row("","Remote github repo","Local repository","Cache Database")
            table.add_row(f"{proj_name} path", proj.github_repo, proj.local_path, proj.db_path)
            table.add_row("Version", proj.github_version, proj.local_version, proj.db_version)
        return table
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
if __name__ == "__main__":
    # Manual smoke test for running this module directly from a dev checkout.
    # Load settings from file
    service_settings = ServiceSettings.load_from_file("src/esgvoc/core/service/settings.toml")

    # Initialize StateService
    state_service = StateService(service_settings)
    state_service.get_state_summary()

    # Synchronize all
    state_service.synchronize_all()

    # pprint(state_service.universe.github_version)
    # pprint(state_service.universe.local_version)
    # pprint(state_service.universe.db_version)

    # Check for differences
    # pprint(state_service.find_version_differences())
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Heuristics for determining if strings should be resolved as ID references."""
|
|
2
|
+
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class StringHeuristics:
|
|
7
|
+
"""
|
|
8
|
+
Determine if a string value should be resolved as an ID reference.
|
|
9
|
+
|
|
10
|
+
Uses configurable heuristics to distinguish between:
|
|
11
|
+
- ID references (e.g., "hadgem3_gc31_atmosphere") - should resolve
|
|
12
|
+
- Literal strings (e.g., "A long description...") - should not resolve
|
|
13
|
+
- URLs (e.g., "https://doi.org/...") - should not resolve
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
def __init__(self, max_length: int = 100, exclude_patterns: List[str] | None = None):
|
|
17
|
+
"""
|
|
18
|
+
Initialize string heuristics.
|
|
19
|
+
|
|
20
|
+
Args:
|
|
21
|
+
max_length: Maximum length for strings to be considered as ID references.
|
|
22
|
+
Longer strings are assumed to be content, not references.
|
|
23
|
+
exclude_patterns: Patterns that disqualify a string from being an ID reference.
|
|
24
|
+
Defaults to [" ", ".", "http", "/", "@"] which filter out
|
|
25
|
+
descriptions, URLs, DOIs, paths, and emails.
|
|
26
|
+
"""
|
|
27
|
+
self.max_length = max_length
|
|
28
|
+
self.exclude_patterns = exclude_patterns or [" ", ".", "http", "/", "@"]
|
|
29
|
+
|
|
30
|
+
def is_resolvable(self, value: str) -> bool:
|
|
31
|
+
"""
|
|
32
|
+
Check if a string looks like an ID reference that should be resolved.
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
value: The string to evaluate
|
|
36
|
+
|
|
37
|
+
Returns:
|
|
38
|
+
True if the string appears to be an ID reference, False otherwise
|
|
39
|
+
|
|
40
|
+
Example:
|
|
41
|
+
>>> heuristics = StringHeuristics()
|
|
42
|
+
>>> heuristics.is_resolvable("hadgem3_gc31_atmosphere")
|
|
43
|
+
True
|
|
44
|
+
>>> heuristics.is_resolvable("This is a long description text")
|
|
45
|
+
False
|
|
46
|
+
>>> heuristics.is_resolvable("https://doi.org/10.5194/gmd")
|
|
47
|
+
False
|
|
48
|
+
"""
|
|
49
|
+
# Check length
|
|
50
|
+
if len(value) > self.max_length:
|
|
51
|
+
return False
|
|
52
|
+
|
|
53
|
+
# Check for exclude patterns
|
|
54
|
+
for pattern in self.exclude_patterns:
|
|
55
|
+
if pattern in value:
|
|
56
|
+
return False
|
|
57
|
+
|
|
58
|
+
return True
|
|
59
|
+
|
|
60
|
+
def should_skip_literal(self, expanded_data: dict) -> bool:
|
|
61
|
+
"""
|
|
62
|
+
Check if the expanded data indicates this is a literal value (not a reference).
|
|
63
|
+
|
|
64
|
+
In JSON-LD, literal values are marked with @value in expanded form.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
expanded_data: The expanded JSON-LD data
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
True if this is a literal value that should not be resolved
|
|
71
|
+
|
|
72
|
+
Example:
|
|
73
|
+
>>> heuristics = StringHeuristics()
|
|
74
|
+
>>> heuristics.should_skip_literal({"@value": "some text"})
|
|
75
|
+
True
|
|
76
|
+
>>> heuristics.should_skip_literal({"@id": "some_term"})
|
|
77
|
+
False
|
|
78
|
+
"""
|
|
79
|
+
return isinstance(expanded_data, dict) and "@value" in expanded_data
|
|
80
|
+
|
|
81
|
+
def has_id_in_expanded(self, expanded_data: dict) -> bool:
|
|
82
|
+
"""
|
|
83
|
+
Check if the expanded data contains an @id, indicating it's a reference.
|
|
84
|
+
|
|
85
|
+
Args:
|
|
86
|
+
expanded_data: The expanded JSON-LD data
|
|
87
|
+
|
|
88
|
+
Returns:
|
|
89
|
+
True if the expanded data has an @id field
|
|
90
|
+
|
|
91
|
+
Example:
|
|
92
|
+
>>> heuristics = StringHeuristics()
|
|
93
|
+
>>> heuristics.has_id_in_expanded({"@id": "https://example.com/term"})
|
|
94
|
+
True
|
|
95
|
+
>>> heuristics.has_id_in_expanded({"@value": "literal"})
|
|
96
|
+
False
|
|
97
|
+
"""
|
|
98
|
+
return isinstance(expanded_data, dict) and "@id" in expanded_data
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Caching for resolved JSON-LD terms to improve performance."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from functools import lru_cache
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Dict
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class TermCache:
|
|
13
|
+
"""
|
|
14
|
+
LRU cache for JSON-LD terms fetched from files or remote sources.
|
|
15
|
+
|
|
16
|
+
Caching reduces redundant file I/O and network calls when the same
|
|
17
|
+
terms are referenced multiple times during resolution.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
def __init__(self, max_size: int = 128, enabled: bool = True):
|
|
21
|
+
"""
|
|
22
|
+
Initialize the term cache.
|
|
23
|
+
|
|
24
|
+
Args:
|
|
25
|
+
max_size: Maximum number of terms to cache
|
|
26
|
+
enabled: Whether caching is enabled (can be disabled for debugging)
|
|
27
|
+
"""
|
|
28
|
+
self.max_size = max_size
|
|
29
|
+
self.enabled = enabled
|
|
30
|
+
self._cache: Dict[str, dict] = {}
|
|
31
|
+
self._hits = 0
|
|
32
|
+
self._misses = 0
|
|
33
|
+
|
|
34
|
+
def get(self, uri: str) -> dict | None:
|
|
35
|
+
"""
|
|
36
|
+
Retrieve a cached term by URI.
|
|
37
|
+
|
|
38
|
+
Args:
|
|
39
|
+
uri: The URI key for the cached term
|
|
40
|
+
|
|
41
|
+
Returns:
|
|
42
|
+
The cached term data, or None if not in cache
|
|
43
|
+
"""
|
|
44
|
+
if not self.enabled:
|
|
45
|
+
return None
|
|
46
|
+
|
|
47
|
+
if uri in self._cache:
|
|
48
|
+
self._hits += 1
|
|
49
|
+
logger.debug(f"Cache hit for {uri}")
|
|
50
|
+
return self._cache[uri]
|
|
51
|
+
|
|
52
|
+
self._misses += 1
|
|
53
|
+
return None
|
|
54
|
+
|
|
55
|
+
def put(self, uri: str, data: dict) -> None:
|
|
56
|
+
"""
|
|
57
|
+
Store a term in the cache.
|
|
58
|
+
|
|
59
|
+
Args:
|
|
60
|
+
uri: The URI key for the term
|
|
61
|
+
data: The term data to cache
|
|
62
|
+
"""
|
|
63
|
+
if not self.enabled:
|
|
64
|
+
return
|
|
65
|
+
|
|
66
|
+
# Simple LRU: if cache is full, remove the oldest entry
|
|
67
|
+
if len(self._cache) >= self.max_size:
|
|
68
|
+
# Remove first item (oldest in insertion order for Python 3.7+)
|
|
69
|
+
oldest_key = next(iter(self._cache))
|
|
70
|
+
del self._cache[oldest_key]
|
|
71
|
+
logger.debug(f"Cache eviction: {oldest_key}")
|
|
72
|
+
|
|
73
|
+
self._cache[uri] = data
|
|
74
|
+
logger.debug(f"Cached {uri}")
|
|
75
|
+
|
|
76
|
+
def clear(self) -> None:
|
|
77
|
+
"""Clear all cached terms."""
|
|
78
|
+
self._cache.clear()
|
|
79
|
+
self._hits = 0
|
|
80
|
+
self._misses = 0
|
|
81
|
+
logger.debug("Cache cleared")
|
|
82
|
+
|
|
83
|
+
def get_stats(self) -> Dict[str, int]:
|
|
84
|
+
"""
|
|
85
|
+
Get cache statistics.
|
|
86
|
+
|
|
87
|
+
Returns:
|
|
88
|
+
Dictionary with cache hits, misses, size, and hit rate
|
|
89
|
+
"""
|
|
90
|
+
total_requests = self._hits + self._misses
|
|
91
|
+
hit_rate = (self._hits / total_requests * 100) if total_requests > 0 else 0
|
|
92
|
+
|
|
93
|
+
return {
|
|
94
|
+
"hits": self._hits,
|
|
95
|
+
"misses": self._misses,
|
|
96
|
+
"size": len(self._cache),
|
|
97
|
+
"max_size": self.max_size,
|
|
98
|
+
"hit_rate_percent": round(hit_rate, 2),
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
def __repr__(self) -> str:
|
|
102
|
+
"""String representation showing cache stats."""
|
|
103
|
+
stats = self.get_stats()
|
|
104
|
+
return (
|
|
105
|
+
f"TermCache(size={stats['size']}/{stats['max_size']}, "
|
|
106
|
+
f"hits={stats['hits']}, misses={stats['misses']}, "
|
|
107
|
+
f"hit_rate={stats['hit_rate_percent']}%)"
|
|
108
|
+
)
|