esgvoc-2.0.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. esgvoc/__init__.py +3 -0
  2. esgvoc/api/__init__.py +91 -0
  3. esgvoc/api/data_descriptors/EMD_models/__init__.py +66 -0
  4. esgvoc/api/data_descriptors/EMD_models/arrangement.py +21 -0
  5. esgvoc/api/data_descriptors/EMD_models/calendar.py +5 -0
  6. esgvoc/api/data_descriptors/EMD_models/cell_variable_type.py +20 -0
  7. esgvoc/api/data_descriptors/EMD_models/component_type.py +5 -0
  8. esgvoc/api/data_descriptors/EMD_models/coordinate.py +52 -0
  9. esgvoc/api/data_descriptors/EMD_models/grid_mapping.py +19 -0
  10. esgvoc/api/data_descriptors/EMD_models/grid_region.py +19 -0
  11. esgvoc/api/data_descriptors/EMD_models/grid_type.py +19 -0
  12. esgvoc/api/data_descriptors/EMD_models/horizontal_computational_grid.py +56 -0
  13. esgvoc/api/data_descriptors/EMD_models/horizontal_grid_cells.py +230 -0
  14. esgvoc/api/data_descriptors/EMD_models/horizontal_subgrid.py +41 -0
  15. esgvoc/api/data_descriptors/EMD_models/horizontal_units.py +5 -0
  16. esgvoc/api/data_descriptors/EMD_models/model.py +139 -0
  17. esgvoc/api/data_descriptors/EMD_models/model_component.py +115 -0
  18. esgvoc/api/data_descriptors/EMD_models/reference.py +61 -0
  19. esgvoc/api/data_descriptors/EMD_models/resolution.py +48 -0
  20. esgvoc/api/data_descriptors/EMD_models/temporal_refinement.py +19 -0
  21. esgvoc/api/data_descriptors/EMD_models/truncation_method.py +17 -0
  22. esgvoc/api/data_descriptors/EMD_models/vertical_computational_grid.py +91 -0
  23. esgvoc/api/data_descriptors/EMD_models/vertical_coordinate.py +5 -0
  24. esgvoc/api/data_descriptors/EMD_models/vertical_units.py +19 -0
  25. esgvoc/api/data_descriptors/__init__.py +159 -0
  26. esgvoc/api/data_descriptors/activity.py +72 -0
  27. esgvoc/api/data_descriptors/archive.py +5 -0
  28. esgvoc/api/data_descriptors/area_label.py +30 -0
  29. esgvoc/api/data_descriptors/branded_suffix.py +30 -0
  30. esgvoc/api/data_descriptors/branded_variable.py +21 -0
  31. esgvoc/api/data_descriptors/citation_url.py +5 -0
  32. esgvoc/api/data_descriptors/contact.py +5 -0
  33. esgvoc/api/data_descriptors/conventions.py +28 -0
  34. esgvoc/api/data_descriptors/creation_date.py +18 -0
  35. esgvoc/api/data_descriptors/data_descriptor.py +127 -0
  36. esgvoc/api/data_descriptors/data_specs_version.py +25 -0
  37. esgvoc/api/data_descriptors/date.py +5 -0
  38. esgvoc/api/data_descriptors/directory_date.py +22 -0
  39. esgvoc/api/data_descriptors/drs_specs.py +38 -0
  40. esgvoc/api/data_descriptors/experiment.py +215 -0
  41. esgvoc/api/data_descriptors/forcing_index.py +21 -0
  42. esgvoc/api/data_descriptors/frequency.py +48 -0
  43. esgvoc/api/data_descriptors/further_info_url.py +5 -0
  44. esgvoc/api/data_descriptors/grid.py +43 -0
  45. esgvoc/api/data_descriptors/horizontal_label.py +20 -0
  46. esgvoc/api/data_descriptors/initialization_index.py +27 -0
  47. esgvoc/api/data_descriptors/institution.py +80 -0
  48. esgvoc/api/data_descriptors/known_branded_variable.py +75 -0
  49. esgvoc/api/data_descriptors/license.py +31 -0
  50. esgvoc/api/data_descriptors/member_id.py +9 -0
  51. esgvoc/api/data_descriptors/mip_era.py +26 -0
  52. esgvoc/api/data_descriptors/model_component.py +32 -0
  53. esgvoc/api/data_descriptors/models_test/models.py +17 -0
  54. esgvoc/api/data_descriptors/nominal_resolution.py +50 -0
  55. esgvoc/api/data_descriptors/obs_type.py +5 -0
  56. esgvoc/api/data_descriptors/organisation.py +22 -0
  57. esgvoc/api/data_descriptors/physics_index.py +21 -0
  58. esgvoc/api/data_descriptors/product.py +16 -0
  59. esgvoc/api/data_descriptors/publication_status.py +5 -0
  60. esgvoc/api/data_descriptors/realization_index.py +24 -0
  61. esgvoc/api/data_descriptors/realm.py +16 -0
  62. esgvoc/api/data_descriptors/regex.py +5 -0
  63. esgvoc/api/data_descriptors/region.py +35 -0
  64. esgvoc/api/data_descriptors/resolution.py +7 -0
  65. esgvoc/api/data_descriptors/source.py +120 -0
  66. esgvoc/api/data_descriptors/source_type.py +5 -0
  67. esgvoc/api/data_descriptors/sub_experiment.py +5 -0
  68. esgvoc/api/data_descriptors/table.py +28 -0
  69. esgvoc/api/data_descriptors/temporal_label.py +20 -0
  70. esgvoc/api/data_descriptors/time_range.py +17 -0
  71. esgvoc/api/data_descriptors/title.py +5 -0
  72. esgvoc/api/data_descriptors/tracking_id.py +67 -0
  73. esgvoc/api/data_descriptors/variable.py +56 -0
  74. esgvoc/api/data_descriptors/variant_label.py +25 -0
  75. esgvoc/api/data_descriptors/vertical_label.py +20 -0
  76. esgvoc/api/project_specs.py +143 -0
  77. esgvoc/api/projects.py +1253 -0
  78. esgvoc/api/py.typed +0 -0
  79. esgvoc/api/pydantic_handler.py +146 -0
  80. esgvoc/api/report.py +127 -0
  81. esgvoc/api/search.py +171 -0
  82. esgvoc/api/universe.py +434 -0
  83. esgvoc/apps/__init__.py +6 -0
  84. esgvoc/apps/cmor_tables/__init__.py +7 -0
  85. esgvoc/apps/cmor_tables/cvs_table.py +948 -0
  86. esgvoc/apps/drs/__init__.py +0 -0
  87. esgvoc/apps/drs/constants.py +2 -0
  88. esgvoc/apps/drs/generator.py +429 -0
  89. esgvoc/apps/drs/report.py +540 -0
  90. esgvoc/apps/drs/validator.py +312 -0
  91. esgvoc/apps/ga/__init__.py +104 -0
  92. esgvoc/apps/ga/example_usage.py +315 -0
  93. esgvoc/apps/ga/models/__init__.py +47 -0
  94. esgvoc/apps/ga/models/netcdf_header.py +306 -0
  95. esgvoc/apps/ga/models/validator.py +491 -0
  96. esgvoc/apps/ga/test_ga.py +161 -0
  97. esgvoc/apps/ga/validator.py +277 -0
  98. esgvoc/apps/jsg/json_schema_generator.py +341 -0
  99. esgvoc/apps/jsg/templates/template.jinja +241 -0
  100. esgvoc/apps/test_cv/README.md +214 -0
  101. esgvoc/apps/test_cv/__init__.py +0 -0
  102. esgvoc/apps/test_cv/cv_tester.py +1611 -0
  103. esgvoc/apps/test_cv/example_usage.py +216 -0
  104. esgvoc/apps/vr/__init__.py +12 -0
  105. esgvoc/apps/vr/build_variable_registry.py +71 -0
  106. esgvoc/apps/vr/example_usage.py +60 -0
  107. esgvoc/apps/vr/vr_app.py +333 -0
  108. esgvoc/cli/clean.py +304 -0
  109. esgvoc/cli/cmor.py +46 -0
  110. esgvoc/cli/config.py +1300 -0
  111. esgvoc/cli/drs.py +267 -0
  112. esgvoc/cli/find.py +138 -0
  113. esgvoc/cli/get.py +155 -0
  114. esgvoc/cli/install.py +41 -0
  115. esgvoc/cli/main.py +60 -0
  116. esgvoc/cli/offline.py +269 -0
  117. esgvoc/cli/status.py +79 -0
  118. esgvoc/cli/test_cv.py +258 -0
  119. esgvoc/cli/valid.py +147 -0
  120. esgvoc/core/constants.py +17 -0
  121. esgvoc/core/convert.py +0 -0
  122. esgvoc/core/data_handler.py +206 -0
  123. esgvoc/core/db/__init__.py +3 -0
  124. esgvoc/core/db/connection.py +40 -0
  125. esgvoc/core/db/models/mixins.py +25 -0
  126. esgvoc/core/db/models/project.py +102 -0
  127. esgvoc/core/db/models/universe.py +98 -0
  128. esgvoc/core/db/project_ingestion.py +231 -0
  129. esgvoc/core/db/universe_ingestion.py +172 -0
  130. esgvoc/core/exceptions.py +33 -0
  131. esgvoc/core/logging_handler.py +26 -0
  132. esgvoc/core/repo_fetcher.py +345 -0
  133. esgvoc/core/service/__init__.py +41 -0
  134. esgvoc/core/service/configuration/config_manager.py +196 -0
  135. esgvoc/core/service/configuration/setting.py +363 -0
  136. esgvoc/core/service/data_merger.py +634 -0
  137. esgvoc/core/service/esg_voc.py +77 -0
  138. esgvoc/core/service/resolver_config.py +56 -0
  139. esgvoc/core/service/state.py +324 -0
  140. esgvoc/core/service/string_heuristics.py +98 -0
  141. esgvoc/core/service/term_cache.py +108 -0
  142. esgvoc/core/service/uri_resolver.py +133 -0
  143. esgvoc-2.0.2.dist-info/METADATA +82 -0
  144. esgvoc-2.0.2.dist-info/RECORD +147 -0
  145. esgvoc-2.0.2.dist-info/WHEEL +4 -0
  146. esgvoc-2.0.2.dist-info/entry_points.txt +2 -0
  147. esgvoc-2.0.2.dist-info/licenses/LICENSE.txt +519 -0
esgvoc/core/service/esg_voc.py
@@ -0,0 +1,77 @@
+ import logging
+ import os
+ from rich.logging import RichHandler
+ from rich.console import Console
+ import shutil
+ import esgvoc.core.service as service
+
+ _LOGGER = logging.getLogger(__name__)
+
+ rich_handler = RichHandler(rich_tracebacks=True)
+ _LOGGER.addHandler(rich_handler)
+
+
+ def reset_init_repo():
+     service_settings = service.service_settings
+     if (service_settings.universe.local_path) and os.path.exists(service_settings.universe.local_path):
+         shutil.rmtree(service_settings.universe.local_path)
+
+     for _, proj in service_settings.projects.items():
+         if (proj.local_path) and os.path.exists(proj.local_path):
+             shutil.rmtree(proj.local_path)
+     service.state_service.get_state_summary()
+
+
+ def reset_init_db():
+     service_settings = service.service_settings
+     if (service_settings.universe.db_path) and os.path.exists(service_settings.universe.db_path):
+         os.remove(service_settings.universe.db_path)
+     for _, proj in service_settings.projects.items():
+         if (proj.db_path) and os.path.exists(proj.db_path):
+             os.remove(proj.db_path)
+     service.state_service.get_state_summary()
+
+
+ def reset_init_all():
+     reset_init_db()
+     reset_init_repo()
+
+
+ def display(table):
+     console = Console(record=True, width=200)
+     console.print(table)
+
+
+ def install():
+     service.state_service.synchronize_all()
+
+
+ if __name__ == "__main__":
+
+     def Nothing():  # IT WORKS
+         reset_init_all()
+         display(service.state_service.table())
+         service.state_service.universe.sync()
+         display(service.state_service.table())
+         for _, proj in service.state_service.projects.items():
+             proj.sync()
+         display(service.state_service.table())
+
+     def OnlyLocal():  # IT ALSO WORKS
+         reset_init_db()
+         service.state_service.universe.github_access = False
+         for _, proj in service.state_service.projects.items():
+             proj.github_access = False
+         display(service.state_service.table())
+
+         service.state_service.universe.sync()
+         display(service.state_service.table())
+         for _, proj in service.state_service.projects.items():
+             proj.sync()
+         display(service.state_service.table())
+
+     # TODO: some other tests to do to be complete:
+     # Change the settings ... for now, let's say nobody changes the settings!
+
+     OnlyLocal()
+     # service.state_service.synchronize_all()
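
For reference, these helpers can also be driven directly from Python. A minimal sketch (the import path esgvoc.core.service.esg_voc is inferred from the file list above and should be treated as an assumption):

import esgvoc.core.service as service
from esgvoc.core.service.esg_voc import display, install, reset_init_all

reset_init_all()                         # remove local clones and cache databases
install()                                # re-fetch and rebuild via state_service.synchronize_all()
display(service.state_service.table())   # show remote/local/db versions side by side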
esgvoc/core/service/resolver_config.py
@@ -0,0 +1,56 @@
+ """Configuration for JSON-LD reference resolution behavior."""
+
+ from dataclasses import dataclass, field
+ from typing import List
+
+
+ @dataclass
+ class ResolverConfig:
+     """
+     Configuration for controlling JSON-LD ID reference resolution behavior.
+
+     This class provides fine-grained control over how the DataMerger resolves
+     nested @id references, including depth limits, string filtering, and
+     file resolution strategies.
+     """
+
+     # Recursion control
+     max_depth: int = 5
+     """Maximum recursion depth when resolving nested references"""
+
+     # String filtering for primitive resolution
+     max_string_length: int = 100
+     """Maximum length for strings to be considered as ID references"""
+
+     exclude_patterns: List[str] = field(default_factory=lambda: [" ", ".", "http", "/", "@"])
+     """Patterns that disqualify a string from being resolved as an ID reference"""
+
+     # File resolution strategies
+     fallback_dirs: List[str] = field(default_factory=lambda: ["horizontal_grid", "vertical_grid", "grid"])
+     """Alternative directories to search when a term file is not found"""
+
+     min_path_parts: int = 3
+     """Minimum number of path components required for alternate directory search"""
+
+     # Network and I/O
+     verify_ssl: bool = True
+     """Whether to verify SSL certificates when fetching remote resources"""
+
+     enable_caching: bool = True
+     """Whether to cache fetched terms to improve performance"""
+
+     cache_size: int = 128
+     """Maximum number of terms to cache (when caching is enabled)"""
+
+     # Logging and debugging
+     log_depth_warnings: bool = True
+     """Whether to log warnings when max_depth is exceeded"""
+
+     def __post_init__(self):
+         """Validate configuration values."""
+         if self.max_depth < 1:
+             raise ValueError("max_depth must be at least 1")
+         if self.max_string_length < 1:
+             raise ValueError("max_string_length must be at least 1")
+         if self.cache_size < 1:
+             raise ValueError("cache_size must be at least 1")
esgvoc/core/service/state.py
@@ -0,0 +1,324 @@
+ import logging
+ import os
+ from pathlib import Path
+ from typing import Optional
+
+ from rich.table import Table
+ from sqlalchemy.exc import NoResultFound
+ from sqlmodel import select
+
+ from esgvoc.core.db.connection import DBConnection
+ from esgvoc.core.db.models.project import Project
+ from esgvoc.core.db.models.universe import Universe
+ from esgvoc.core.repo_fetcher import RepoFetcher
+ from esgvoc.core.service.configuration.setting import ProjectSettings, ServiceSettings, UniverseSettings
+
+ logger = logging.getLogger(__name__)
+
+
+ class BaseState:
+     def __init__(
+         self, github_repo: str, branch: str = "main", local_path: Optional[str] = None, db_path: Optional[str] = None, offline_mode: bool = False
+     ):
+         from esgvoc.core.service import config_manager
+
+         self.base_dir = config_manager.data_config_dir  # needed for RepoFetcher
+
+         self.github_repo: str = github_repo
+         self.branch: str = branch
+         self.offline_mode: bool = offline_mode
+         # False if we don't have internet access, and in some other cases.
+         # In offline mode, disable GitHub access from the start.
+         self.github_access: bool = not offline_mode
+         self.github_version: str | None = None
+
+         self.local_path: str | None = local_path
+         self.local_access: bool = True  # False if we haven't cloned the remote repo yet
+         self.local_version: str | None = None
+
+         self.db_path: str | None = db_path
+         self.db_access: bool = True  # False if we can't access the db for some reason
+         self.db_version: str | None = None
+
+         self.rf = RepoFetcher(local_path=str(self.base_dir), offline_mode=offline_mode)
+         self.db_connection: DBConnection | None = None
+         self.db_sqlmodel: Universe | Project | None = None
+
+
+     def fetch_version_local(self):
+         if self.local_path:
+             try:
+                 self.local_version = self.rf.get_local_repo_version(self.local_path, self.branch)
+                 logger.debug(f"Local repo commit: {self.local_version}")
+                 self.local_access = True
+             except Exception as e:
+                 logger.exception(f"Failed to fetch local repo version: {e}")
+                 self.local_access = False
+
+     def fetch_version_remote(self):
+         if self.offline_mode:
+             logger.debug("Skipping remote version fetch due to offline mode")
+             self.github_access = False
+             return
+
+         if self.github_repo:
+             owner = None
+             repo = None
+             try:
+                 owner, repo = self.github_repo.removeprefix("https://github.com/").split("/")
+                 self.github_version = self.rf.get_github_version(owner, repo, self.branch)
+                 self.github_access = True
+                 logger.debug(f"Latest GitHub commit: {self.github_version}")
+             except IndexError as e:
+                 self.github_access = False
+             except Exception as e:
+                 logger.exception(
+                     f"Failed to fetch GitHub version: {e}, for {self.github_repo}, owner: {owner}, repo: {repo}, branch: {self.branch}"
+                 )
+                 self.github_access = False
+
+         if self.github_version is None:
+             self.github_access = False
+
+     def connect_db(self):
+         if self.db_path:
+             if not os.path.exists(self.db_path):
+                 self.db_access = False
+             else:
+                 self.db_connection = DBConnection(db_file_path=Path(self.db_path))
+
+     def fetch_version_db(self):
+         if self.db_path:
+             if not os.path.exists(self.db_path):
+                 self.db_version = None
+                 self.db_access = False
+             else:
+                 try:
+                     with self.db_connection.create_session() as session:
+                         self.db_version = session.exec(select(self.db_sqlmodel.git_hash)).one()
+                         self.db_access = True
+                 except NoResultFound:
+                     logger.debug(f"Unable to find git_hash in {self.db_path}")
+                 except Exception as e:
+                     logger.debug(f"Unable to find git_hash in {self.db_path} because {e}")
+
+         else:
+             self.db_version = None
+             self.db_access = False
+
+     def fetch_versions(self):
+         if self.github_access:
+             self.fetch_version_remote()
+         self.fetch_version_local()
+         self.fetch_version_db()
+
+     def check_sync_status(self):
+         self.fetch_versions()
+         return {
+             "github": self.github_version if self.github_version else None,
+             "local": self.local_version if self.local_version else None,
+             "db": self.db_version if self.db_version else None,
+             "github_local_sync": self.github_version == self.local_version
+             if self.github_access and self.github_version and self.local_version
+             else False,
+             "local_db_sync": self.local_version == self.db_version
+             if self.local_access and self.local_version
+             else False,
+             "github_db_sync": self.github_version == self.db_version
+             if self.github_access and self.github_version
+             else False,
+         }
+
+     def clone_remote(self, force_clean=False):
+         if self.offline_mode:
+             logger.warning("Cannot clone remote repository in offline mode")
+             return
+
+         # If force_clean is True, or if a local repo exists and we're handling divergence,
+         # remove the existing local repository to ensure a clean state.
+         if force_clean and self.local_path and os.path.exists(self.local_path):
+             print(f"Removing existing local repository: {self.local_path}")
+             import shutil
+             shutil.rmtree(self.local_path)
+
+         owner, repo = self.github_repo.removeprefix("https://github.com/").split("/")
+         # TODO: add destination "local_path" in clone_repo; it is done in a weird way, improve that.
+         self.rf.clone_repository(owner, repo, self.branch, self.local_path)
+         self.fetch_version_local()
+
+     def build_db(self):
+         from esgvoc.core.db.models.project import project_create_db
+         from esgvoc.core.db.models.universe import universe_create_db
+         from esgvoc.core.db.project_ingestion import ingest_project
+         from esgvoc.core.db.universe_ingestion import ingest_metadata_universe, ingest_universe
+
+         if self.db_path:
+             if os.path.exists(self.db_path):
+                 os.remove(self.db_path)
+             else:
+                 os.makedirs(Path(self.db_path).parent, exist_ok=True)
+
+             if self.db_sqlmodel == Universe:  # Ugly
+                 print("Building Universe DB from ", self.local_path)
+                 universe_create_db(Path(self.db_path))
+                 self.db_connection = DBConnection(db_file_path=Path(self.db_path))
+
+                 ingest_metadata_universe(self.db_connection, self.local_version)
+                 print("Filling Universe DB")
+                 if self.local_path:
+                     ingest_universe(Path(self.local_path), Path(self.db_path))
+
+             elif self.db_sqlmodel == Project:
+                 print("Building Project DB from ", self.local_path)
+                 project_create_db(Path(self.db_path))
+                 print("Filling project DB")
+                 if self.local_path and self.local_version:
+                     ingest_project(Path(self.local_path), Path(self.db_path), self.local_version)
+         self.fetch_version_db()
+
+     def sync(self):
+         summary = self.check_sync_status()
+         updated = False
+
+         if self.offline_mode:
+             print("Running in offline mode - only using local repositories and databases")
+             if self.local_access:
+                 if not summary["local_db_sync"] and summary["local_db_sync"] is not None:
+                     self.build_db()
+                     updated = True
+                 else:
+                     print("Cache db is up to date with the local repository")
+             elif not self.db_access:  # it can happen if the db is created but not filled
+                 if self.local_path and os.path.exists(self.local_path):
+                     self.build_db()
+                     updated = True
+                 else:
+                     print(f"No local repository found at {self.local_path} - cannot sync in offline mode")
+             else:
+                 print("Nothing to sync in offline mode - local repository and database are up to date")
+             return updated
+
+         # Online sync logic with offline-to-online transition detection
+         if (
+             self.github_access
+             and summary["github_db_sync"] is None
+             and summary["local_db_sync"] is None
+             and summary["github_local_sync"] is None
+         ):
+             self.clone_remote()
+             self.build_db()
+             updated = True
+         elif self.github_access and not summary["github_db_sync"]:
+             if not summary["local_db_sync"] and summary["local_db_sync"] is not None:
+                 self.clone_remote()
+                 self.build_db()
+                 updated = True
+             elif not summary["github_local_sync"]:
+                 # Critical fix: when local and remote diverge in online mode, prioritize remote
+                 # truth by completely removing the local repo and re-cloning.
+                 print(f"Local and remote repositories have diverged (local: {summary['local'][:8] if summary['local'] else 'N/A'}, remote: {summary['github'][:8] if summary['github'] else 'N/A'})")
+                 print("Prioritizing remote repository truth - removing local repository and re-cloning from GitHub...")
+                 self.clone_remote(force_clean=True)
+                 self.build_db()
+                 updated = True
+             else:  # can simply be built in root and cloned if necessary
+                 self.build_db()
+                 updated = True
+         elif self.local_access:
+             if not summary["local_db_sync"] and summary["local_db_sync"] is not None:
+                 self.build_db()
+                 updated = True
+             else:
+                 print("Cache db is up to date with the local repository")
+         elif not self.db_access:  # it can happen if the db is created but not filled
+             self.build_db()
+             updated = True
+         else:
+             print("Nothing to install, everything up to date")
+             print("Try 'esgvoc status' for more details")
+         return updated
+
+
+ class StateUniverse(BaseState):
+     def __init__(self, settings: UniverseSettings):
+         params = settings.model_dump()
+         params['local_path'] = settings.get_absolute_local_path()
+         params['db_path'] = settings.get_absolute_db_path()
+         super().__init__(**params)
+         self.db_sqlmodel = Universe
+
+
+ class StateProject(BaseState):
+     def __init__(self, settings: ProjectSettings):
+         mdict = settings.model_dump()
+         self.project_name = mdict.pop("project_name")
+         mdict['local_path'] = settings.get_absolute_local_path()
+         mdict['db_path'] = settings.get_absolute_db_path()
+         super().__init__(**mdict)
+         self.db_sqlmodel = Project
+
+
+ class StateService:
+     def __init__(self, service_settings: ServiceSettings):
+         self.universe = StateUniverse(service_settings.universe)
+         self.projects = {name: StateProject(proj) for name, proj in service_settings.projects.items()}
+         self.connect_db()
+
+     def get_state_summary(self):
+         universe_status = self.universe.check_sync_status()
+         project_statuses = {name: proj.check_sync_status() for name, proj in self.projects.items()}
+         return {"universe": universe_status, "projects": project_statuses}
+
+     def fetch_versions(self):
+         self.universe.fetch_versions()
+         for _, proj_state in self.projects.items():
+             proj_state.fetch_versions()
+
+     def connect_db(self):
+         self.universe.connect_db()
+         for _, proj_state in self.projects.items():
+             proj_state.connect_db()
+
+     def synchronize_all(self):
+         print("sync universe")
+         if self.universe.offline_mode:
+             print("Universe is in offline mode")
+         universe_updated = self.universe.sync()
+         print("sync projects")
+         for project_name, project in self.projects.items():
+             if project.offline_mode:
+                 print(f"Project {project_name} is in offline mode")
+             project_updated = project.sync()
+             if universe_updated and not project_updated:
+                 project.build_db()
+         self.connect_db()
+
+     def table(self):
+         table = Table(show_header=False, show_lines=True)
+         table.add_row("", "Remote github repo", "Local repository", "Cache Database")
+         table.add_row("Universe path", self.universe.github_repo, self.universe.local_path, self.universe.db_path)
+         table.add_row("Version", self.universe.github_version, self.universe.local_version, self.universe.db_version)
+         for proj_name, proj in self.projects.items():
+             # table.add_row("", "Remote github repo", "Local repository", "Cache Database")
+             table.add_row(f"{proj_name} path", proj.github_repo, proj.local_path, proj.db_path)
+             table.add_row("Version", proj.github_version, proj.local_version, proj.db_version)
+         return table
+
+
+ if __name__ == "__main__":
+     # Load settings from file
+     service_settings = ServiceSettings.load_from_file("src/esgvoc/core/service/settings.toml")
+
+     # Initialize StateService
+     state_service = StateService(service_settings)
+     state_service.get_state_summary()
+
+     # Synchronize all
+     state_service.synchronize_all()
+
+     # pprint(state_service.universe.github_version)
+     # pprint(state_service.universe.local_version)
+     # pprint(state_service.universe.db_version)
+
+     # Check for differences
+     # pprint(state_service.find_version_differences())
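
A usage sketch adapted from the module's own __main__ block; the settings path is a placeholder and should point at your actual configuration file:

from esgvoc.core.service.configuration.setting import ServiceSettings
from esgvoc.core.service.state import StateService

# Placeholder path; adapt to your installation.
service_settings = ServiceSettings.load_from_file("src/esgvoc/core/service/settings.toml")
state_service = StateService(service_settings)

# Three-way comparison (GitHub commit vs. local clone vs. cache database)
# for the universe and each configured project.
summary = state_service.get_state_summary()
print(summary["universe"]["github_local_sync"], summary["universe"]["local_db_sync"])

# Clone and/or rebuild whatever is out of date.
state_service.synchronize_all()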
esgvoc/core/service/string_heuristics.py
@@ -0,0 +1,98 @@
+ """Heuristics for determining if strings should be resolved as ID references."""
+
+ from typing import List
+
+
+ class StringHeuristics:
+     """
+     Determine if a string value should be resolved as an ID reference.
+
+     Uses configurable heuristics to distinguish between:
+     - ID references (e.g., "hadgem3_gc31_atmosphere") - should resolve
+     - Literal strings (e.g., "A long description...") - should not resolve
+     - URLs (e.g., "https://doi.org/...") - should not resolve
+     """
+
+     def __init__(self, max_length: int = 100, exclude_patterns: List[str] | None = None):
+         """
+         Initialize string heuristics.
+
+         Args:
+             max_length: Maximum length for strings to be considered as ID references.
+                 Longer strings are assumed to be content, not references.
+             exclude_patterns: Patterns that disqualify a string from being an ID reference.
+                 Defaults to [" ", ".", "http", "/", "@"], which filters out
+                 descriptions, URLs, DOIs, paths, and emails.
+         """
+         self.max_length = max_length
+         self.exclude_patterns = exclude_patterns or [" ", ".", "http", "/", "@"]
+
+     def is_resolvable(self, value: str) -> bool:
+         """
+         Check if a string looks like an ID reference that should be resolved.
+
+         Args:
+             value: The string to evaluate
+
+         Returns:
+             True if the string appears to be an ID reference, False otherwise
+
+         Example:
+             >>> heuristics = StringHeuristics()
+             >>> heuristics.is_resolvable("hadgem3_gc31_atmosphere")
+             True
+             >>> heuristics.is_resolvable("This is a long description text")
+             False
+             >>> heuristics.is_resolvable("https://doi.org/10.5194/gmd")
+             False
+         """
+         # Check length
+         if len(value) > self.max_length:
+             return False
+
+         # Check for exclude patterns
+         for pattern in self.exclude_patterns:
+             if pattern in value:
+                 return False
+
+         return True
+
+     def should_skip_literal(self, expanded_data: dict) -> bool:
+         """
+         Check if the expanded data indicates this is a literal value (not a reference).
+
+         In JSON-LD, literal values are marked with @value in expanded form.
+
+         Args:
+             expanded_data: The expanded JSON-LD data
+
+         Returns:
+             True if this is a literal value that should not be resolved
+
+         Example:
+             >>> heuristics = StringHeuristics()
+             >>> heuristics.should_skip_literal({"@value": "some text"})
+             True
+             >>> heuristics.should_skip_literal({"@id": "some_term"})
+             False
+         """
+         return isinstance(expanded_data, dict) and "@value" in expanded_data
+
+     def has_id_in_expanded(self, expanded_data: dict) -> bool:
+         """
+         Check if the expanded data contains an @id, indicating it's a reference.
+
+         Args:
+             expanded_data: The expanded JSON-LD data
+
+         Returns:
+             True if the expanded data has an @id field
+
+         Example:
+             >>> heuristics = StringHeuristics()
+             >>> heuristics.has_id_in_expanded({"@id": "https://example.com/term"})
+             True
+             >>> heuristics.has_id_in_expanded({"@value": "literal"})
+             False
+         """
+         return isinstance(expanded_data, dict) and "@id" in expanded_data
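
A short usage sketch based only on the class above, showing how the heuristics separate ID-like tokens from literals and URLs:

from esgvoc.core.service.string_heuristics import StringHeuristics

heuristics = StringHeuristics(max_length=100)

candidates = ["hadgem3_gc31_atmosphere", "A long description of the grid.", "https://doi.org/10.5194/gmd"]
print([value for value in candidates if heuristics.is_resolvable(value)])  # ['hadgem3_gc31_atmosphere']

# After JSON-LD expansion, literals carry @value and references carry @id.
print(heuristics.should_skip_literal({"@value": "some text"}))              # True
print(heuristics.has_id_in_expanded({"@id": "https://example.com/term"}))   # True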
esgvoc/core/service/term_cache.py
@@ -0,0 +1,108 @@
+ """Caching for resolved JSON-LD terms to improve performance."""
+
+ import json
+ import logging
+ from functools import lru_cache
+ from pathlib import Path
+ from typing import Dict
+
+ logger = logging.getLogger(__name__)
+
+
+ class TermCache:
+     """
+     LRU cache for JSON-LD terms fetched from files or remote sources.
+
+     Caching reduces redundant file I/O and network calls when the same
+     terms are referenced multiple times during resolution.
+     """
+
+     def __init__(self, max_size: int = 128, enabled: bool = True):
+         """
+         Initialize the term cache.
+
+         Args:
+             max_size: Maximum number of terms to cache
+             enabled: Whether caching is enabled (can be disabled for debugging)
+         """
+         self.max_size = max_size
+         self.enabled = enabled
+         self._cache: Dict[str, dict] = {}
+         self._hits = 0
+         self._misses = 0
+
+     def get(self, uri: str) -> dict | None:
+         """
+         Retrieve a cached term by URI.
+
+         Args:
+             uri: The URI key for the cached term
+
+         Returns:
+             The cached term data, or None if not in cache
+         """
+         if not self.enabled:
+             return None
+
+         if uri in self._cache:
+             self._hits += 1
+             logger.debug(f"Cache hit for {uri}")
+             return self._cache[uri]
+
+         self._misses += 1
+         return None
+
+     def put(self, uri: str, data: dict) -> None:
+         """
+         Store a term in the cache.
+
+         Args:
+             uri: The URI key for the term
+             data: The term data to cache
+         """
+         if not self.enabled:
+             return
+
+         # Simple LRU: if cache is full, remove the oldest entry
+         if len(self._cache) >= self.max_size:
+             # Remove first item (oldest in insertion order for Python 3.7+)
+             oldest_key = next(iter(self._cache))
+             del self._cache[oldest_key]
+             logger.debug(f"Cache eviction: {oldest_key}")
+
+         self._cache[uri] = data
+         logger.debug(f"Cached {uri}")
+
+     def clear(self) -> None:
+         """Clear all cached terms."""
+         self._cache.clear()
+         self._hits = 0
+         self._misses = 0
+         logger.debug("Cache cleared")
+
+     def get_stats(self) -> Dict[str, int]:
+         """
+         Get cache statistics.
+
+         Returns:
+             Dictionary with cache hits, misses, size, and hit rate
+         """
+         total_requests = self._hits + self._misses
+         hit_rate = (self._hits / total_requests * 100) if total_requests > 0 else 0
+
+         return {
+             "hits": self._hits,
+             "misses": self._misses,
+             "size": len(self._cache),
+             "max_size": self.max_size,
+             "hit_rate_percent": round(hit_rate, 2),
+         }
+
+     def __repr__(self) -> str:
+         """String representation showing cache stats."""
+         stats = self.get_stats()
+         return (
+             f"TermCache(size={stats['size']}/{stats['max_size']}, "
+             f"hits={stats['hits']}, misses={stats['misses']}, "
+             f"hit_rate={stats['hit_rate_percent']}%)"
+         )
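
A usage sketch based only on the class above; the URI keys are illustrative, not actual esgvoc identifiers:

from esgvoc.core.service.term_cache import TermCache

cache = TermCache(max_size=2)
cache.put("universe/institution/ipsl", {"@id": "ipsl"})
cache.put("universe/institution/mohc", {"@id": "mohc"})

print(cache.get("universe/institution/ipsl"))  # hit: returns the cached dict
print(cache.get("universe/institution/noaa"))  # miss: returns None

# A third entry evicts the oldest one (eviction follows insertion order, not access order).
cache.put("universe/institution/noaa", {"@id": "noaa"})
print(cache.get_stats())  # {'hits': 1, 'misses': 1, 'size': 2, 'max_size': 2, 'hit_rate_percent': 50.0}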