dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backend.py +93 -0
- dsgrid/__init__.py +22 -0
- dsgrid/api/__init__.py +0 -0
- dsgrid/api/api_manager.py +179 -0
- dsgrid/api/app.py +419 -0
- dsgrid/api/models.py +60 -0
- dsgrid/api/response_models.py +116 -0
- dsgrid/apps/__init__.py +0 -0
- dsgrid/apps/project_viewer/app.py +216 -0
- dsgrid/apps/registration_gui.py +444 -0
- dsgrid/chronify.py +32 -0
- dsgrid/cli/__init__.py +0 -0
- dsgrid/cli/common.py +120 -0
- dsgrid/cli/config.py +176 -0
- dsgrid/cli/download.py +13 -0
- dsgrid/cli/dsgrid.py +157 -0
- dsgrid/cli/dsgrid_admin.py +92 -0
- dsgrid/cli/install_notebooks.py +62 -0
- dsgrid/cli/query.py +729 -0
- dsgrid/cli/registry.py +1862 -0
- dsgrid/cloud/__init__.py +0 -0
- dsgrid/cloud/cloud_storage_interface.py +140 -0
- dsgrid/cloud/factory.py +31 -0
- dsgrid/cloud/fake_storage_interface.py +37 -0
- dsgrid/cloud/s3_storage_interface.py +156 -0
- dsgrid/common.py +36 -0
- dsgrid/config/__init__.py +0 -0
- dsgrid/config/annual_time_dimension_config.py +194 -0
- dsgrid/config/common.py +142 -0
- dsgrid/config/config_base.py +148 -0
- dsgrid/config/dataset_config.py +907 -0
- dsgrid/config/dataset_schema_handler_factory.py +46 -0
- dsgrid/config/date_time_dimension_config.py +136 -0
- dsgrid/config/dimension_config.py +54 -0
- dsgrid/config/dimension_config_factory.py +65 -0
- dsgrid/config/dimension_mapping_base.py +350 -0
- dsgrid/config/dimension_mappings_config.py +48 -0
- dsgrid/config/dimensions.py +1025 -0
- dsgrid/config/dimensions_config.py +71 -0
- dsgrid/config/file_schema.py +190 -0
- dsgrid/config/index_time_dimension_config.py +80 -0
- dsgrid/config/input_dataset_requirements.py +31 -0
- dsgrid/config/mapping_tables.py +209 -0
- dsgrid/config/noop_time_dimension_config.py +42 -0
- dsgrid/config/project_config.py +1462 -0
- dsgrid/config/registration_models.py +188 -0
- dsgrid/config/representative_period_time_dimension_config.py +194 -0
- dsgrid/config/simple_models.py +49 -0
- dsgrid/config/supplemental_dimension.py +29 -0
- dsgrid/config/time_dimension_base_config.py +192 -0
- dsgrid/data_models.py +155 -0
- dsgrid/dataset/__init__.py +0 -0
- dsgrid/dataset/dataset.py +123 -0
- dsgrid/dataset/dataset_expression_handler.py +86 -0
- dsgrid/dataset/dataset_mapping_manager.py +121 -0
- dsgrid/dataset/dataset_schema_handler_base.py +945 -0
- dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
- dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
- dsgrid/dataset/growth_rates.py +162 -0
- dsgrid/dataset/models.py +51 -0
- dsgrid/dataset/table_format_handler_base.py +257 -0
- dsgrid/dataset/table_format_handler_factory.py +17 -0
- dsgrid/dataset/unpivoted_table.py +121 -0
- dsgrid/dimension/__init__.py +0 -0
- dsgrid/dimension/base_models.py +230 -0
- dsgrid/dimension/dimension_filters.py +308 -0
- dsgrid/dimension/standard.py +252 -0
- dsgrid/dimension/time.py +352 -0
- dsgrid/dimension/time_utils.py +103 -0
- dsgrid/dsgrid_rc.py +88 -0
- dsgrid/exceptions.py +105 -0
- dsgrid/filesystem/__init__.py +0 -0
- dsgrid/filesystem/cloud_filesystem.py +32 -0
- dsgrid/filesystem/factory.py +32 -0
- dsgrid/filesystem/filesystem_interface.py +136 -0
- dsgrid/filesystem/local_filesystem.py +74 -0
- dsgrid/filesystem/s3_filesystem.py +118 -0
- dsgrid/loggers.py +132 -0
- dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
- dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
- dsgrid/notebooks/registration.ipynb +48 -0
- dsgrid/notebooks/start_notebook.sh +11 -0
- dsgrid/project.py +451 -0
- dsgrid/query/__init__.py +0 -0
- dsgrid/query/dataset_mapping_plan.py +142 -0
- dsgrid/query/derived_dataset.py +388 -0
- dsgrid/query/models.py +728 -0
- dsgrid/query/query_context.py +287 -0
- dsgrid/query/query_submitter.py +994 -0
- dsgrid/query/report_factory.py +19 -0
- dsgrid/query/report_peak_load.py +70 -0
- dsgrid/query/reports_base.py +20 -0
- dsgrid/registry/__init__.py +0 -0
- dsgrid/registry/bulk_register.py +165 -0
- dsgrid/registry/common.py +287 -0
- dsgrid/registry/config_update_checker_base.py +63 -0
- dsgrid/registry/data_store_factory.py +34 -0
- dsgrid/registry/data_store_interface.py +74 -0
- dsgrid/registry/dataset_config_generator.py +158 -0
- dsgrid/registry/dataset_registry_manager.py +950 -0
- dsgrid/registry/dataset_update_checker.py +16 -0
- dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
- dsgrid/registry/dimension_mapping_update_checker.py +16 -0
- dsgrid/registry/dimension_registry_manager.py +413 -0
- dsgrid/registry/dimension_update_checker.py +16 -0
- dsgrid/registry/duckdb_data_store.py +207 -0
- dsgrid/registry/filesystem_data_store.py +150 -0
- dsgrid/registry/filter_registry_manager.py +123 -0
- dsgrid/registry/project_config_generator.py +57 -0
- dsgrid/registry/project_registry_manager.py +1623 -0
- dsgrid/registry/project_update_checker.py +48 -0
- dsgrid/registry/registration_context.py +223 -0
- dsgrid/registry/registry_auto_updater.py +316 -0
- dsgrid/registry/registry_database.py +667 -0
- dsgrid/registry/registry_interface.py +446 -0
- dsgrid/registry/registry_manager.py +558 -0
- dsgrid/registry/registry_manager_base.py +367 -0
- dsgrid/registry/versioning.py +92 -0
- dsgrid/rust_ext/__init__.py +14 -0
- dsgrid/rust_ext/find_minimal_patterns.py +129 -0
- dsgrid/spark/__init__.py +0 -0
- dsgrid/spark/functions.py +589 -0
- dsgrid/spark/types.py +110 -0
- dsgrid/tests/__init__.py +0 -0
- dsgrid/tests/common.py +140 -0
- dsgrid/tests/make_us_data_registry.py +265 -0
- dsgrid/tests/register_derived_datasets.py +103 -0
- dsgrid/tests/utils.py +25 -0
- dsgrid/time/__init__.py +0 -0
- dsgrid/time/time_conversions.py +80 -0
- dsgrid/time/types.py +67 -0
- dsgrid/units/__init__.py +0 -0
- dsgrid/units/constants.py +113 -0
- dsgrid/units/convert.py +71 -0
- dsgrid/units/energy.py +145 -0
- dsgrid/units/power.py +87 -0
- dsgrid/utils/__init__.py +0 -0
- dsgrid/utils/dataset.py +830 -0
- dsgrid/utils/files.py +179 -0
- dsgrid/utils/filters.py +125 -0
- dsgrid/utils/id_remappings.py +100 -0
- dsgrid/utils/py_expression_eval/LICENSE +19 -0
- dsgrid/utils/py_expression_eval/README.md +8 -0
- dsgrid/utils/py_expression_eval/__init__.py +847 -0
- dsgrid/utils/py_expression_eval/tests.py +283 -0
- dsgrid/utils/run_command.py +70 -0
- dsgrid/utils/scratch_dir_context.py +65 -0
- dsgrid/utils/spark.py +918 -0
- dsgrid/utils/spark_partition.py +98 -0
- dsgrid/utils/timing.py +239 -0
- dsgrid/utils/utilities.py +221 -0
- dsgrid/utils/versioning.py +36 -0
- dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
- dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
- dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
- dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
- dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
"""Base class for all registry managers."""
|
|
2
|
+
|
|
3
|
+
import abc
|
|
4
|
+
import copy
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Self, Type
|
|
8
|
+
|
|
9
|
+
from semver import VersionInfo
|
|
10
|
+
from sqlalchemy import Connection
|
|
11
|
+
|
|
12
|
+
from dsgrid.config.config_base import ConfigBase
|
|
13
|
+
from dsgrid.exceptions import (
|
|
14
|
+
DSGInvalidParameter,
|
|
15
|
+
DSGValueNotRegistered,
|
|
16
|
+
DSGDuplicateValueRegistered,
|
|
17
|
+
)
|
|
18
|
+
from dsgrid.registry.registration_context import RegistrationContext
|
|
19
|
+
from dsgrid.registry.registry_interface import RegistryInterfaceBase
|
|
20
|
+
from dsgrid.registry.common import RegistryManagerParams, VersionUpdateType
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class RegistryManagerBase(abc.ABC):
    """Base class for all registry managers.

    Subclasses provide the config class, the database interface, and the
    register/update/remove operations. This base class supplies the shared helpers
    for version bumping, registration checks, lookups, and dumping configs to disk.
    """

    def __init__(self, path, params: RegistryManagerParams):
        """Store the registry data path and the manager parameters.

        Parameters
        ----------
        path : Path | str
            Location of the registry data. A warning is logged if it does not exist.
        params : RegistryManagerParams

        """
        self._path = path
        self._params = params
        self._db = None

        # `load` documents `path` as a str, so tolerate plain strings here while
        # leaving any object that already provides `exists()` untouched.
        data_path = path if hasattr(path, "exists") else Path(path)
        if not data_path.exists():
            logger.warning(
                "The registry data path=%s does not exist. You will be able to inspect the "
                "registry contents, but you will not be able to perform any data-related "
                "activities.",
                path,
            )

    @property
    @abc.abstractmethod
    def db(self) -> RegistryInterfaceBase:
        """Return the database interface."""

    @db.setter
    @abc.abstractmethod
    def db(self, db: RegistryInterfaceBase) -> None:
        """Set the database interface."""

    @classmethod
    def load(cls, path, params, db, *args: Any, **kwargs: Any) -> Self:
        """Load the registry manager.

        Parameters
        ----------
        path : Path | str
        params : RegistryManagerParams
        db : RegistryInterfaceBase
            Database interface assigned to the new manager.
        *args, **kwargs
            Accepted but not used by this base implementation.

        Returns
        -------
        RegistryManagerBase

        """
        mgr = cls(path, params)
        mgr.db = db
        return mgr

    @classmethod
    def _load(cls, path, params: RegistryManagerParams, *args):
        """Construct the manager, forwarding any extra positional args to the constructor."""
        mgr = cls(path, params, *args)
        return mgr

    @staticmethod
    @abc.abstractmethod
    def config_class() -> Type:
        """Return the class used for storing the config."""

    @abc.abstractmethod
    def get_by_id(
        self, config_id: str, version: str | None = None, conn: Connection | None = None
    ) -> ConfigBase:
        """Get the item matching the ID. Returns from cache if already loaded.

        Parameters
        ----------
        config_id : str
        version : str | None
            If None, return the latest version.
        conn : Connection | None

        Returns
        -------
        ConfigBase

        Raises
        ------
        DSGValueNotRegistered
            Raised if the ID is not stored.

        """

    @staticmethod
    @abc.abstractmethod
    def name() -> str:
        """Return the name of the registry, used for reporting.

        Returns
        -------
        str

        """

    @abc.abstractmethod
    def register(self, *args: Any, **kwargs: Any) -> Any:
        """Registers a config file in the registry.

        Raises
        ------
        ValueError
            Raised if the config_file is invalid.
        DSGDuplicateValueRegistered
            Raised if the config ID is already registered.

        """

    @abc.abstractmethod
    def register_from_config(self, config: ConfigBase, *args: Any, **kwargs: Any) -> Any:
        """Registers an already-loaded config in the registry.

        Raises
        ------
        ValueError
            Raised if the config is invalid.
        DSGDuplicateValueRegistered
            Raised if the config ID is already registered.

        """

    @abc.abstractmethod
    def update_from_file(self, *args: Any, **kwargs: Any) -> ConfigBase:
        """Updates the current registry with new parameters or data from a config file.

        Raises
        ------
        ValueError
            Raised if the config_file is invalid.
        DSGInvalidParameter
            Raised if config_id does not match config_file.
            Raised if the version is not the current version.

        """

    @abc.abstractmethod
    def update(
        self,
        config: ConfigBase,
        *args: Any,
        **kwargs: Any,
    ) -> ConfigBase:
        """Updates the current registry with new parameters or data.

        Raises
        ------
        ValueError
            Raised if the config is invalid.
        DSGInvalidParameter
            Raised if config_id does not match the config.
            Raised if the version is not the current version.

        """

    def _check_update(
        self, conn: Connection, config: ConfigBase, config_id: str, version: str
    ) -> None:
        """Validate that an update targets the expected config and the latest version.

        Raises
        ------
        DSGInvalidParameter
            Raised if config_id does not match the config's ID.
            Raised if version is not the latest registered version.

        """
        if config.config_id != config_id:
            msg = f"ID={config_id} does not match ID in file: {config.config_id}"
            raise DSGInvalidParameter(msg)

        cur_version = self.get_latest_version(config_id, conn=conn)
        if version != cur_version:
            msg = f"version={version} is not current. Current={cur_version}"
            raise DSGInvalidParameter(msg)

    @staticmethod
    def get_next_version(version: str, update_type: VersionUpdateType) -> str:
        """Return the version that follows ``version`` for the given update type.

        Parameters
        ----------
        version : str
            Version string, parsed with ``VersionInfo.parse``.
        update_type : VersionUpdateType

        Returns
        -------
        str

        Raises
        ------
        NotImplementedError
            Raised if update_type is not MAJOR, MINOR, or PATCH.

        """
        ver = VersionInfo.parse(version)
        if update_type == VersionUpdateType.MAJOR:
            next_version = ver.bump_major()
        elif update_type == VersionUpdateType.MINOR:
            next_version = ver.bump_minor()
        elif update_type == VersionUpdateType.PATCH:
            next_version = ver.bump_patch()
        else:
            msg = f"invalid version {update_type=}"
            raise NotImplementedError(msg)

        return str(next_version)

    def _update_config(self, config, context: RegistrationContext):
        """Bump the config's version and store the updated model in the database.

        The config's own model is not modified; a deep copy receives the new version.

        Returns
        -------
        The model returned by the database interface's ``update``.

        """
        config_id = config.config_id
        cur_version = config.model.version
        # Work on a copy so that the caller's config keeps its current version.
        new_model = copy.deepcopy(config.model)
        new_model.version = self.get_next_version(cur_version, context.registration.update_type)
        updated_model = self.db.update(context.connection, new_model, context.registration)
        logger.info(
            "Updated registry and config information for %s ID=%s version=%s",
            self.name(),
            config_id,
            updated_model.version,
        )
        return updated_model

    def _check_if_already_registered(self, conn: Connection, config_id):
        """Raise DSGDuplicateValueRegistered if config_id is already in the database."""
        if self.db.has(conn, config_id):
            msg = f"{self.name()}={config_id}"
            raise DSGDuplicateValueRegistered(msg)

    def _check_if_not_registered(self, conn: Connection, config_id):
        """Raise DSGValueNotRegistered if config_id is not in the database."""
        if not self.db.has(conn, config_id):
            msg = f"{self.name()}={config_id}"
            raise DSGValueNotRegistered(msg)

    def _log_offline_mode_prefix(self):
        """Return a log-message prefix flagging offline mode, or an empty string."""
        return "* OFFLINE MODE * |" if self.offline_mode else ""

    @property
    def cloud_interface(self):
        """Return the CloudStorageInterface to sync remote data."""
        return self._params.cloud_interface

    @cloud_interface.setter
    def cloud_interface(self, cloud_interface):
        """Set the CloudStorageInterface (used in testing)"""
        # The params object is replaced rather than mutated in place.
        self._params = self._params._replace(cloud_interface=cloud_interface)

    def dump(
        self,
        config_id,
        directory,
        version=None,
        conn: Connection | None = None,
        force: bool = False,
    ):
        """Dump the config file to directory.

        The directory, including any missing parents, is created if needed.

        Parameters
        ----------
        config_id : str
        directory : str
        version : str | None
            Defaults to current version.
        conn : Connection | None
        force : bool
            If True, overwrite files if they exist.

        """
        path = Path(directory)
        path.mkdir(exist_ok=True, parents=True)
        config = self.get_by_id(config_id, version, conn=conn)
        filename = config.serialize(path, force=force)
        logger.info(
            "Dumped config for type=%s ID=%s version=%s to %s",
            self.name(),
            config_id,
            config.model.version,
            filename,
        )

    def finalize_registration(self, conn: Connection, config_ids: set[str], error_occurred: bool):
        """Perform final actions after a registration process.

        The base implementation does nothing.

        Parameters
        ----------
        conn : Connection
        config_ids : set[str]
            Config IDs that were registered
        error_occurred : bool
            Set to True if an error occurred and all intermediately-registered IDs should be
            removed.
        """

    @property
    def fs_interface(self):
        """Return the FilesystemInterface to list directories and read/write files."""
        return self._params.fs_interface

    @property
    def offline_mode(self):
        """Return True if there is to be no syncing with the remote registry."""
        return self._params.offline

    def get_latest_version(self, config_id, conn: Connection | None = None):
        """Return the current version in the registry.

        Returns
        -------
        str

        """
        return self.db.get_latest_version(conn, config_id)

    def get_registry_data_directory(self, config_id):
        """Return the directory containing data for config_id (parquet files).

        Parameters
        ----------
        config_id : str

        Returns
        -------
        Path
            ``<base_path>/data/<config_id>``

        """
        return Path(self._params.base_path) / "data" / config_id

    def has_id(self, config_id, version=None, conn: Connection | None = None):
        """Return True if an item matching the parameters is stored.

        Parameters
        ----------
        config_id : str
        version : str
            If None, use latest.

        Returns
        -------
        bool

        """
        return self.db.has(conn, config_id, version=version)

    def iter_configs(self, conn: Connection | None = None):
        """Return an iterator over the registered configs."""
        for config_id in self.iter_ids(conn):
            yield self.get_by_id(config_id, conn=conn)

    def iter_ids(self, conn: Connection | None = None):
        """Return an iterator over the registered dsgrid IDs."""
        yield from self.db.list_model_ids(conn)

    def list_ids(self, conn: Connection | None = None, **kwargs: Any):
        """Return the IDs in sorted order.

        ``kwargs`` are accepted but not used by this base implementation.

        Returns
        -------
        list

        """
        return sorted(self.iter_ids(conn))

    def relative_remote_path(self, path):
        """Return relative remote registry path.

        ``path`` must be located under the registry base path; the part below the
        base path is appended to the remote path.
        """
        relative_path = Path(path).relative_to(self._params.base_path)
        remote_path = f"{self._params.remote_path}/{relative_path}"
        return remote_path

    @abc.abstractmethod
    def remove(self, config_id: str, conn: Connection | None = None) -> None:
        """Remove an item from the registry.

        Parameters
        ----------
        config_id : str
        conn : Connection | None

        Raises
        ------
        DSGValueNotRegistered
            Raised if the config_id is not registered.

        """
        # TODO: Do we want to handle specific versions? This removes all configs.
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json5
|
|
3
|
+
|
|
4
|
+
DATASET_REGISTRY_PATH = "registry/datasets/"
PROJECT_REGISTRY_PATH = "registry/projects/"
DIMENSION_REGISTRY_PATH = "registry/dimensions/"


def versioning(registry_type, id_handle, update):
    """Determine registration or project version for registration.

    TODO: Current solution is a quick hack. This needs to be better/formalized.
        - Need smarter version updating / checks; use semver packages
        - Set to work with some central version (like S3)
        - Currently only updating major version
        - NOTE: not currently utilizing the update_type in
            ConfigRegistrationDetails. Could use this to set
            major/minor/patch update decisions

    Args:
        registry_type (str): type of registry; one of "dataset", "project",
            or "dimension"
        id_handle (str): ID handle is either the project_id or dataset_id
        update (bool): config registration update setting

    Returns:
        str: version handle in the form ``<id_handle>-v<major>.<minor>.<patch>``

    Raises:
        ValueError: if registry_type is not supported, if the registry path does
            not exist, if update is False and v1.0.0 is already registered, or if
            update is True and nothing is registered for id_handle.
    """

    # get registry path
    if registry_type == "dataset":
        registry_path = DATASET_REGISTRY_PATH
    elif registry_type == "project":
        registry_path = PROJECT_REGISTRY_PATH
    elif registry_type == "dimension":
        registry_path = DIMENSION_REGISTRY_PATH
    else:
        # Fail with a clear error instead of an unbound-variable error further down.
        msg = f"Unsupported registry_type: {registry_type!r}"
        raise ValueError(msg)

    # TODO: remove when done. project path should be set somewhere else
    if not os.path.exists(registry_path):
        msg = f"Path does not exist: {registry_path}"
        raise ValueError(msg)

    # if config.update is False, then assume major=1, minor=0, patch=0
    if not update:
        version = f"{id_handle}-v1.0.0"
        registry_file = os.path.join(registry_path, f"{version}.json5")
        # Raise error if v1.0.0 registry exists for project_id
        if os.path.exists(registry_file):
            msg = (
                f'{registry_type} registry for "{registry_file}" already '
                f"exists. If you want to update the project registration"
                f" with a new {registry_type} version, then you will need to"
                f" set update=True in {registry_type} config. Alternatively, "
                f"if you want to initiate a new dsgrid {registry_type}, you "
                "will need to specify a new version handle in the "
                f"{registry_type} config."
            )
            raise ValueError(msg)
        return version

    # update is true: list existing registries for this ID.
    # Match on "<id_handle>-v" so that IDs sharing a prefix (e.g., "foo" and
    # "foobar") are not confused, and slice rather than split on "-v" so that IDs
    # containing "-v" are parsed correctly.
    prefix = f"{id_handle}-v"
    existing_versions = []
    for entry in os.listdir(registry_path):
        if not entry.startswith(prefix):
            continue
        major_text = entry[len(prefix):].split(".")[0]
        if major_text.isdigit():
            existing_versions.append(int(major_text))

    # check for existing registries
    if not existing_versions:
        msg = (
            "Registration.update=True, however, no updates can be made "
            f"because there are no existing registries for {registry_type}"
            f" ID = {id_handle}. Check project_id or set "
            f"Registration.update=False in the {registry_type} Config."
        )
        raise ValueError(msg)

    # find the latest registry version
    # NOTE: this is currently based on major version only
    last_vmajor_nbr = max(existing_versions)
    old_project_version = f"{id_handle}-v{last_vmajor_nbr}.0.0"
    old_registry_file = os.path.join(registry_path, f"{old_project_version}.json5")

    # deprecate old registry; json5.load requires an open file, not a path
    with open(old_registry_file, encoding="utf-8") as f:
        t = json5.load(f)
    t["status"] = "Deprecated"
    with open(old_registry_file, "w", encoding="utf-8") as f:
        json5.dump(t, f)

    # update version
    # TODO NEED REAL LOGIC FOR THIS!
    #  - Currently assuming only major version is being updated
    major = last_vmajor_nbr + 1
    minor = 0  # TODO: assume 0 for now
    patch = 0  # TODO: assume 0 for now

    return f"{id_handle}-v{major}.{minor}.{patch}"
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Python wrapper for Rust-based pattern finding functionality."""
|
|
2
|
+
|
|
3
|
+
from dsgrid.rust_ext.find_minimal_patterns import find_minimal_patterns_from_file
|
|
4
|
+
|
|
5
|
+
# Import the compiled Rust extension. If it is missing, re-raise with guidance on
# how to build it; the message must be passed to the exception or it is lost.
try:
    from dsgrid.minimal_patterns import Pattern, PatternConfig
except ImportError as e:
    msg = (
        "Failed to import minimal_patterns Rust extension. "
        "Make sure the package was built with maturin: `pip install -e .` or `maturin develop`"
    )
    raise ImportError(msg) from e
|
|
13
|
+
|
|
14
|
+
__all__ = ["Pattern", "PatternConfig", "find_minimal_patterns_from_file"]
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""Python wrapper for Rust-based pattern finding functionality."""
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import logging
|
|
5
|
+
import shutil
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
# Import the compiled Rust extension. If it is missing, re-raise with guidance on
# how to build it; the message must be passed to the exception or it is lost.
try:
    from dsgrid.minimal_patterns import Pattern, PatternConfig, find_minimal_patterns
except ImportError as e:
    msg = (
        "Failed to import minimal_patterns Rust extension. "
        "Make sure the package was built with maturin: `pip install -e .` or `maturin develop`"
    )
    raise ImportError(msg) from e
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def find_minimal_patterns_from_file(
    file_path: str | Path,
    max_depth: int = 0,
    prune_miss_empty: bool = True,
    ratio_threshold: float = 50.0,
    threads: int = 0,
    verbose: bool = False,
    output_dir: str | Path | None = "missing_associations",
) -> list[Pattern]:
    """Find minimal closed patterns in a Parquet file containing categorical data.

    This function analyzes a Parquet file to discover minimal closed patterns -
    the simplest column combinations that characterize complete subsets of your data.
    Patterns are grouped by their column combinations and written to CSV files.

    Parameters
    ----------
    file_path : str | Path
        Path to the input Parquet file
    max_depth : int, optional
        Maximum pattern size (number of columns). 0 = unlimited. Default: 0.
    prune_miss_empty : bool, optional
        Prune patterns with no matching rows (recommended: True). Default: True.
    ratio_threshold : float, optional
        Ratio threshold for pruning. Default: 50.0.
    threads : int, optional
        Number of threads to use (0 = use all available cores). Default: 0.
    verbose : bool, optional
        Enable verbose progress output. Default: False.
    output_dir : str | Path | None, optional
        Directory to write CSV files grouping patterns by column combinations.
        Each unique combination of columns produces a separate CSV file named
        ``<col1>__<col2>__...__<colN>.csv``. If None, no files are written.
        WARNING: if the directory already exists, it is deleted along with all of
        its contents before the new files are written.
        Default: "missing_associations".

    Returns
    -------
    list[Pattern]
        List of Pattern objects, each containing:

        - pattern_id : Unique identifier
        - columns : List of column names in the pattern
        - values : List of values for each column
        - num_rows : Number of rows matching this pattern

    Raises
    ------
    RuntimeError
        If there's an error reading the Parquet file or finding patterns.

    Examples
    --------
    >>> from dsgrid.rust_ext import find_minimal_patterns_from_file
    >>> patterns = find_minimal_patterns_from_file(
    ...     "missing_records.parquet", max_depth=3, verbose=True
    ... )
    >>> for p in patterns:
    ...     print(f"Pattern {p.pattern_id}: {p.columns} = {p.values} ({p.num_rows} rows)")
    """
    config = PatternConfig(
        max_depth=max_depth,
        prune_miss_empty=prune_miss_empty,
        ratio_threshold=ratio_threshold,
        threads=threads,
        verbose=verbose,
    )

    # The path is handed to the extension as a plain string.
    parquet_path_str = str(file_path)
    logger.info("Finding minimal closed patterns in %s", parquet_path_str)

    patterns = find_minimal_patterns(parquet_path_str, config)

    logger.info("Found %d minimal closed patterns", len(patterns))

    if output_dir is not None:
        _write_patterns_to_csv(patterns, output_dir)

    return patterns
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _write_patterns_to_csv(patterns: list[Pattern], output_dir: str | Path) -> None:
    """Write one CSV file per distinct column combination found in the patterns.

    An existing output directory is removed first, then recreated empty. Each file
    is named by joining the column names with ``__`` and holds a header row of the
    column names followed by one row of values per pattern.

    Parameters
    ----------
    patterns : list[Pattern]
        List of Pattern objects to write.
    output_dir : str | Path
        Directory to write CSV files to.
    """
    destination = Path(output_dir)
    # Start from an empty directory so that files from an earlier run do not linger.
    if destination.exists():
        shutil.rmtree(destination)
    destination.mkdir(parents=True)

    by_columns: dict[tuple[str, ...], list[Pattern]] = defaultdict(list)
    for item in patterns:
        by_columns[tuple(item.columns)].append(item)

    for column_names, members in by_columns.items():
        csv_file = destination / ("__".join(column_names) + ".csv")

        with open(csv_file, "w", newline="") as handle:
            out = csv.writer(handle)
            out.writerow(list(column_names))
            out.writerows(member.values for member in members)

        logger.info("Wrote %d patterns to %s", len(members), csv_file)
|
dsgrid/spark/__init__.py
ADDED
|
File without changes
|