dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backend.py +93 -0
- dsgrid/__init__.py +22 -0
- dsgrid/api/__init__.py +0 -0
- dsgrid/api/api_manager.py +179 -0
- dsgrid/api/app.py +419 -0
- dsgrid/api/models.py +60 -0
- dsgrid/api/response_models.py +116 -0
- dsgrid/apps/__init__.py +0 -0
- dsgrid/apps/project_viewer/app.py +216 -0
- dsgrid/apps/registration_gui.py +444 -0
- dsgrid/chronify.py +32 -0
- dsgrid/cli/__init__.py +0 -0
- dsgrid/cli/common.py +120 -0
- dsgrid/cli/config.py +176 -0
- dsgrid/cli/download.py +13 -0
- dsgrid/cli/dsgrid.py +157 -0
- dsgrid/cli/dsgrid_admin.py +92 -0
- dsgrid/cli/install_notebooks.py +62 -0
- dsgrid/cli/query.py +729 -0
- dsgrid/cli/registry.py +1862 -0
- dsgrid/cloud/__init__.py +0 -0
- dsgrid/cloud/cloud_storage_interface.py +140 -0
- dsgrid/cloud/factory.py +31 -0
- dsgrid/cloud/fake_storage_interface.py +37 -0
- dsgrid/cloud/s3_storage_interface.py +156 -0
- dsgrid/common.py +36 -0
- dsgrid/config/__init__.py +0 -0
- dsgrid/config/annual_time_dimension_config.py +194 -0
- dsgrid/config/common.py +142 -0
- dsgrid/config/config_base.py +148 -0
- dsgrid/config/dataset_config.py +907 -0
- dsgrid/config/dataset_schema_handler_factory.py +46 -0
- dsgrid/config/date_time_dimension_config.py +136 -0
- dsgrid/config/dimension_config.py +54 -0
- dsgrid/config/dimension_config_factory.py +65 -0
- dsgrid/config/dimension_mapping_base.py +350 -0
- dsgrid/config/dimension_mappings_config.py +48 -0
- dsgrid/config/dimensions.py +1025 -0
- dsgrid/config/dimensions_config.py +71 -0
- dsgrid/config/file_schema.py +190 -0
- dsgrid/config/index_time_dimension_config.py +80 -0
- dsgrid/config/input_dataset_requirements.py +31 -0
- dsgrid/config/mapping_tables.py +209 -0
- dsgrid/config/noop_time_dimension_config.py +42 -0
- dsgrid/config/project_config.py +1462 -0
- dsgrid/config/registration_models.py +188 -0
- dsgrid/config/representative_period_time_dimension_config.py +194 -0
- dsgrid/config/simple_models.py +49 -0
- dsgrid/config/supplemental_dimension.py +29 -0
- dsgrid/config/time_dimension_base_config.py +192 -0
- dsgrid/data_models.py +155 -0
- dsgrid/dataset/__init__.py +0 -0
- dsgrid/dataset/dataset.py +123 -0
- dsgrid/dataset/dataset_expression_handler.py +86 -0
- dsgrid/dataset/dataset_mapping_manager.py +121 -0
- dsgrid/dataset/dataset_schema_handler_base.py +945 -0
- dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
- dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
- dsgrid/dataset/growth_rates.py +162 -0
- dsgrid/dataset/models.py +51 -0
- dsgrid/dataset/table_format_handler_base.py +257 -0
- dsgrid/dataset/table_format_handler_factory.py +17 -0
- dsgrid/dataset/unpivoted_table.py +121 -0
- dsgrid/dimension/__init__.py +0 -0
- dsgrid/dimension/base_models.py +230 -0
- dsgrid/dimension/dimension_filters.py +308 -0
- dsgrid/dimension/standard.py +252 -0
- dsgrid/dimension/time.py +352 -0
- dsgrid/dimension/time_utils.py +103 -0
- dsgrid/dsgrid_rc.py +88 -0
- dsgrid/exceptions.py +105 -0
- dsgrid/filesystem/__init__.py +0 -0
- dsgrid/filesystem/cloud_filesystem.py +32 -0
- dsgrid/filesystem/factory.py +32 -0
- dsgrid/filesystem/filesystem_interface.py +136 -0
- dsgrid/filesystem/local_filesystem.py +74 -0
- dsgrid/filesystem/s3_filesystem.py +118 -0
- dsgrid/loggers.py +132 -0
- dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
- dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
- dsgrid/notebooks/registration.ipynb +48 -0
- dsgrid/notebooks/start_notebook.sh +11 -0
- dsgrid/project.py +451 -0
- dsgrid/query/__init__.py +0 -0
- dsgrid/query/dataset_mapping_plan.py +142 -0
- dsgrid/query/derived_dataset.py +388 -0
- dsgrid/query/models.py +728 -0
- dsgrid/query/query_context.py +287 -0
- dsgrid/query/query_submitter.py +994 -0
- dsgrid/query/report_factory.py +19 -0
- dsgrid/query/report_peak_load.py +70 -0
- dsgrid/query/reports_base.py +20 -0
- dsgrid/registry/__init__.py +0 -0
- dsgrid/registry/bulk_register.py +165 -0
- dsgrid/registry/common.py +287 -0
- dsgrid/registry/config_update_checker_base.py +63 -0
- dsgrid/registry/data_store_factory.py +34 -0
- dsgrid/registry/data_store_interface.py +74 -0
- dsgrid/registry/dataset_config_generator.py +158 -0
- dsgrid/registry/dataset_registry_manager.py +950 -0
- dsgrid/registry/dataset_update_checker.py +16 -0
- dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
- dsgrid/registry/dimension_mapping_update_checker.py +16 -0
- dsgrid/registry/dimension_registry_manager.py +413 -0
- dsgrid/registry/dimension_update_checker.py +16 -0
- dsgrid/registry/duckdb_data_store.py +207 -0
- dsgrid/registry/filesystem_data_store.py +150 -0
- dsgrid/registry/filter_registry_manager.py +123 -0
- dsgrid/registry/project_config_generator.py +57 -0
- dsgrid/registry/project_registry_manager.py +1623 -0
- dsgrid/registry/project_update_checker.py +48 -0
- dsgrid/registry/registration_context.py +223 -0
- dsgrid/registry/registry_auto_updater.py +316 -0
- dsgrid/registry/registry_database.py +667 -0
- dsgrid/registry/registry_interface.py +446 -0
- dsgrid/registry/registry_manager.py +558 -0
- dsgrid/registry/registry_manager_base.py +367 -0
- dsgrid/registry/versioning.py +92 -0
- dsgrid/rust_ext/__init__.py +14 -0
- dsgrid/rust_ext/find_minimal_patterns.py +129 -0
- dsgrid/spark/__init__.py +0 -0
- dsgrid/spark/functions.py +589 -0
- dsgrid/spark/types.py +110 -0
- dsgrid/tests/__init__.py +0 -0
- dsgrid/tests/common.py +140 -0
- dsgrid/tests/make_us_data_registry.py +265 -0
- dsgrid/tests/register_derived_datasets.py +103 -0
- dsgrid/tests/utils.py +25 -0
- dsgrid/time/__init__.py +0 -0
- dsgrid/time/time_conversions.py +80 -0
- dsgrid/time/types.py +67 -0
- dsgrid/units/__init__.py +0 -0
- dsgrid/units/constants.py +113 -0
- dsgrid/units/convert.py +71 -0
- dsgrid/units/energy.py +145 -0
- dsgrid/units/power.py +87 -0
- dsgrid/utils/__init__.py +0 -0
- dsgrid/utils/dataset.py +830 -0
- dsgrid/utils/files.py +179 -0
- dsgrid/utils/filters.py +125 -0
- dsgrid/utils/id_remappings.py +100 -0
- dsgrid/utils/py_expression_eval/LICENSE +19 -0
- dsgrid/utils/py_expression_eval/README.md +8 -0
- dsgrid/utils/py_expression_eval/__init__.py +847 -0
- dsgrid/utils/py_expression_eval/tests.py +283 -0
- dsgrid/utils/run_command.py +70 -0
- dsgrid/utils/scratch_dir_context.py +65 -0
- dsgrid/utils/spark.py +918 -0
- dsgrid/utils/spark_partition.py +98 -0
- dsgrid/utils/timing.py +239 -0
- dsgrid/utils/utilities.py +221 -0
- dsgrid/utils/versioning.py +36 -0
- dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
- dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
- dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
- dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
- dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Constructs a report class from a type."""
|
|
2
|
+
|
|
3
|
+
from dsgrid.query.models import ReportType
|
|
4
|
+
from dsgrid.query.report_peak_load import PeakLoadReport
|
|
5
|
+
from dsgrid.query.reports_base import ReportsBase
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Dispatch table mapping each supported ReportType to the class that
# implements it. make_report looks reports up here.
_TYPE_TO_CLASS = {
    ReportType.PEAK_LOAD: PeakLoadReport,
}
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def make_report(report_type: ReportType) -> ReportsBase:
    """Construct a report instance for the given report type.

    Parameters
    ----------
    report_type : ReportType

    Returns
    -------
    ReportsBase

    Raises
    ------
    NotImplementedError
        Raised if no report class is implemented for report_type.
    """
    cls = _TYPE_TO_CLASS.get(report_type)
    if cls is None:
        # Include context in the message; the original raised with only
        # str(report_type), which gave the user no hint of what failed.
        msg = f"No report class is implemented for {report_type=}"
        raise NotImplementedError(msg)
    return cls()
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from dsgrid.common import VALUE_COLUMN
|
|
5
|
+
from dsgrid.data_models import DSGBaseModel
|
|
6
|
+
from dsgrid.dataset.models import ValueFormat
|
|
7
|
+
from dsgrid.dimension.base_models import DimensionType
|
|
8
|
+
from dsgrid.exceptions import DSGInvalidQuery
|
|
9
|
+
from dsgrid.query.models import ProjectQueryModel
|
|
10
|
+
from dsgrid.spark.functions import join_multiple_columns
|
|
11
|
+
from dsgrid.spark.types import F
|
|
12
|
+
from dsgrid.utils.dataset import ordered_subset_columns
|
|
13
|
+
from dsgrid.utils.files import delete_if_exists
|
|
14
|
+
from dsgrid.utils.spark import read_dataframe
|
|
15
|
+
from .query_context import QueryContext
|
|
16
|
+
from .reports_base import ReportsBase
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class PeakLoadInputModel(DSGBaseModel):
    """User inputs for PeakLoadReport.generate."""

    # Column names to group by when computing the per-group maximum value.
    group_by_columns: list[str]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class PeakLoadReport(ReportsBase):
    """Find peak load in a derived dataset."""

    REPORT_FILENAME = "peak_load.parquet"

    def check_query(self, query: ProjectQueryModel) -> None:
        """Raise DSGInvalidQuery unless the query produces a stacked table."""
        if query.result.table_format.format_type != ValueFormat.STACKED:
            msg = "The PeakLoadReport requires the value format to be stacked."
            raise DSGInvalidQuery(msg)

    def generate(
        self,
        filename: Path,
        output_dir: Path,
        context: QueryContext,
        inputs: PeakLoadInputModel,
    ) -> Path:
        """Write a Parquet file containing the max value per group, joined
        back to the source rows to recover the timestamps of each peak.

        Returns the path to the written report file.
        """
        value_columns = [VALUE_COLUMN]
        metric_columns = context.get_dimension_column_names(DimensionType.METRIC)
        # Exactly one metric column is expected. Checking != 1 (instead of the
        # original > 1) also catches an empty set, which previously slipped
        # through and raised StopIteration at next(iter(...)) below.
        if len(metric_columns) != 1:
            msg = f"Bug: {metric_columns=}"
            raise Exception(msg)
        metric_column = next(iter(metric_columns))
        # Copy so the caller's input model is not mutated.
        group_by_columns = inputs.group_by_columns[:]
        if metric_column not in group_by_columns:
            group_by_columns.append(metric_column)

        df = read_dataframe(filename)
        expr = [F.max(x).alias(x) for x in value_columns]
        peak_load = df.groupBy(*group_by_columns).agg(*expr)
        join_cols = group_by_columns + value_columns
        time_columns = context.get_dimension_column_names(DimensionType.TIME)
        diff = time_columns.difference(df.columns)
        if diff:
            msg = f"BUG: expected time column(s) {diff} are not present in table"
            raise Exception(msg)
        # Join the per-group maxima back to the original rows so the report
        # includes the time(s) at which each peak occurred.
        columns = ordered_subset_columns(df, time_columns) + join_cols
        with_time = join_multiple_columns(peak_load, df.select(*columns), join_cols).sort(
            *group_by_columns
        )
        output_file = output_dir / PeakLoadReport.REPORT_FILENAME
        delete_if_exists(output_file)
        with_time.write.parquet(str(output_file))
        logger.info("Wrote Peak Load Report to %s", output_file)
        return output_file
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from dsgrid.query.models import ProjectQueryModel
|
|
6
|
+
from dsgrid.query.query_context import QueryContext
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ReportsBase(abc.ABC):
    """Base class for pre-defined reports"""

    @abc.abstractmethod
    def check_query(self, query: ProjectQueryModel) -> None:
        """Check compatibility of the user query with the report.

        Implementations raise an exception if the query cannot support the
        report (e.g., an incompatible table format).
        """

    @abc.abstractmethod
    def generate(
        self, filename: Path, output_dir: Path, context: QueryContext, inputs: Any
    ) -> Path:
        """Generate the report from the table stored in filename into
        output_dir, returning the path to the generated report file.
        """
|
|
File without changes
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
import getpass
|
|
2
|
+
import logging
|
|
3
|
+
import shutil
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from uuid import uuid4
|
|
6
|
+
|
|
7
|
+
from dsgrid.config.registration_models import RegistrationModel, RegistrationJournal
|
|
8
|
+
from dsgrid.dimension.base_models import DatasetDimensionRequirements
|
|
9
|
+
from dsgrid.registry.registry_manager import RegistryManager
|
|
10
|
+
from dsgrid.utils.id_remappings import (
|
|
11
|
+
map_dimension_ids_to_names,
|
|
12
|
+
map_dimension_names_to_ids,
|
|
13
|
+
map_dimension_mapping_names_to_ids,
|
|
14
|
+
replace_dimension_mapping_names_with_current_ids,
|
|
15
|
+
replace_dimension_names_with_current_ids,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def bulk_register(
    registry_manager: RegistryManager,
    registration_file: Path,
    data_base_dir: Path | None = None,
    missing_associations_base_dir: Path | None = None,
    repo_base_dir: Path | None = None,
    journal_file: Path | None = None,
    dataset_dimension_requirements: DatasetDimensionRequirements | None = None,
):
    """Bulk register projects, datasets, and their dimensions. If any failure occurs, the code
    records successfully registered project and dataset IDs to a journal file and prints its
    filename to the console. Users can pass that filename with the --journal-file option to
    avoid re-registering those projects and datasets on subsequent attempts.

    The JSON/JSON5 filename must match the data model defined by this documentation:

    https://dsgrid.github.io/dsgrid/reference/data_models/project.html#dsgrid.config.registration_models.RegistrationModel

    Parameters
    ----------
    registry_manager : RegistryManager
        Registry to register the components into.
    registration_file : Path
        JSON/JSON5 file matching RegistrationModel.
    data_base_dir : Path | None
        Forwarded to the dataset manager's register call.
    missing_associations_base_dir : Path | None
        Forwarded to the dataset manager's register call.
    repo_base_dir : Path | None
        If set, relative config-file paths in the registration are resolved
        against this directory.
    journal_file : Path | None
        Journal from a previous failed attempt; entries in it are skipped.
        If None, a uniquely-named journal file is created on failure.
    dataset_dimension_requirements : DatasetDimensionRequirements | None
        Forwarded to the dataset manager's register call.
    """
    registration = RegistrationModel.from_file(registration_file)
    # Temporary config-file copies created during registration; removed in the
    # finally block below.
    tmp_files = []
    if journal_file is None:
        # First attempt: start a fresh journal under a unique name.
        journal_file = Path(f"journal__{uuid4()}.json5")
        journal = RegistrationJournal()
    else:
        # Retry: drop everything already recorded as registered.
        journal = RegistrationJournal.from_file(journal_file)
        registration = registration.filter_by_journal(journal)
    failure_occurred = False
    try:
        return _run_bulk_registration(
            registry_manager,
            registration,
            tmp_files,
            data_base_dir,
            missing_associations_base_dir,
            repo_base_dir,
            journal,
            dataset_dimension_requirements,
        )
    except Exception:
        # Flag so the finally block knows whether to persist the journal.
        failure_occurred = True
        raise
    finally:
        if failure_occurred and journal.has_entries():
            journal_file.write_text(journal.model_dump_json(indent=2), encoding="utf-8")
            logger.info(
                "Recorded successfully registered projects and datasets to %s. "
                "Pass this file to the `--journal-file` option of this command to skip those IDs "
                "on subsequent attempts.",
                journal_file,
            )
        elif journal_file.exists():
            # Success (or failure with nothing registered): the journal from a
            # previous run is no longer needed.
            journal_file.unlink()
            logger.info("Deleted journal file %s after successful registration.", journal_file)
        for path in tmp_files:
            path.unlink()
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _run_bulk_registration(
    mgr: RegistryManager,
    registration: RegistrationModel,
    tmp_files: list[Path],
    data_base_dir: Path | None,
    missing_associations_base_dir: Path | None,
    base_repo_dir: Path | None,
    journal: RegistrationJournal,
    dataset_dimension_requirements: DatasetDimensionRequirements | None,
):
    """Perform the registrations described by registration.

    Each successful registration is recorded in journal. Temporary config-file
    copies are appended to tmp_files; the caller is responsible for deleting
    them.
    """
    user = getpass.getuser()
    project_mgr = mgr.project_manager
    dataset_mgr = mgr.dataset_manager
    dim_mgr = mgr.dimension_manager
    dim_mapping_mgr = mgr.dimension_mapping_manager

    # Resolve relative config paths against the repository base directory.
    if base_repo_dir is not None:
        for project in registration.projects:
            if not project.config_file.is_absolute():
                project.config_file = base_repo_dir / project.config_file
        for dataset in registration.datasets:
            if not dataset.config_file.is_absolute():
                dataset.config_file = base_repo_dir / dataset.config_file
        for dataset in registration.dataset_submissions:
            for field in (
                "dimension_mapping_file",
                "dimension_mapping_references_file",
            ):
                path = getattr(dataset, field)
                if path is not None and not path.is_absolute():
                    setattr(dataset, field, base_repo_dir / path)

    for project in registration.projects:
        assert project.log_message is not None
        project_mgr.register(project.config_file, user, project.log_message)
        journal.add_project(project.project_id)

    for dataset in registration.datasets:
        config_file = None
        if dataset.replace_dimension_names_with_ids:
            mappings = map_dimension_names_to_ids(dim_mgr)
            orig = dataset.config_file
            # Bug fix: use orig.stem, not orig.name. with_stem re-appends the
            # suffix, so the original code produced temp names like
            # "config.json5__tmp.json5".
            config_file = orig.with_stem(orig.stem + "__tmp")
            shutil.copyfile(orig, config_file)
            tmp_files.append(config_file)
            replace_dimension_names_with_current_ids(config_file, mappings)
        else:
            config_file = dataset.config_file

        assert dataset.log_message is not None
        dataset_mgr.register(
            config_file,
            user,
            dataset.log_message,
            data_base_dir=data_base_dir,
            missing_associations_base_dir=missing_associations_base_dir,
            requirements=dataset_dimension_requirements,
        )
        journal.add_dataset(dataset.dataset_id)

    for dataset in registration.dataset_submissions:
        refs_file = None
        if (
            dataset.replace_dimension_mapping_names_with_ids
            and dataset.dimension_mapping_references_file is not None
        ):
            dim_id_to_name = map_dimension_ids_to_names(mgr.dimension_manager)
            mappings = map_dimension_mapping_names_to_ids(dim_mapping_mgr, dim_id_to_name)
            orig = dataset.dimension_mapping_references_file
            # Same stem fix as above.
            refs_file = orig.with_stem(orig.stem + "__tmp")
            shutil.copyfile(orig, refs_file)
            tmp_files.append(refs_file)
            replace_dimension_mapping_names_with_current_ids(refs_file, mappings)
        else:
            refs_file = dataset.dimension_mapping_references_file

        assert dataset.log_message is not None
        project_mgr.submit_dataset(
            dataset.project_id,
            dataset.dataset_id,
            user,
            dataset.log_message,
            dimension_mapping_file=dataset.dimension_mapping_file,
            dimension_mapping_references_file=refs_file,
            autogen_reverse_supplemental_mappings=dataset.autogen_reverse_supplemental_mappings,
        )
        journal.add_submitted_dataset(dataset.dataset_id, dataset.project_id)
|
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
"""Common definitions for registry components"""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import re
|
|
5
|
+
from collections import namedtuple
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from enum import StrEnum
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
from pydantic import Field
|
|
12
|
+
|
|
13
|
+
from dsgrid.data_models import DSGBaseModel
|
|
14
|
+
from dsgrid.exceptions import DSGInvalidParameter
|
|
15
|
+
from dsgrid.utils.versioning import make_version
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
REGISTRY_LOG_FILE = "dsgrid_registry.log"
# Allows letters, numbers, underscores, spaces, dashes
REGEX_VALID_REGISTRY_NAME = re.compile(r"^[\w -]+$")
# Allows letters, numbers, underscores, dashes, and forward slashes.
# (The original comment omitted the slash that the pattern accepts.)
REGEX_VALID_REGISTRY_CONFIG_ID_LOOSE = re.compile(r"^[\w/-]+$")
# Allows letters, numbers, underscores.
# dataset_id cannot start with a number because of uses in DatasetExpressionHandler
# It's likely a good rule everywhere else.
# \w* (not \w+) so single-character IDs are accepted; the original pattern
# required at least two characters, contradicting the error message in
# check_config_id_strict.
REGEX_VALID_REGISTRY_CONFIG_ID_STRICT = re.compile(r"^[a-zA-Z]\w*$")

REGISTRY_ID_DELIMITER = "__"

logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def check_config_id_loose(config_id, tag):
    """Validate config_id against the loose registry ID rules.

    Raises ValueError rather than a dsgrid exception because this runs
    inside Pydantic model validators.
    """
    match = REGEX_VALID_REGISTRY_CONFIG_ID_LOOSE.search(config_id)
    if match is None:
        raise ValueError(
            f"{tag} ID={config_id} is invalid. Restricted to letters, numbers, underscores, and dashes."
        )
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def check_config_id_strict(config_id, tag):
    """Validate config_id against the strict registry ID rules.

    Raises ValueError rather than a dsgrid exception because this runs
    inside Pydantic model validators.
    """
    match = REGEX_VALID_REGISTRY_CONFIG_ID_STRICT.search(config_id)
    if match is None:
        raise ValueError(
            f"{tag} ID={config_id} is invalid. Restricted to letters, numbers, and underscores. "
            "Cannot start with a number."
        )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class DatabaseConnection(DSGBaseModel):
    """Input information to connect to a registry database.

    The registry is currently only supported in SQLite, so the connection is
    fully described by the URL. Server-oriented fields (hostname, port,
    credentials) can be added if/when Postgres support is implemented.
    (Removed the large block of commented-out server fields and the dead
    from_url classmethod that previously documented this.)
    """

    # Full database URL, e.g. "sqlite:////path/to/registry.db".
    url: str

    def get_filename(self) -> Path:
        """Return the filename from the URL. Only valid for SQLite databases.

        Raises
        ------
        DSGInvalidParameter
            Raised if the URL does not conform to the SQLite format.
        """
        # All call sites will need to be changed if/when we support Postgres.
        filename = self.try_get_filename()
        if filename is None:
            msg = (
                f"Failed to parse '{self.url}' into a SQLite URL. "
                "The SQLite file path must be specified in the format 'sqlite:///</path/to/db_file.db>'. "
            )
            raise DSGInvalidParameter(msg)
        return filename

    def try_get_filename(self) -> Path | None:
        """Return the filename from the URL, if file-based, otherwise None."""
        # re caches compiled patterns, so per-call search is fine here.
        match = re.search(r"sqlite:///(.*)", self.url)
        if not match:
            return None
        return Path(match.group(1))
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class DataStoreType(StrEnum):
    """Specifies the type of data store used for the registry."""

    # Data kept as plain files on a filesystem (see FilesystemDataStore).
    FILESYSTEM = "filesystem"
    # Data kept in a DuckDB database (see DuckDbDataStore).
    DUCKDB = "duckdb"
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class RegistryType(StrEnum):
    """Types of components tracked in the registry."""

    DATASET = "dataset"
    DIMENSION = "dimension"
    DIMENSION_MAPPING = "dimension_mapping"
    PROJECT = "project"
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# Name of the primary-ID field on the config model for each registry type.
MODEL_TYPE_TO_ID_FIELD_MAPPING = {
    RegistryType.PROJECT: "project_id",
    RegistryType.DATASET: "dataset_id",
    RegistryType.DIMENSION: "dimension_id",
    RegistryType.DIMENSION_MAPPING: "mapping_id",
}
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class RegistryTables(StrEnum):
    """Names of the tables used by the registry database."""

    KEY_VALUE = "key_value"
    CURRENT_VERSIONS = "current_versions"
    MODELS = "models"
    REGISTRATIONS = "registrations"
    CONTAINS = "contains"
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class DatasetRegistryStatus(StrEnum):
    """Statuses for a dataset within a project"""

    UNREGISTERED = "Unregistered"
    REGISTERED = "Registered"
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
class ProjectRegistryStatus(StrEnum):
    """Statuses for a project within the DSGRID registry"""

    INITIAL_REGISTRATION = "Initial Registration"
    IN_PROGRESS = "In Progress"
    COMPLETE = "Complete"
    PUBLISHED = "Published"
    DEPRECATED = "Deprecated"
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class VersionUpdateType(StrEnum):
    """Types of updates that can be made to projects, datasets, and dimensions.

    Follows semantic-versioning terminology.
    """

    # TODO: we need to find general version update types that can be mapped to
    # major, minor and patch.
    # i.e., replace input_dataset, fix project_config,
    MAJOR = "major"
    MINOR = "minor"
    PATCH = "patch"
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
# These keys are used to store references to project/dataset configs and dimensions
# in dictionaries. A ConfigKey identifies one specific version of a registered
# component.
ConfigKey = namedtuple("ConfigKey", ["id", "version"])

# Convenience container to be shared among the registry managers.
# Obviates the need to pass parameters to many constructors.
RegistryManagerParams = namedtuple(
    "RegistryManagerParams",
    [
        "base_path",
        "remote_path",
        "use_remote_data",
        "fs_interface",
        "cloud_interface",
        "offline",
        "scratch_dir",
    ],
)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
class RegistrationModel(DSGBaseModel):
    """Registration fields required by the ProjectConfig and DatasetConfig.

    NOTE(review): distinct from dsgrid.config.registration_models.RegistrationModel,
    which describes bulk-registration input files — confirm the name collision
    is intentional.
    """

    # None until the registration row has been stored in the database.
    id: int | None = Field(default=None, description="database ID of the registration")
    timestamp: datetime = Field(
        title="timestamp",
        description="Registration timestamp",
    )
    submitter: str = Field(
        title="submitter",
        description="Username that submitted the registration",
    )
    log_message: str | None = Field(
        default=None,
        title="log_message",
        description="Reason for the update",
    )
    update_type: VersionUpdateType = Field(
        title="update_type",
        description="Type of update",
    )
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def get_version_from_filename(filename):
    """Return the handle and version parsed from a registry filename.

    Parameters
    ----------
    filename : str
        e.g. "my_project-v1.2.3.json5" (the inverse of
        make_filename_from_version).

    Returns
    -------
    tuple
        (handle, version) where version is produced by make_version.
    """
    # Escape the dot before json5; the original pattern let '.' match any char.
    regex = re.compile(r"(?P<handle>\w+)-v(?P<version>[\d\.]+)\.json5")
    match = regex.search(filename)
    assert match, filename
    # Bug fix: the original called match.groupdict("handle") /
    # match.groupdict("version"), which return the dict of ALL named groups
    # (the argument is only a default for unmatched groups), not the group
    # values. match.group(...) returns the captured strings.
    return match.group("handle"), make_version(match.group("version"))
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def make_filename_from_version(handle, version):
    """Return the registry filename for the given handle and version."""
    return "{}-v{}.json5".format(handle, version)
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
# def update_version(id_handle, update, registry_path):
|
|
220
|
+
# """Determine registration or project version for registration.
|
|
221
|
+
#
|
|
222
|
+
# TODO: Current solution is a quick hack. This needs to be better/formalized.
|
|
223
|
+
# - Need smarter version updating / checks; use semvar packages
|
|
224
|
+
# - Set to work with some central version (like S3)
|
|
225
|
+
# - Currently only updating major version
|
|
226
|
+
# - NOTE: not currently utilitzing the update_type in
|
|
227
|
+
# RegistrationModel. Could use this to set
|
|
228
|
+
# major/minor/patch update decisiosns
|
|
229
|
+
#
|
|
230
|
+
# Args:
|
|
231
|
+
# registry_type (RegistryType): type of registry (e.g., Project, Dataset)
|
|
232
|
+
# id_handle (str): ID handle is either the project_id or dataset_id
|
|
233
|
+
# update (bool): config registration update setting
|
|
234
|
+
# """
|
|
235
|
+
#
|
|
236
|
+
# # TODO: remove when done. project path should be set somewhere else
|
|
237
|
+
# if not os.path.exists(registry_path):
|
|
238
|
+
# raise ValueError(f"Path does not exist: {registry_path}")
|
|
239
|
+
#
|
|
240
|
+
# # if config.update is False, then assume major=1, minor=0, patch=0
|
|
241
|
+
# if not update:
|
|
242
|
+
# version = VersionInfo(major=1)
|
|
243
|
+
# registry_file = Path(registry_path) / make_filename_from_version(id_handle, version)
|
|
244
|
+
# # Raise error if v1.0.0 registry exists for project_id
|
|
245
|
+
# if os.path.exists(registry_file):
|
|
246
|
+
# raise ValueError(
|
|
247
|
+
# f'{registry_type} registry for "{registry_file}" already '
|
|
248
|
+
# f"exists. If you want to update the project registration"
|
|
249
|
+
# f" with a new {registry_type} version, then you will need to"
|
|
250
|
+
# f" set update=True in {registry_type} config. Alternatively, "
|
|
251
|
+
# f"if you want to initiate a new dsgrid {registry_type}, you "
|
|
252
|
+
# "will need to specify a new version handle in the "
|
|
253
|
+
# f"{registry_type} config."
|
|
254
|
+
# )
|
|
255
|
+
# # if update is true...
|
|
256
|
+
# else:
|
|
257
|
+
# # list existing project registries
|
|
258
|
+
# existing_versions = []
|
|
259
|
+
# for f in os.listdir(registry_path):
|
|
260
|
+
# handle, version = get_version_from_filename(f)
|
|
261
|
+
# if handle == id_handle:
|
|
262
|
+
# existing_versions.append(version)
|
|
263
|
+
# # check for existing project registries
|
|
264
|
+
# if not existing_versions:
|
|
265
|
+
# raise ValueError(
|
|
266
|
+
# "Registration.update=True, however, no updates can be made "
|
|
267
|
+
# f"because there are no existing registries for {registry_type}"
|
|
268
|
+
# f" ID = {id_handle}. Check project_id or set "
|
|
269
|
+
# f"Registration.update=True in the {registry_type} Config."
|
|
270
|
+
# )
|
|
271
|
+
# # find the latest registry version
|
|
272
|
+
# # NOTE: this is currently based on major verison only
|
|
273
|
+
# last_version = sorted(existing_versions)[-1]
|
|
274
|
+
# old_project_version = make_filename_from_version(id_handle, last_version)
|
|
275
|
+
# old_registry_file = os.path.join(registry_path, old_project_version)
|
|
276
|
+
#
|
|
277
|
+
# # deprecate old project registry
|
|
278
|
+
# t = deserialize_registry(old_registry_file)
|
|
279
|
+
# # DT: Can we use an enum here? Spelling/capitalization mistakes could be costly.
|
|
280
|
+
# # Deprecated is a project status.
|
|
281
|
+
# t["status"] = "Deprecated"
|
|
282
|
+
# # DT: can we use version
|
|
283
|
+
# t["version"] = last_version.bump_major()
|
|
284
|
+
# # TODO: deserialize_registry should have returned a Pydantic model
|
|
285
|
+
# serialize_registry(t, make_filename_from_version(id_handle, t["version"]))
|
|
286
|
+
#
|
|
287
|
+
# return version
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
from dsgrid.exceptions import DSGInvalidOperation
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ConfigUpdateCheckerBase(abc.ABC):
    """Base class for updating all config models"""

    def __init__(self, old_model, new_model):
        """Store the old and new models to be compared.

        Both models must be instances of the same (Pydantic) model class.
        """
        self._old_model = old_model
        self._new_model = new_model
        assert type(self._old_model) is type(self._new_model)  # noqa: E721
        # Shared model class; its model_fields drives _check_common.
        self._type = type(self._old_model)
        # Names of fields whose values differ between old and new.
        self._changed_fields = set()

    def _check_common(self):
        # Compare every declared field; reject changes to fields marked
        # updateable=False in their json_schema_extra.
        for field, attrs in self._type.model_fields.items():
            old = getattr(self._old_model, field)
            new = getattr(self._new_model, field)
            if old != new:
                extra = attrs.json_schema_extra
                if extra and not extra.get("updateable", True):
                    msg = f"{self._type}.{field} cannot be updated"
                    raise DSGInvalidOperation(msg)
                self._changed_fields.add(field)
                logger.info("%s %s changed from %s to %s.", self._type, field, old, new)
        # FUTURE: We could recurse into each dsgrid pydantic model and check each individual
        # field. Would also need to handle lists and dicts of models.
        # This would allow more precise control of changed fields and much better logging.

    @abc.abstractmethod
    def check_preconditions(self):
        """Check preconditions for performing an update.

        Raises
        ------
        DSGInvalidRegistryState
            Raised if a precondition is violated.

        """

    @abc.abstractmethod
    def handle_postconditions(self):
        """Handle any required postconditions."""

    def run(self):
        """Run all checks.

        Raises
        ------
        DSGInvalidOperation
            Raised if the user is changing an immutable field.
        DSGInvalidRegistryState
            Raised if a precondition is violated.

        """
        self.check_preconditions()
        self._check_common()
        self.handle_postconditions()
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from dsgrid.registry.common import DataStoreType
|
|
4
|
+
from dsgrid.registry.data_store_interface import DataStoreInterface
|
|
5
|
+
from dsgrid.registry.duckdb_data_store import DuckDbDataStore
|
|
6
|
+
from dsgrid.registry.filesystem_data_store import FilesystemDataStore
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def make_data_store(
    base_path: Path, data_store_type: DataStoreType, initialize: bool = False
) -> DataStoreInterface:
    """Factory function to create a data store.

    Parameters
    ----------
    base_path : Path
        The base path for the data store.
    data_store_type : DataStoreType
        The type of data store to create.
    initialize : bool
        Whether to initialize the data store.
    """
    store_classes = {
        DataStoreType.FILESYSTEM: FilesystemDataStore,
        DataStoreType.DUCKDB: DuckDbDataStore,
    }
    store_cls = store_classes.get(data_store_type)
    if store_cls is None:
        msg = f"Unsupported data store type: {data_store_type}"
        raise NotImplementedError(msg)

    return store_cls.create(base_path) if initialize else store_cls.load(base_path)
|