dsgrid_toolkit-0.3.3-cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backend.py +93 -0
- dsgrid/__init__.py +22 -0
- dsgrid/api/__init__.py +0 -0
- dsgrid/api/api_manager.py +179 -0
- dsgrid/api/app.py +419 -0
- dsgrid/api/models.py +60 -0
- dsgrid/api/response_models.py +116 -0
- dsgrid/apps/__init__.py +0 -0
- dsgrid/apps/project_viewer/app.py +216 -0
- dsgrid/apps/registration_gui.py +444 -0
- dsgrid/chronify.py +32 -0
- dsgrid/cli/__init__.py +0 -0
- dsgrid/cli/common.py +120 -0
- dsgrid/cli/config.py +176 -0
- dsgrid/cli/download.py +13 -0
- dsgrid/cli/dsgrid.py +157 -0
- dsgrid/cli/dsgrid_admin.py +92 -0
- dsgrid/cli/install_notebooks.py +62 -0
- dsgrid/cli/query.py +729 -0
- dsgrid/cli/registry.py +1862 -0
- dsgrid/cloud/__init__.py +0 -0
- dsgrid/cloud/cloud_storage_interface.py +140 -0
- dsgrid/cloud/factory.py +31 -0
- dsgrid/cloud/fake_storage_interface.py +37 -0
- dsgrid/cloud/s3_storage_interface.py +156 -0
- dsgrid/common.py +36 -0
- dsgrid/config/__init__.py +0 -0
- dsgrid/config/annual_time_dimension_config.py +194 -0
- dsgrid/config/common.py +142 -0
- dsgrid/config/config_base.py +148 -0
- dsgrid/config/dataset_config.py +907 -0
- dsgrid/config/dataset_schema_handler_factory.py +46 -0
- dsgrid/config/date_time_dimension_config.py +136 -0
- dsgrid/config/dimension_config.py +54 -0
- dsgrid/config/dimension_config_factory.py +65 -0
- dsgrid/config/dimension_mapping_base.py +350 -0
- dsgrid/config/dimension_mappings_config.py +48 -0
- dsgrid/config/dimensions.py +1025 -0
- dsgrid/config/dimensions_config.py +71 -0
- dsgrid/config/file_schema.py +190 -0
- dsgrid/config/index_time_dimension_config.py +80 -0
- dsgrid/config/input_dataset_requirements.py +31 -0
- dsgrid/config/mapping_tables.py +209 -0
- dsgrid/config/noop_time_dimension_config.py +42 -0
- dsgrid/config/project_config.py +1462 -0
- dsgrid/config/registration_models.py +188 -0
- dsgrid/config/representative_period_time_dimension_config.py +194 -0
- dsgrid/config/simple_models.py +49 -0
- dsgrid/config/supplemental_dimension.py +29 -0
- dsgrid/config/time_dimension_base_config.py +192 -0
- dsgrid/data_models.py +155 -0
- dsgrid/dataset/__init__.py +0 -0
- dsgrid/dataset/dataset.py +123 -0
- dsgrid/dataset/dataset_expression_handler.py +86 -0
- dsgrid/dataset/dataset_mapping_manager.py +121 -0
- dsgrid/dataset/dataset_schema_handler_base.py +945 -0
- dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
- dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
- dsgrid/dataset/growth_rates.py +162 -0
- dsgrid/dataset/models.py +51 -0
- dsgrid/dataset/table_format_handler_base.py +257 -0
- dsgrid/dataset/table_format_handler_factory.py +17 -0
- dsgrid/dataset/unpivoted_table.py +121 -0
- dsgrid/dimension/__init__.py +0 -0
- dsgrid/dimension/base_models.py +230 -0
- dsgrid/dimension/dimension_filters.py +308 -0
- dsgrid/dimension/standard.py +252 -0
- dsgrid/dimension/time.py +352 -0
- dsgrid/dimension/time_utils.py +103 -0
- dsgrid/dsgrid_rc.py +88 -0
- dsgrid/exceptions.py +105 -0
- dsgrid/filesystem/__init__.py +0 -0
- dsgrid/filesystem/cloud_filesystem.py +32 -0
- dsgrid/filesystem/factory.py +32 -0
- dsgrid/filesystem/filesystem_interface.py +136 -0
- dsgrid/filesystem/local_filesystem.py +74 -0
- dsgrid/filesystem/s3_filesystem.py +118 -0
- dsgrid/loggers.py +132 -0
- dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
- dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
- dsgrid/notebooks/registration.ipynb +48 -0
- dsgrid/notebooks/start_notebook.sh +11 -0
- dsgrid/project.py +451 -0
- dsgrid/query/__init__.py +0 -0
- dsgrid/query/dataset_mapping_plan.py +142 -0
- dsgrid/query/derived_dataset.py +388 -0
- dsgrid/query/models.py +728 -0
- dsgrid/query/query_context.py +287 -0
- dsgrid/query/query_submitter.py +994 -0
- dsgrid/query/report_factory.py +19 -0
- dsgrid/query/report_peak_load.py +70 -0
- dsgrid/query/reports_base.py +20 -0
- dsgrid/registry/__init__.py +0 -0
- dsgrid/registry/bulk_register.py +165 -0
- dsgrid/registry/common.py +287 -0
- dsgrid/registry/config_update_checker_base.py +63 -0
- dsgrid/registry/data_store_factory.py +34 -0
- dsgrid/registry/data_store_interface.py +74 -0
- dsgrid/registry/dataset_config_generator.py +158 -0
- dsgrid/registry/dataset_registry_manager.py +950 -0
- dsgrid/registry/dataset_update_checker.py +16 -0
- dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
- dsgrid/registry/dimension_mapping_update_checker.py +16 -0
- dsgrid/registry/dimension_registry_manager.py +413 -0
- dsgrid/registry/dimension_update_checker.py +16 -0
- dsgrid/registry/duckdb_data_store.py +207 -0
- dsgrid/registry/filesystem_data_store.py +150 -0
- dsgrid/registry/filter_registry_manager.py +123 -0
- dsgrid/registry/project_config_generator.py +57 -0
- dsgrid/registry/project_registry_manager.py +1623 -0
- dsgrid/registry/project_update_checker.py +48 -0
- dsgrid/registry/registration_context.py +223 -0
- dsgrid/registry/registry_auto_updater.py +316 -0
- dsgrid/registry/registry_database.py +667 -0
- dsgrid/registry/registry_interface.py +446 -0
- dsgrid/registry/registry_manager.py +558 -0
- dsgrid/registry/registry_manager_base.py +367 -0
- dsgrid/registry/versioning.py +92 -0
- dsgrid/rust_ext/__init__.py +14 -0
- dsgrid/rust_ext/find_minimal_patterns.py +129 -0
- dsgrid/spark/__init__.py +0 -0
- dsgrid/spark/functions.py +589 -0
- dsgrid/spark/types.py +110 -0
- dsgrid/tests/__init__.py +0 -0
- dsgrid/tests/common.py +140 -0
- dsgrid/tests/make_us_data_registry.py +265 -0
- dsgrid/tests/register_derived_datasets.py +103 -0
- dsgrid/tests/utils.py +25 -0
- dsgrid/time/__init__.py +0 -0
- dsgrid/time/time_conversions.py +80 -0
- dsgrid/time/types.py +67 -0
- dsgrid/units/__init__.py +0 -0
- dsgrid/units/constants.py +113 -0
- dsgrid/units/convert.py +71 -0
- dsgrid/units/energy.py +145 -0
- dsgrid/units/power.py +87 -0
- dsgrid/utils/__init__.py +0 -0
- dsgrid/utils/dataset.py +830 -0
- dsgrid/utils/files.py +179 -0
- dsgrid/utils/filters.py +125 -0
- dsgrid/utils/id_remappings.py +100 -0
- dsgrid/utils/py_expression_eval/LICENSE +19 -0
- dsgrid/utils/py_expression_eval/README.md +8 -0
- dsgrid/utils/py_expression_eval/__init__.py +847 -0
- dsgrid/utils/py_expression_eval/tests.py +283 -0
- dsgrid/utils/run_command.py +70 -0
- dsgrid/utils/scratch_dir_context.py +65 -0
- dsgrid/utils/spark.py +918 -0
- dsgrid/utils/spark_partition.py +98 -0
- dsgrid/utils/timing.py +239 -0
- dsgrid/utils/utilities.py +221 -0
- dsgrid/utils/versioning.py +36 -0
- dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
- dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
- dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
- dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
- dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
dsgrid/config/dimensions_config.py
@@ -0,0 +1,71 @@
+import logging
+from pathlib import Path
+
+from pydantic import field_validator, Field
+
+from dsgrid.data_models import DSGBaseModel
+from dsgrid.utils.utilities import check_uniqueness
+from .config_base import ConfigBase
+from .dimensions import DimensionModel, DimensionsListModel
+
+logger = logging.getLogger(__name__)
+
+
+class DimensionsConfigModel(DSGBaseModel):
+    """Represents multiple dimension models.
+
+    Used when registering multiple dimensions in one command.
+    """
+
+    dimensions: DimensionsListModel = Field(
+        title="dimensions",
+        description="Dimensions for submission to the dimension registry",
+    )
+
+    @field_validator("dimensions")
+    @classmethod
+    def check_files(cls, values: dict) -> dict:
+        """Validate dimension files are unique across all dimensions"""
+        check_uniqueness(
+            (x.filename for x in values if isinstance(x, DimensionModel) and x.filename),
+            "dimension record filename",
+        )
+        return values
+
+    @field_validator("dimensions")
+    @classmethod
+    def check_names(cls, values: dict) -> dict:
+        """Validate dimension names are unique across all dimensions."""
+        check_uniqueness(
+            [dim.name for dim in values],
+            "dimension record name",
+        )
+        return values
+
+
+class DimensionsConfig(ConfigBase):
+    """Provides an interface to a DimensionsConfigModel."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._src_dir = None
+
+    @staticmethod
+    def config_filename():
+        return "dimensions.json5"
+
+    @property
+    def config_id(self):
+        assert False, "not correct for this class"
+
+    @staticmethod
+    def model_class():
+        return DimensionsConfigModel
+
+    @classmethod
+    def load(cls, config_filename: Path, *args, **kwargs):
+        return super().load(config_filename, *args, **kwargs)
+
+    @classmethod
+    def load_from_model(cls, model):
+        return cls(model)
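The two validators above use check_uniqueness from dsgrid.utils.utilities to reject duplicate record filenames and dimension names. The following self-contained sketch mirrors that pattern; _Dimension, _DimensionsModel, and the local check_uniqueness are stand-ins for illustration, not dsgrid's implementation.

from pydantic import BaseModel, field_validator


def check_uniqueness(values, tag: str) -> None:
    # Stand-in for dsgrid.utils.utilities.check_uniqueness (assumed behavior).
    seen = set()
    for val in values:
        if val in seen:
            raise ValueError(f"duplicate {tag}: {val}")
        seen.add(val)


class _Dimension(BaseModel):
    # Hypothetical minimal dimension record.
    name: str
    filename: str | None = None


class _DimensionsModel(BaseModel):
    dimensions: list[_Dimension]

    @field_validator("dimensions")
    @classmethod
    def check_names(cls, values):
        check_uniqueness((d.name for d in values), "dimension record name")
        return values


_DimensionsModel(dimensions=[{"name": "county"}, {"name": "sector"}])   # passes
# _DimensionsModel(dimensions=[{"name": "county"}, {"name": "county"}])  # raises ValueError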
dsgrid/config/file_schema.py
@@ -0,0 +1,190 @@
+import logging
+from pathlib import Path
+from typing import Self
+
+from pydantic import Field, field_validator, model_validator
+
+from dsgrid.data_models import DSGBaseModel
+from dsgrid.dimension.base_models import DimensionType
+from dsgrid.exceptions import DSGInvalidDataset, DSGInvalidField
+from dsgrid.spark.functions import read_csv_duckdb, read_json, read_parquet
+from dsgrid.spark.types import DataFrame, DUCKDB_COLUMN_TYPES, SUPPORTED_TYPES
+from dsgrid.utils.scratch_dir_context import ScratchDirContext
+from dsgrid.utils.spark import write_dataframe
+from dsgrid.utils.utilities import check_uniqueness
+
+
+logger = logging.getLogger(__name__)
+
+
+class Column(DSGBaseModel):
+    name: str = Field(description="Name of the column")
+    dimension_type: DimensionType | None = Field(
+        default=None,
+        description="Dimension represented by the data in the column. Optional if this is a "
+        "time column or pivoted column. Required if the column represents a stacked dimension "
+        "but an alternate name is being used, such as 'county' instead of 'geography'. "
+        "dsgrid will rename any column that is set at runtime, writing out the result to the "
+        "registry's data directory. The original dataset is not modified.",
+    )
+    data_type: str | None = Field(
+        default=None, description="Type of the data in the column. If None, infer the type."
+    )
+
+    @field_validator("data_type")
+    @classmethod
+    def check_data_type(cls, data_type: str | None) -> str | None:
+        if data_type is None:
+            return None
+
+        type_upper = data_type.upper()
+        if type_upper not in SUPPORTED_TYPES:
+            supported_data_types = sorted(SUPPORTED_TYPES)
+            msg = f"{data_type=} is not one of {supported_data_types=}"
+            raise ValueError(msg)
+        return type_upper
+
+
+class FileSchema(DSGBaseModel):
+    """Defines the format of a data file (CSV, JSON, Parquet)."""
+
+    path: str | None = Field(description="Path to the file. Must be assigned during registration.")
+    columns: list[Column] = Field(
+        default=[], description="Custom schema for the columns in the file."
+    )
+    ignore_columns: list[str] = Field(
+        default=[],
+        description="List of column names to ignore (drop) when reading the file.",
+    )
+
+    @model_validator(mode="after")
+    def check_consistency(self) -> Self:
+        if len(self.columns) > 1:
+            check_uniqueness((x.name for x in self.columns), "column names")
+
+        # Check that ignore_columns don't overlap with columns
+        column_names = {x.name for x in self.columns}
+        ignore_set = set(self.ignore_columns)
+        overlap = column_names & ignore_set
+        if overlap:
+            msg = f"Columns cannot be in both 'columns' and 'ignore_columns': {overlap}"
+            raise ValueError(msg)
+
+        return self
+
+    def get_data_type_mapping(self) -> dict[str, str]:
+        """Return the mapping of column to data type."""
+        return {x.name: x.data_type for x in self.columns if x.data_type is not None}
+
+
+def read_data_file(
+    schema: FileSchema, scratch_dir_context: ScratchDirContext | None = None
+) -> DataFrame:
+    """Read a data file from a schema.
+
+    Parameters
+    ----------
+    schema : FileSchema
+        Schema defining the file path and column types.
+    scratch_dir_context : ScratchDirContext
+        Optional location to write temporary files.
+
+    Returns
+    -------
+    DataFrame
+        A Spark DataFrame containing the file data.
+    """
+    if schema.path is None:
+        msg = "File path is not assigned"
+        raise DSGInvalidDataset(msg)
+
+    path = Path(schema.path)
+    if not path.exists():
+        msg = f"{path} does not exist"
+        raise FileNotFoundError(msg)
+
+    expected_columns = {x.name for x in schema.columns}
+
+    match path.suffix:
+        case ".parquet":
+            df = read_parquet(path)
+        case ".csv":
+            column_schema = _get_column_schema(schema, DUCKDB_COLUMN_TYPES)
+            df = read_csv_duckdb(path, schema=column_schema)
+        case ".json":
+            df = read_json(path)
+        case _:
+            msg = f"Unsupported file type: {path.suffix}"
+            raise DSGInvalidDataset(msg)
+
+    actual_columns = set(df.columns)
+    diff = expected_columns.difference(actual_columns)
+    if diff:
+        msg = f"Expected columns {diff} are not in {actual_columns=}"
+        raise DSGInvalidDataset(msg)
+
+    df = _drop_ignored_columns(df, schema.ignore_columns)
+    renames = _get_column_renames(schema)
+    if renames:
+        df = _rename_columns(df, renames)
+        if scratch_dir_context is None:
+            renamed_path = path.with_stem(path.stem + "_renamed")
+            logger.warning(
+                "Creating temporary file at %s. Pass scratch_dir_context to avoid this.",
+                renamed_path,
+            )
+        else:
+            renamed_path = scratch_dir_context.get_temp_filename(suffix=path.suffix)
+        write_dataframe(df, renamed_path, overwrite=True)
+        schema.path = str(renamed_path)
+        for column in schema.columns:
+            if column.name in renames:
+                column.name = renames[column.name]
+                column.dimension_type = None
+    return df
+
+
+def _get_column_renames(schema: FileSchema) -> dict[str, str]:
+    """Return a mapping of columns to rename."""
+    mapping: dict[str, str] = {}
+    for column in schema.columns:
+        if column.dimension_type is not None and column.name != column.dimension_type.value:
+            mapping[column.name] = column.dimension_type.value
+    return mapping
+
+
+def _rename_columns(df: DataFrame, mapping: dict[str, str]) -> DataFrame:
+    for old_name, new_name in mapping.items():
+        df = df.withColumnRenamed(old_name, new_name)
+        logger.info("Renamed column %s to %s", old_name, new_name)
+    return df
+
+
+def _drop_ignored_columns(df: DataFrame, ignore_columns: list[str]) -> DataFrame:
+    if not ignore_columns:
+        return df
+
+    existing_columns = set(df.columns)
+    for col in ignore_columns:
+        if col in existing_columns:
+            df = df.drop(col)
+            logger.info("Dropped ignored column: %s", col)
+        else:
+            logger.warning("Ignored column '%s' not found in file", col)
+    return df
+
+
+def _get_column_schema(schema: FileSchema, backend_mapping: dict) -> dict[str, str] | None:
+    column_types = schema.get_data_type_mapping()
+    if not column_types:
+        return None
+
+    mapped_schema: dict[str, str] = {}
+    for key, val in column_types.items():
+        col_type = val.upper()
+        if col_type not in backend_mapping:
+            options = " ".join(sorted(backend_mapping.keys()))
+            msg = f"column type = {val} is not supported. {options=}"
+            raise DSGInvalidField(msg)
+        mapped_schema[key] = backend_mapping[col_type]
+    return mapped_schema
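For reference, a hedged usage sketch of the FileSchema/read_data_file API defined above. The file path, the "DOUBLE" data type, and DimensionType.GEOGRAPHY are assumptions for illustration (the data type must be one of SUPPORTED_TYPES and a CSV must exist at the given path); this is not an authoritative recipe from the package.

from dsgrid.config.file_schema import Column, FileSchema, read_data_file
from dsgrid.dimension.base_models import DimensionType

schema = FileSchema(
    path="load_data_lookup.csv",  # hypothetical input file
    columns=[
        # "county" stands in for the geography dimension; read_data_file renames it.
        Column(name="county", dimension_type=DimensionType.GEOGRAPHY),
        Column(name="value", data_type="DOUBLE"),  # assumes DOUBLE is a supported type
    ],
    ignore_columns=["notes"],  # dropped when the file is read
)

# Without a ScratchDirContext, a renamed copy is written next to the original file
# (read_data_file logs a warning about this).
df = read_data_file(schema)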
dsgrid/config/index_time_dimension_config.py
@@ -0,0 +1,80 @@
+import logging
+from datetime import datetime, timedelta
+from typing import Union
+
+import chronify
+import pandas as pd
+
+from dsgrid.time.types import IndexTimestampType
+from .dimensions import IndexTimeDimensionModel
+from .time_dimension_base_config import TimeDimensionBaseConfig
+from dsgrid.dimension.time import TimeIntervalType
+
+
+logger = logging.getLogger(__name__)
+
+
+class IndexTimeDimensionConfig(TimeDimensionBaseConfig):
+    """Provides an interface to a IndexTimeDimensionModel."""
+
+    @staticmethod
+    def model_class() -> IndexTimeDimensionModel:
+        return IndexTimeDimensionModel
+
+    def supports_chronify(self) -> bool:
+        return True
+
+    def to_chronify(
+        self,
+    ) -> Union[
+        chronify.IndexTimeRangeTZ, chronify.IndexTimeRangeNTZ, chronify.IndexTimeRangeWithTZColumn
+    ]:
+        time_cols = self.get_load_data_time_columns()
+        assert len(self._model.ranges) == 1
+        assert len(time_cols) == 1
+
+        # IndexTimeDimensionModel does not map to IndexTimeRangeNTZ and TZ at the moment
+        assert self.get_time_zone() is None
+        config = chronify.IndexTimeRangeWithTZColumn(
+            time_column=time_cols[0],
+            start=self._model.ranges[0].start,
+            length=self.get_lengths()[0],
+            start_timestamp=pd.Timestamp(self.get_start_times()[0]),
+            resolution=self.get_frequency(),
+            time_zone_column="time_zone",
+            measurement_type=self._model.measurement_type,
+            interval_type=self._model.time_interval_type,
+        )
+        return config
+
+    def get_frequency(self) -> timedelta:
+        freqs = [trange.frequency for trange in self.model.ranges]
+        if len(set(freqs)) > 1:
+            msg = f"IndexTimeDimensionConfig.get_frequency found multiple frequencies: {freqs}"
+            raise ValueError(msg)
+        return freqs[0]
+
+    def get_start_times(self) -> list[pd.Timestamp]:
+        """get represented start times"""
+        tz = self.get_tzinfo()
+        start_times = []
+        for trange in self.model.ranges:
+            start = datetime.strptime(trange.starting_timestamp, trange.str_format)
+            assert start.tzinfo is None
+            start_times.append(start.replace(tzinfo=tz))
+        return start_times
+
+    def get_lengths(self) -> list[int]:
+        return [trange.end - trange.start + 1 for trange in self.model.ranges]
+
+    def get_load_data_time_columns(self) -> list[str]:
+        return list(IndexTimestampType._fields)
+
+    def get_time_zone(self) -> None:
+        return None
+
+    def get_tzinfo(self) -> None:
+        return None
+
+    def get_time_interval_type(self) -> TimeIntervalType:
+        return self.model.time_interval_type
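A small worked example of the get_lengths() arithmetic above: a single index range covering one year of hourly values, indexed 0 through 8759, yields a length of 8760. The numbers are illustrative only.

start, end = 0, 8759      # hypothetical trange.start / trange.end
length = end - start + 1  # same formula as get_lengths()
assert length == 8760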
dsgrid/config/input_dataset_requirements.py
@@ -0,0 +1,31 @@
+"""Defines dataset dimension requirements for a project."""
+
+from pydantic import conlist, Field
+
+from dsgrid.config.project_config import RequiredDimensionsModel, InputDatasetModel
+from dsgrid.data_models import DSGBaseModel
+
+
+class InputDatasetDimensionRequirementsModel(DSGBaseModel):
+    """Defines dataset dimension requirements."""
+
+    dataset_id: str
+    required_dimensions: RequiredDimensionsModel = Field(
+        title="required_dimensions",
+        description="Defines required record IDs that must exist for each dimension.",
+    )
+
+
+class InputDatasetDimensionRequirementsListModel(DSGBaseModel):
+    """Defines a list of dataset dimension requirements."""
+
+    dataset_dimension_requirements: conlist(
+        InputDatasetDimensionRequirementsModel, min_length=1
+    ) = Field(description="List of dataset dimension requirements")
+
+
+class InputDatasetListModel(DSGBaseModel):
+    datasets: conlist(InputDatasetModel, min_length=1) = Field(
+        title="datasets",
+        description="List of input datasets for the project.",
+    )
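The list models above use pydantic's conlist(..., min_length=1), so an empty list fails validation. A self-contained illustration of that constraint with a stand-in model (not dsgrid's):

from pydantic import BaseModel, ValidationError, conlist


class _ListModel(BaseModel):
    # Stand-in for the conlist(..., min_length=1) fields above.
    items: conlist(str, min_length=1)


_ListModel(items=["dataset_a"])  # passes
try:
    _ListModel(items=[])
except ValidationError:
    print("an empty list is rejected")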
dsgrid/config/mapping_tables.py
@@ -0,0 +1,209 @@
+import csv
+import logging
+import os
+
+
+from pydantic import field_validator, Field, ValidationInfo, field_serializer
+
+from dsgrid.config.dimension_mapping_base import (
+    DimensionMappingBaseModel,
+    DimensionMappingDatasetToProjectBaseModel,
+    DimensionMappingPreRegisteredBaseModel,
+)
+from dsgrid.config.dimensions import DimensionReferenceModel
+from dsgrid.data_models import DSGBaseModel
+from dsgrid.utils.files import compute_file_hash
+from dsgrid.utils.utilities import convert_record_dicts_to_classes
+from .config_base import ConfigWithRecordFileBase
+
+
+logger = logging.getLogger(__name__)
+
+
+class MappingTableRecordModel(DSGBaseModel):
+    """Represents one record in dimension mapping record files. Maps one dimension to another."""
+
+    from_id: str = Field(
+        title="from_id",
+        description="Source mapping",
+    )
+    to_id: str | None = Field(
+        default=None,
+        title="to_id",
+        description="Destination mapping",
+    )
+    from_fraction: float = Field(
+        title="from_fraction",
+        description="Fraction of from_id to map to to_id",
+        default=1.0,
+    )
+
+    @field_validator("from_id", "to_id")
+    @classmethod
+    def check_to_id(cls, val):
+        if val == "":
+            return None
+        return val
+
+
+class MappingTableByNameModel(DimensionMappingPreRegisteredBaseModel):
+    """Attributes for a dimension mapping table for soon-to-be registered dimensions by name.
+    This will be converted to a MappingTableModel as soon as the dimensions are registered.
+    """
+
+    filename: str = Field(
+        title="filename",
+        alias="file",
+        description="Filename containing association table records.",
+    )
+
+
+class DatasetBaseToProjectMappingTableModel(DimensionMappingDatasetToProjectBaseModel):
+    """Attributes for a dimension mapping table to map soon-to-be-registered dataset base
+    dimensions to a project's dimensions. This will be converted to a MappingTableModel as soon as
+    the dimensions are registered.
+    """
+
+    filename: str = Field(
+        title="filename",
+        alias="file",
+        description="Filename containing association table records.",
+    )
+
+
+class DatasetBaseToProjectMappingTableListModel(DSGBaseModel):
+    """Represents the config file passed to register-and-submit-dataset command."""
+
+    mappings: list[DatasetBaseToProjectMappingTableModel]
+
+
+class MappingTableModel(DimensionMappingBaseModel):
+    """Attributes for a dimension mapping table"""
+
+    filename: str | None = Field(
+        title="filename",
+        alias="file",
+        default=None,
+        description="Filename containing association table records. Only assigned for user input "
+        "and output purposes. The registry database stores records in the mapping JSON document.",
+    )
+    file_hash: str | None = Field(
+        title="file_hash",
+        description="Hash of the contents of the file, computed by dsgrid.",
+        json_schema_extra={
+            "dsgrid_internal": True,
+        },
+        default=None,
+    )
+    records: list = Field(
+        title="records",
+        description="dimension mapping records in filename that get loaded at runtime",
+        json_schema_extra={
+            "dsgrid_internal": True,
+        },
+        default=[],
+    )
+
+    @field_validator("filename")
+    @classmethod
+    def check_filename(cls, filename):
+        """Validate record file"""
+        if filename is not None:
+            if filename:
+                if not os.path.isfile(filename):
+                    msg = f"{filename} does not exist"
+                    raise ValueError(msg)
+                if not filename.endswith(".csv"):
+                    msg = f"only CSV is supported: {filename}"
+                    raise ValueError(msg)
+        return filename
+
+    @field_validator("file_hash")
+    @classmethod
+    def compute_file_hash(cls, file_hash, info: ValidationInfo):
+        """Compute file hash."""
+        if "filename" not in info.data:
+            return file_hash
+
+        if not file_hash:
+            file_hash = compute_file_hash(info.data["filename"])
+        return file_hash
+
+    @field_validator("records")
+    @classmethod
+    def add_records(cls, records, info: ValidationInfo):
+        """Add records from the file."""
+        if "filename" not in info.data:
+            return records
+
+        if records:
+            if isinstance(records[0], dict):
+                records = convert_record_dicts_to_classes(records, MappingTableRecordModel)
+            return records
+
+        with open(info.data["filename"], encoding="utf-8-sig") as f_in:
+            return convert_record_dicts_to_classes(csv.DictReader(f_in), MappingTableRecordModel)
+
+    @field_serializer("filename")
+    def serialize_cls(self, val, _):
+        return None
+
+    @classmethod
+    def from_pre_registered_model(
+        cls,
+        model: MappingTableByNameModel | DatasetBaseToProjectMappingTableModel,
+        from_dimension: DimensionReferenceModel,
+        to_dimension: DimensionReferenceModel,
+    ):
+        return MappingTableModel(
+            mapping_type=model.mapping_type,
+            archetype=model.archetype,
+            from_dimension=from_dimension,
+            to_dimension=to_dimension,
+            description=model.description,
+            file=model.filename,
+            from_fraction_tolerance=model.from_fraction_tolerance,
+            to_fraction_tolerance=model.to_fraction_tolerance,
+        )
+
+
+class MappingTableConfig(ConfigWithRecordFileBase):
+    """Provides an interface to an MappingTableModel"""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._dataframe = None
+
+    @staticmethod
+    def config_filename():
+        return "dimension_mapping.json5"
+
+    @property
+    def config_id(self):
+        return self.model.mapping_id
+
+    @staticmethod
+    def model_class():
+        return MappingTableModel
+
+    def get_unique_from_ids(self):
+        """Return the unique from IDs in an association table's records.
+
+        Returns
+        -------
+        set
+            set of str
+
+        """
+        return {x.from_id for x in self.model.records}
+
+    def get_unique_to_ids(self):
+        """Return the unique to IDs in an association table's records.
+
+        Returns
+        -------
+        set
+            set of str
+
+        """
+        return {x.to_id for x in self.model.records}
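An illustrative sketch of the record shape that MappingTableRecordModel accepts; the IDs below are made up. Note that the validator above coerces an empty-string to_id to None and that from_fraction defaults to 1.0.

from dsgrid.config.mapping_tables import MappingTableRecordModel

records = [
    MappingTableRecordModel(from_id="08001", to_id="CO", from_fraction=1.0),
    MappingTableRecordModel(from_id="08003", to_id="CO"),  # from_fraction defaults to 1.0
    MappingTableRecordModel(from_id="08005", to_id=""),    # empty to_id becomes None
]
assert records[2].to_id is None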
dsgrid/config/noop_time_dimension_config.py
@@ -0,0 +1,42 @@
+from datetime import timedelta
+
+from .dimensions import NoOpTimeDimensionModel
+from .time_dimension_base_config import TimeDimensionBaseConfig
+
+
+class NoOpTimeDimensionConfig(TimeDimensionBaseConfig):
+    """Provides an interface to an NoOpTimeDimensionModel."""
+
+    @staticmethod
+    def model_class() -> NoOpTimeDimensionModel:
+        return NoOpTimeDimensionModel
+
+    def check_dataset_time_consistency(self, load_data_df, time_columns) -> None:
+        pass
+
+    def get_frequency(self) -> timedelta:
+        return timedelta(days=0)
+
+    def get_time_ranges(self) -> list:
+        return []
+
+    def get_start_times(self) -> list:
+        return []
+
+    def get_lengths(self) -> list:
+        return []
+
+    def get_load_data_time_columns(self) -> list:
+        return []
+
+    def get_time_zone(self) -> None:
+        return None
+
+    def get_tzinfo(self) -> None:
+        return None
+
+    def get_time_interval_type(self) -> None:
+        return None
+
+    def list_expected_dataset_timestamps(self) -> list:
+        return []