dsgrid_toolkit-0.3.3-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. build_backend.py +93 -0
  2. dsgrid/__init__.py +22 -0
  3. dsgrid/api/__init__.py +0 -0
  4. dsgrid/api/api_manager.py +179 -0
  5. dsgrid/api/app.py +419 -0
  6. dsgrid/api/models.py +60 -0
  7. dsgrid/api/response_models.py +116 -0
  8. dsgrid/apps/__init__.py +0 -0
  9. dsgrid/apps/project_viewer/app.py +216 -0
  10. dsgrid/apps/registration_gui.py +444 -0
  11. dsgrid/chronify.py +32 -0
  12. dsgrid/cli/__init__.py +0 -0
  13. dsgrid/cli/common.py +120 -0
  14. dsgrid/cli/config.py +176 -0
  15. dsgrid/cli/download.py +13 -0
  16. dsgrid/cli/dsgrid.py +157 -0
  17. dsgrid/cli/dsgrid_admin.py +92 -0
  18. dsgrid/cli/install_notebooks.py +62 -0
  19. dsgrid/cli/query.py +729 -0
  20. dsgrid/cli/registry.py +1862 -0
  21. dsgrid/cloud/__init__.py +0 -0
  22. dsgrid/cloud/cloud_storage_interface.py +140 -0
  23. dsgrid/cloud/factory.py +31 -0
  24. dsgrid/cloud/fake_storage_interface.py +37 -0
  25. dsgrid/cloud/s3_storage_interface.py +156 -0
  26. dsgrid/common.py +36 -0
  27. dsgrid/config/__init__.py +0 -0
  28. dsgrid/config/annual_time_dimension_config.py +194 -0
  29. dsgrid/config/common.py +142 -0
  30. dsgrid/config/config_base.py +148 -0
  31. dsgrid/config/dataset_config.py +907 -0
  32. dsgrid/config/dataset_schema_handler_factory.py +46 -0
  33. dsgrid/config/date_time_dimension_config.py +136 -0
  34. dsgrid/config/dimension_config.py +54 -0
  35. dsgrid/config/dimension_config_factory.py +65 -0
  36. dsgrid/config/dimension_mapping_base.py +350 -0
  37. dsgrid/config/dimension_mappings_config.py +48 -0
  38. dsgrid/config/dimensions.py +1025 -0
  39. dsgrid/config/dimensions_config.py +71 -0
  40. dsgrid/config/file_schema.py +190 -0
  41. dsgrid/config/index_time_dimension_config.py +80 -0
  42. dsgrid/config/input_dataset_requirements.py +31 -0
  43. dsgrid/config/mapping_tables.py +209 -0
  44. dsgrid/config/noop_time_dimension_config.py +42 -0
  45. dsgrid/config/project_config.py +1462 -0
  46. dsgrid/config/registration_models.py +188 -0
  47. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  48. dsgrid/config/simple_models.py +49 -0
  49. dsgrid/config/supplemental_dimension.py +29 -0
  50. dsgrid/config/time_dimension_base_config.py +192 -0
  51. dsgrid/data_models.py +155 -0
  52. dsgrid/dataset/__init__.py +0 -0
  53. dsgrid/dataset/dataset.py +123 -0
  54. dsgrid/dataset/dataset_expression_handler.py +86 -0
  55. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  56. dsgrid/dataset/dataset_schema_handler_base.py +945 -0
  57. dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
  58. dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
  59. dsgrid/dataset/growth_rates.py +162 -0
  60. dsgrid/dataset/models.py +51 -0
  61. dsgrid/dataset/table_format_handler_base.py +257 -0
  62. dsgrid/dataset/table_format_handler_factory.py +17 -0
  63. dsgrid/dataset/unpivoted_table.py +121 -0
  64. dsgrid/dimension/__init__.py +0 -0
  65. dsgrid/dimension/base_models.py +230 -0
  66. dsgrid/dimension/dimension_filters.py +308 -0
  67. dsgrid/dimension/standard.py +252 -0
  68. dsgrid/dimension/time.py +352 -0
  69. dsgrid/dimension/time_utils.py +103 -0
  70. dsgrid/dsgrid_rc.py +88 -0
  71. dsgrid/exceptions.py +105 -0
  72. dsgrid/filesystem/__init__.py +0 -0
  73. dsgrid/filesystem/cloud_filesystem.py +32 -0
  74. dsgrid/filesystem/factory.py +32 -0
  75. dsgrid/filesystem/filesystem_interface.py +136 -0
  76. dsgrid/filesystem/local_filesystem.py +74 -0
  77. dsgrid/filesystem/s3_filesystem.py +118 -0
  78. dsgrid/loggers.py +132 -0
  79. dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
  80. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
  81. dsgrid/notebooks/registration.ipynb +48 -0
  82. dsgrid/notebooks/start_notebook.sh +11 -0
  83. dsgrid/project.py +451 -0
  84. dsgrid/query/__init__.py +0 -0
  85. dsgrid/query/dataset_mapping_plan.py +142 -0
  86. dsgrid/query/derived_dataset.py +388 -0
  87. dsgrid/query/models.py +728 -0
  88. dsgrid/query/query_context.py +287 -0
  89. dsgrid/query/query_submitter.py +994 -0
  90. dsgrid/query/report_factory.py +19 -0
  91. dsgrid/query/report_peak_load.py +70 -0
  92. dsgrid/query/reports_base.py +20 -0
  93. dsgrid/registry/__init__.py +0 -0
  94. dsgrid/registry/bulk_register.py +165 -0
  95. dsgrid/registry/common.py +287 -0
  96. dsgrid/registry/config_update_checker_base.py +63 -0
  97. dsgrid/registry/data_store_factory.py +34 -0
  98. dsgrid/registry/data_store_interface.py +74 -0
  99. dsgrid/registry/dataset_config_generator.py +158 -0
  100. dsgrid/registry/dataset_registry_manager.py +950 -0
  101. dsgrid/registry/dataset_update_checker.py +16 -0
  102. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  103. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  104. dsgrid/registry/dimension_registry_manager.py +413 -0
  105. dsgrid/registry/dimension_update_checker.py +16 -0
  106. dsgrid/registry/duckdb_data_store.py +207 -0
  107. dsgrid/registry/filesystem_data_store.py +150 -0
  108. dsgrid/registry/filter_registry_manager.py +123 -0
  109. dsgrid/registry/project_config_generator.py +57 -0
  110. dsgrid/registry/project_registry_manager.py +1623 -0
  111. dsgrid/registry/project_update_checker.py +48 -0
  112. dsgrid/registry/registration_context.py +223 -0
  113. dsgrid/registry/registry_auto_updater.py +316 -0
  114. dsgrid/registry/registry_database.py +667 -0
  115. dsgrid/registry/registry_interface.py +446 -0
  116. dsgrid/registry/registry_manager.py +558 -0
  117. dsgrid/registry/registry_manager_base.py +367 -0
  118. dsgrid/registry/versioning.py +92 -0
  119. dsgrid/rust_ext/__init__.py +14 -0
  120. dsgrid/rust_ext/find_minimal_patterns.py +129 -0
  121. dsgrid/spark/__init__.py +0 -0
  122. dsgrid/spark/functions.py +589 -0
  123. dsgrid/spark/types.py +110 -0
  124. dsgrid/tests/__init__.py +0 -0
  125. dsgrid/tests/common.py +140 -0
  126. dsgrid/tests/make_us_data_registry.py +265 -0
  127. dsgrid/tests/register_derived_datasets.py +103 -0
  128. dsgrid/tests/utils.py +25 -0
  129. dsgrid/time/__init__.py +0 -0
  130. dsgrid/time/time_conversions.py +80 -0
  131. dsgrid/time/types.py +67 -0
  132. dsgrid/units/__init__.py +0 -0
  133. dsgrid/units/constants.py +113 -0
  134. dsgrid/units/convert.py +71 -0
  135. dsgrid/units/energy.py +145 -0
  136. dsgrid/units/power.py +87 -0
  137. dsgrid/utils/__init__.py +0 -0
  138. dsgrid/utils/dataset.py +830 -0
  139. dsgrid/utils/files.py +179 -0
  140. dsgrid/utils/filters.py +125 -0
  141. dsgrid/utils/id_remappings.py +100 -0
  142. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  143. dsgrid/utils/py_expression_eval/README.md +8 -0
  144. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  145. dsgrid/utils/py_expression_eval/tests.py +283 -0
  146. dsgrid/utils/run_command.py +70 -0
  147. dsgrid/utils/scratch_dir_context.py +65 -0
  148. dsgrid/utils/spark.py +918 -0
  149. dsgrid/utils/spark_partition.py +98 -0
  150. dsgrid/utils/timing.py +239 -0
  151. dsgrid/utils/utilities.py +221 -0
  152. dsgrid/utils/versioning.py +36 -0
  153. dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
  154. dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
  155. dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
  156. dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
  157. dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
dsgrid/config/dimensions_config.py
@@ -0,0 +1,71 @@
+ import logging
+ from pathlib import Path
+
+ from pydantic import field_validator, Field
+
+ from dsgrid.data_models import DSGBaseModel
+ from dsgrid.utils.utilities import check_uniqueness
+ from .config_base import ConfigBase
+ from .dimensions import DimensionModel, DimensionsListModel
+
+ logger = logging.getLogger(__name__)
+
+
+ class DimensionsConfigModel(DSGBaseModel):
+     """Represents multiple dimension models.
+
+     Used when registering multiple dimensions in one command.
+     """
+
+     dimensions: DimensionsListModel = Field(
+         title="dimensions",
+         description="Dimensions for submission to the dimension registry",
+     )
+
+     @field_validator("dimensions")
+     @classmethod
+     def check_files(cls, values: dict) -> dict:
+         """Validate dimension files are unique across all dimensions"""
+         check_uniqueness(
+             (x.filename for x in values if isinstance(x, DimensionModel) and x.filename),
+             "dimension record filename",
+         )
+         return values
+
+     @field_validator("dimensions")
+     @classmethod
+     def check_names(cls, values: dict) -> dict:
+         """Validate dimension names are unique across all dimensions."""
+         check_uniqueness(
+             [dim.name for dim in values],
+             "dimension record name",
+         )
+         return values
+
+
+ class DimensionsConfig(ConfigBase):
+     """Provides an interface to a DimensionsConfigModel."""
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self._src_dir = None
+
+     @staticmethod
+     def config_filename():
+         return "dimensions.json5"
+
+     @property
+     def config_id(self):
+         assert False, "not correct for this class"
+
+     @staticmethod
+     def model_class():
+         return DimensionsConfigModel
+
+     @classmethod
+     def load(cls, config_filename: Path, *args, **kwargs):
+         return super().load(config_filename, *args, **kwargs)
+
+     @classmethod
+     def load_from_model(cls, model):
+         return cls(model)
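The two validators above enforce the same rule on different fields: record filenames and dimension names must each be unique across all entries in `dimensions`. The following standalone sketch (plain Python, not the dsgrid `check_uniqueness` implementation, whose signature is only shown through its call sites above) illustrates the behavior assumed of that helper.

from collections import Counter

def assert_unique(values, label):
    # Raise if any value appears more than once, mirroring the behavior the
    # validators above expect from check_uniqueness.
    duplicates = [value for value, count in Counter(values).items() if count > 1]
    if duplicates:
        raise ValueError(f"duplicate {label}: {duplicates}")

assert_unique(["geography", "sector"], "dimension record name")  # passes
try:
    assert_unique(["geography", "geography"], "dimension record name")
except ValueError as exc:
    print(exc)  # duplicate dimension record name: ['geography']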
dsgrid/config/file_schema.py
@@ -0,0 +1,190 @@
+ import logging
+ from pathlib import Path
+ from typing import Self
+
+ from pydantic import Field, field_validator, model_validator
+
+ from dsgrid.data_models import DSGBaseModel
+ from dsgrid.dimension.base_models import DimensionType
+ from dsgrid.exceptions import DSGInvalidDataset, DSGInvalidField
+ from dsgrid.spark.functions import read_csv_duckdb, read_json, read_parquet
+ from dsgrid.spark.types import DataFrame, DUCKDB_COLUMN_TYPES, SUPPORTED_TYPES
+ from dsgrid.utils.scratch_dir_context import ScratchDirContext
+ from dsgrid.utils.spark import write_dataframe
+ from dsgrid.utils.utilities import check_uniqueness
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class Column(DSGBaseModel):
+     name: str = Field(description="Name of the column")
+     dimension_type: DimensionType | None = Field(
+         default=None,
+         description="Dimension represented by the data in the column. Optional if this is a "
+         "time column or pivoted column. Required if the column represents a stacked dimension "
+         "but an alternate name is being used, such as 'county' instead of 'geography'. "
+         "dsgrid will rename any column that is set at runtime, writing out the result to the "
+         "registry's data directory. The original dataset is not modified.",
+     )
+     data_type: str | None = Field(
+         default=None, description="Type of the data in the column. If None, infer the type."
+     )
+
+     @field_validator("data_type")
+     @classmethod
+     def check_data_type(cls, data_type: str | None) -> str | None:
+         if data_type is None:
+             return None
+
+         type_upper = data_type.upper()
+         if type_upper not in SUPPORTED_TYPES:
+             supported_data_types = sorted(SUPPORTED_TYPES)
+             msg = f"{data_type=} is not one of {supported_data_types=}"
+             raise ValueError(msg)
+         return type_upper
+
+
+ class FileSchema(DSGBaseModel):
+     """Defines the format of a data file (CSV, JSON, Parquet)."""
+
+     path: str | None = Field(description="Path to the file. Must be assigned during registration.")
+     columns: list[Column] = Field(
+         default=[], description="Custom schema for the columns in the file."
+     )
+     ignore_columns: list[str] = Field(
+         default=[],
+         description="List of column names to ignore (drop) when reading the file.",
+     )
+
+     @model_validator(mode="after")
+     def check_consistency(self) -> Self:
+         if len(self.columns) > 1:
+             check_uniqueness((x.name for x in self.columns), "column names")
+
+         # Check that ignore_columns don't overlap with columns
+         column_names = {x.name for x in self.columns}
+         ignore_set = set(self.ignore_columns)
+         overlap = column_names & ignore_set
+         if overlap:
+             msg = f"Columns cannot be in both 'columns' and 'ignore_columns': {overlap}"
+             raise ValueError(msg)
+
+         return self
+
+     def get_data_type_mapping(self) -> dict[str, str]:
+         """Return the mapping of column to data type."""
+         return {x.name: x.data_type for x in self.columns if x.data_type is not None}
+
+
+ def read_data_file(
+     schema: FileSchema, scratch_dir_context: ScratchDirContext | None = None
+ ) -> DataFrame:
+     """Read a data file from a schema.
+
+     Parameters
+     ----------
+     schema : FileSchema
+         Schema defining the file path and column types.
+     scratch_dir_context : ScratchDirContext
+         Optional location to write temporary files.
+
+     Returns
+     -------
+     DataFrame
+         A Spark DataFrame containing the file data.
+     """
+     if schema.path is None:
+         msg = "File path is not assigned"
+         raise DSGInvalidDataset(msg)
+
+     path = Path(schema.path)
+     if not path.exists():
+         msg = f"{path} does not exist"
+         raise FileNotFoundError(msg)
+
+     expected_columns = {x.name for x in schema.columns}
+
+     match path.suffix:
+         case ".parquet":
+             df = read_parquet(path)
+         case ".csv":
+             column_schema = _get_column_schema(schema, DUCKDB_COLUMN_TYPES)
+             df = read_csv_duckdb(path, schema=column_schema)
+         case ".json":
+             df = read_json(path)
+         case _:
+             msg = f"Unsupported file type: {path.suffix}"
+             raise DSGInvalidDataset(msg)
+
+     actual_columns = set(df.columns)
+     diff = expected_columns.difference(actual_columns)
+     if diff:
+         msg = f"Expected columns {diff} are not in {actual_columns=}"
+         raise DSGInvalidDataset(msg)
+
+     df = _drop_ignored_columns(df, schema.ignore_columns)
+     renames = _get_column_renames(schema)
+     if renames:
+         df = _rename_columns(df, renames)
+         if scratch_dir_context is None:
+             renamed_path = path.with_stem(path.stem + "_renamed")
+             logger.warning(
+                 "Creating temporary file at %s. Pass scratch_dir_context to avoid this.",
+                 renamed_path,
+             )
+         else:
+             renamed_path = scratch_dir_context.get_temp_filename(suffix=path.suffix)
+         write_dataframe(df, renamed_path, overwrite=True)
+         schema.path = str(renamed_path)
+         for column in schema.columns:
+             if column.name in renames:
+                 column.name = renames[column.name]
+                 column.dimension_type = None
+     return df
+
+
+ def _get_column_renames(schema: FileSchema) -> dict[str, str]:
+     """Return a mapping of columns to rename."""
+     mapping: dict[str, str] = {}
+     for column in schema.columns:
+         if column.dimension_type is not None and column.name != column.dimension_type.value:
+             mapping[column.name] = column.dimension_type.value
+     return mapping
+
+
+ def _rename_columns(df: DataFrame, mapping: dict[str, str]) -> DataFrame:
+     for old_name, new_name in mapping.items():
+         df = df.withColumnRenamed(old_name, new_name)
+         logger.info("Renamed column %s to %s", old_name, new_name)
+     return df
+
+
+ def _drop_ignored_columns(df: DataFrame, ignore_columns: list[str]) -> DataFrame:
+     if not ignore_columns:
+         return df
+
+     existing_columns = set(df.columns)
+     for col in ignore_columns:
+         if col in existing_columns:
+             df = df.drop(col)
+             logger.info("Dropped ignored column: %s", col)
+         else:
+             logger.warning("Ignored column '%s' not found in file", col)
+     return df
+
+
+ def _get_column_schema(schema: FileSchema, backend_mapping: dict) -> dict[str, str] | None:
+     column_types = schema.get_data_type_mapping()
+     if not column_types:
+         return None
+
+     mapped_schema: dict[str, str] = {}
+     for key, val in column_types.items():
+         col_type = val.upper()
+         if col_type not in backend_mapping:
+             options = " ".join(sorted(backend_mapping.keys()))
+             msg = f"column type = {val} is not supported. {options=}"
+             raise DSGInvalidField(msg)
+         mapped_schema[key] = backend_mapping[col_type]
+     return mapped_schema
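A minimal usage sketch of the classes and function defined above. The CSV file name and column names are hypothetical, DimensionType.GEOGRAPHY is assumed to exist (the Column description above contrasts "county" with "geography"), and the import path follows the dsgrid/config/file_schema.py entry in the file list.

from dsgrid.config.file_schema import Column, FileSchema, read_data_file
from dsgrid.dimension.base_models import DimensionType

# Hypothetical CSV with columns "county", "value", and "notes".
schema = FileSchema(
    path="load_data.csv",
    columns=[
        # "county" is a stacked geography dimension under an alternate name;
        # read_data_file renames it to "geography" and rewrites the file.
        Column(name="county", dimension_type=DimensionType.GEOGRAPHY),
        Column(name="value"),  # data_type omitted so the type is inferred
    ],
    ignore_columns=["notes"],  # dropped before the rename step
)

df = read_data_file(schema)
# Without a scratch_dir_context, the renamed copy is written next to the
# original (load_data_renamed.csv), schema.path is updated to point at it,
# and the "county" Column entry becomes "geography" with dimension_type=None.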
dsgrid/config/index_time_dimension_config.py
@@ -0,0 +1,80 @@
+ import logging
+ from datetime import datetime, timedelta
+ from typing import Union
+
+ import chronify
+ import pandas as pd
+
+ from dsgrid.time.types import IndexTimestampType
+ from .dimensions import IndexTimeDimensionModel
+ from .time_dimension_base_config import TimeDimensionBaseConfig
+ from dsgrid.dimension.time import TimeIntervalType
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class IndexTimeDimensionConfig(TimeDimensionBaseConfig):
+     """Provides an interface to an IndexTimeDimensionModel."""
+
+     @staticmethod
+     def model_class() -> IndexTimeDimensionModel:
+         return IndexTimeDimensionModel
+
+     def supports_chronify(self) -> bool:
+         return True
+
+     def to_chronify(
+         self,
+     ) -> Union[
+         chronify.IndexTimeRangeTZ, chronify.IndexTimeRangeNTZ, chronify.IndexTimeRangeWithTZColumn
+     ]:
+         time_cols = self.get_load_data_time_columns()
+         assert len(self._model.ranges) == 1
+         assert len(time_cols) == 1
+
+         # IndexTimeDimensionModel does not map to IndexTimeRangeNTZ and TZ at the moment
+         assert self.get_time_zone() is None
+         config = chronify.IndexTimeRangeWithTZColumn(
+             time_column=time_cols[0],
+             start=self._model.ranges[0].start,
+             length=self.get_lengths()[0],
+             start_timestamp=pd.Timestamp(self.get_start_times()[0]),
+             resolution=self.get_frequency(),
+             time_zone_column="time_zone",
+             measurement_type=self._model.measurement_type,
+             interval_type=self._model.time_interval_type,
+         )
+         return config
+
+     def get_frequency(self) -> timedelta:
+         freqs = [trange.frequency for trange in self.model.ranges]
+         if len(set(freqs)) > 1:
+             msg = f"IndexTimeDimensionConfig.get_frequency found multiple frequencies: {freqs}"
+             raise ValueError(msg)
+         return freqs[0]
+
+     def get_start_times(self) -> list[pd.Timestamp]:
+         """Get represented start times."""
+         tz = self.get_tzinfo()
+         start_times = []
+         for trange in self.model.ranges:
+             start = datetime.strptime(trange.starting_timestamp, trange.str_format)
+             assert start.tzinfo is None
+             start_times.append(start.replace(tzinfo=tz))
+         return start_times
+
+     def get_lengths(self) -> list[int]:
+         return [trange.end - trange.start + 1 for trange in self.model.ranges]
+
+     def get_load_data_time_columns(self) -> list[str]:
+         return list(IndexTimestampType._fields)
+
+     def get_time_zone(self) -> None:
+         return None
+
+     def get_tzinfo(self) -> None:
+         return None
+
+     def get_time_interval_type(self) -> TimeIntervalType:
+         return self.model.time_interval_type
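A short standalone sketch of the arithmetic these accessors perform. The timestamp string and format are hypothetical, and the range field names (start, end, starting_timestamp, str_format) are taken from how the range model is used above; the model itself is defined in dsgrid/config/dimensions.py and is not part of this diff.

from datetime import datetime

# get_lengths(): an index range is inclusive, so an hourly year spanning
# indexes 0..8759 has 8760 entries.
start, end = 0, 8759
length = end - start + 1  # 8760

# get_start_times(): the starting timestamp is parsed with str_format; because
# get_tzinfo() returns None for this dimension type, the result stays naive and
# time zones are supplied later through the "time_zone" column used in to_chronify().
start_time = datetime.strptime("2018-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")
assert start_time.tzinfo is None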
dsgrid/config/input_dataset_requirements.py
@@ -0,0 +1,31 @@
+ """Defines dataset dimension requirements for a project."""
+
+ from pydantic import conlist, Field
+
+ from dsgrid.config.project_config import RequiredDimensionsModel, InputDatasetModel
+ from dsgrid.data_models import DSGBaseModel
+
+
+ class InputDatasetDimensionRequirementsModel(DSGBaseModel):
+     """Defines dataset dimension requirements."""
+
+     dataset_id: str
+     required_dimensions: RequiredDimensionsModel = Field(
+         title="required_dimensions",
+         description="Defines required record IDs that must exist for each dimension.",
+     )
+
+
+ class InputDatasetDimensionRequirementsListModel(DSGBaseModel):
+     """Defines a list of dataset dimension requirements."""
+
+     dataset_dimension_requirements: conlist(
+         InputDatasetDimensionRequirementsModel, min_length=1
+     ) = Field(description="List of dataset dimension requirements")
+
+
+ class InputDatasetListModel(DSGBaseModel):
+     datasets: conlist(InputDatasetModel, min_length=1) = Field(
+         title="datasets",
+         description="List of input datasets for the project.",
+     )
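A hedged sketch of loading a requirements file into the list model defined above. The file path and dataset ID are hypothetical, required_dimensions must satisfy RequiredDimensionsModel (defined in dsgrid/config/project_config.py, not shown in this diff), and the json5 package is assumed to be available since dsgrid config files use the .json5 extension.

import json5  # assumed dependency for reading .json5 config files

from dsgrid.config.input_dataset_requirements import (
    InputDatasetDimensionRequirementsListModel,
)

# dataset_requirements.json5 (hypothetical) contains a
# dataset_dimension_requirements list with at least one entry, each holding a
# dataset_id and a required_dimensions block per RequiredDimensionsModel.
with open("dataset_requirements.json5", encoding="utf-8") as f:
    data = json5.load(f)

model = InputDatasetDimensionRequirementsListModel(**data)
for requirement in model.dataset_dimension_requirements:
    print(requirement.dataset_id)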
dsgrid/config/mapping_tables.py
@@ -0,0 +1,209 @@
+ import csv
+ import logging
+ import os
+
+
+ from pydantic import field_validator, Field, ValidationInfo, field_serializer
+
+ from dsgrid.config.dimension_mapping_base import (
+     DimensionMappingBaseModel,
+     DimensionMappingDatasetToProjectBaseModel,
+     DimensionMappingPreRegisteredBaseModel,
+ )
+ from dsgrid.config.dimensions import DimensionReferenceModel
+ from dsgrid.data_models import DSGBaseModel
+ from dsgrid.utils.files import compute_file_hash
+ from dsgrid.utils.utilities import convert_record_dicts_to_classes
+ from .config_base import ConfigWithRecordFileBase
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class MappingTableRecordModel(DSGBaseModel):
+     """Represents one record in dimension mapping record files. Maps one dimension to another."""
+
+     from_id: str = Field(
+         title="from_id",
+         description="Source mapping",
+     )
+     to_id: str | None = Field(
+         default=None,
+         title="to_id",
+         description="Destination mapping",
+     )
+     from_fraction: float = Field(
+         title="from_fraction",
+         description="Fraction of from_id to map to to_id",
+         default=1.0,
+     )
+
+     @field_validator("from_id", "to_id")
+     @classmethod
+     def check_to_id(cls, val):
+         if val == "":
+             return None
+         return val
+
+
+ class MappingTableByNameModel(DimensionMappingPreRegisteredBaseModel):
+     """Attributes for a dimension mapping table for soon-to-be registered dimensions by name.
+     This will be converted to a MappingTableModel as soon as the dimensions are registered.
+     """
+
+     filename: str = Field(
+         title="filename",
+         alias="file",
+         description="Filename containing association table records.",
+     )
+
+
+ class DatasetBaseToProjectMappingTableModel(DimensionMappingDatasetToProjectBaseModel):
+     """Attributes for a dimension mapping table to map soon-to-be-registered dataset base
+     dimensions to a project's dimensions. This will be converted to a MappingTableModel as soon as
+     the dimensions are registered.
+     """
+
+     filename: str = Field(
+         title="filename",
+         alias="file",
+         description="Filename containing association table records.",
+     )
+
+
+ class DatasetBaseToProjectMappingTableListModel(DSGBaseModel):
+     """Represents the config file passed to the register-and-submit-dataset command."""
+
+     mappings: list[DatasetBaseToProjectMappingTableModel]
+
+
+ class MappingTableModel(DimensionMappingBaseModel):
+     """Attributes for a dimension mapping table"""
+
+     filename: str | None = Field(
+         title="filename",
+         alias="file",
+         default=None,
+         description="Filename containing association table records. Only assigned for user input "
+         "and output purposes. The registry database stores records in the mapping JSON document.",
+     )
+     file_hash: str | None = Field(
+         title="file_hash",
+         description="Hash of the contents of the file, computed by dsgrid.",
+         json_schema_extra={
+             "dsgrid_internal": True,
+         },
+         default=None,
+     )
+     records: list = Field(
+         title="records",
+         description="dimension mapping records in filename that get loaded at runtime",
+         json_schema_extra={
+             "dsgrid_internal": True,
+         },
+         default=[],
+     )
+
+     @field_validator("filename")
+     @classmethod
+     def check_filename(cls, filename):
+         """Validate record file"""
+         if filename is not None:
+             if filename:
+                 if not os.path.isfile(filename):
+                     msg = f"{filename} does not exist"
+                     raise ValueError(msg)
+                 if not filename.endswith(".csv"):
+                     msg = f"only CSV is supported: {filename}"
+                     raise ValueError(msg)
+         return filename
+
+     @field_validator("file_hash")
+     @classmethod
+     def compute_file_hash(cls, file_hash, info: ValidationInfo):
+         """Compute file hash."""
+         if "filename" not in info.data:
+             return file_hash
+
+         if not file_hash:
+             file_hash = compute_file_hash(info.data["filename"])
+         return file_hash
+
+     @field_validator("records")
+     @classmethod
+     def add_records(cls, records, info: ValidationInfo):
+         """Add records from the file."""
+         if "filename" not in info.data:
+             return records
+
+         if records:
+             if isinstance(records[0], dict):
+                 records = convert_record_dicts_to_classes(records, MappingTableRecordModel)
+             return records
+
+         with open(info.data["filename"], encoding="utf-8-sig") as f_in:
+             return convert_record_dicts_to_classes(csv.DictReader(f_in), MappingTableRecordModel)
+
+     @field_serializer("filename")
+     def serialize_cls(self, val, _):
+         return None
+
+     @classmethod
+     def from_pre_registered_model(
+         cls,
+         model: MappingTableByNameModel | DatasetBaseToProjectMappingTableModel,
+         from_dimension: DimensionReferenceModel,
+         to_dimension: DimensionReferenceModel,
+     ):
+         return MappingTableModel(
+             mapping_type=model.mapping_type,
+             archetype=model.archetype,
+             from_dimension=from_dimension,
+             to_dimension=to_dimension,
+             description=model.description,
+             file=model.filename,
+             from_fraction_tolerance=model.from_fraction_tolerance,
+             to_fraction_tolerance=model.to_fraction_tolerance,
+         )
+
+
+ class MappingTableConfig(ConfigWithRecordFileBase):
+     """Provides an interface to a MappingTableModel"""
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self._dataframe = None
+
+     @staticmethod
+     def config_filename():
+         return "dimension_mapping.json5"
+
+     @property
+     def config_id(self):
+         return self.model.mapping_id
+
+     @staticmethod
+     def model_class():
+         return MappingTableModel
+
+     def get_unique_from_ids(self):
+         """Return the unique from IDs in an association table's records.
+
+         Returns
+         -------
+         set
+             set of str
+
+         """
+         return {x.from_id for x in self.model.records}
+
+     def get_unique_to_ids(self):
+         """Return the unique to IDs in an association table's records.
+
+         Returns
+         -------
+         set
+             set of str
+
+         """
+         return {x.to_id for x in self.model.records}
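A small sketch of what a mapping record file and its parsed records look like. The CSV contents are hypothetical; the column names come directly from MappingTableRecordModel, and the import path follows the dsgrid/config/mapping_tables.py entry in the file list.

from dsgrid.config.mapping_tables import MappingTableRecordModel

# county_to_state.csv (hypothetical):
#   from_id,to_id,from_fraction
#   08001,CO,1.0
#   08059,CO,1.0
record = MappingTableRecordModel(from_id="08001", to_id="CO", from_fraction=1.0)

# An empty to_id is coerced to None by the check_to_id validator, which is how
# a record is mapped to nothing.
dropped = MappingTableRecordModel(from_id="99999", to_id="")
assert dropped.to_id is None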
dsgrid/config/noop_time_dimension_config.py
@@ -0,0 +1,42 @@
+ from datetime import timedelta
+
+ from .dimensions import NoOpTimeDimensionModel
+ from .time_dimension_base_config import TimeDimensionBaseConfig
+
+
+ class NoOpTimeDimensionConfig(TimeDimensionBaseConfig):
+     """Provides an interface to a NoOpTimeDimensionModel."""
+
+     @staticmethod
+     def model_class() -> NoOpTimeDimensionModel:
+         return NoOpTimeDimensionModel
+
+     def check_dataset_time_consistency(self, load_data_df, time_columns) -> None:
+         pass
+
+     def get_frequency(self) -> timedelta:
+         return timedelta(days=0)
+
+     def get_time_ranges(self) -> list:
+         return []
+
+     def get_start_times(self) -> list:
+         return []
+
+     def get_lengths(self) -> list:
+         return []
+
+     def get_load_data_time_columns(self) -> list:
+         return []
+
+     def get_time_zone(self) -> None:
+         return None
+
+     def get_tzinfo(self) -> None:
+         return None
+
+     def get_time_interval_type(self) -> None:
+         return None
+
+     def list_expected_dataset_timestamps(self) -> list:
+         return []