dsgrid_toolkit-0.3.3-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. build_backend.py +93 -0
  2. dsgrid/__init__.py +22 -0
  3. dsgrid/api/__init__.py +0 -0
  4. dsgrid/api/api_manager.py +179 -0
  5. dsgrid/api/app.py +419 -0
  6. dsgrid/api/models.py +60 -0
  7. dsgrid/api/response_models.py +116 -0
  8. dsgrid/apps/__init__.py +0 -0
  9. dsgrid/apps/project_viewer/app.py +216 -0
  10. dsgrid/apps/registration_gui.py +444 -0
  11. dsgrid/chronify.py +32 -0
  12. dsgrid/cli/__init__.py +0 -0
  13. dsgrid/cli/common.py +120 -0
  14. dsgrid/cli/config.py +176 -0
  15. dsgrid/cli/download.py +13 -0
  16. dsgrid/cli/dsgrid.py +157 -0
  17. dsgrid/cli/dsgrid_admin.py +92 -0
  18. dsgrid/cli/install_notebooks.py +62 -0
  19. dsgrid/cli/query.py +729 -0
  20. dsgrid/cli/registry.py +1862 -0
  21. dsgrid/cloud/__init__.py +0 -0
  22. dsgrid/cloud/cloud_storage_interface.py +140 -0
  23. dsgrid/cloud/factory.py +31 -0
  24. dsgrid/cloud/fake_storage_interface.py +37 -0
  25. dsgrid/cloud/s3_storage_interface.py +156 -0
  26. dsgrid/common.py +36 -0
  27. dsgrid/config/__init__.py +0 -0
  28. dsgrid/config/annual_time_dimension_config.py +194 -0
  29. dsgrid/config/common.py +142 -0
  30. dsgrid/config/config_base.py +148 -0
  31. dsgrid/config/dataset_config.py +907 -0
  32. dsgrid/config/dataset_schema_handler_factory.py +46 -0
  33. dsgrid/config/date_time_dimension_config.py +136 -0
  34. dsgrid/config/dimension_config.py +54 -0
  35. dsgrid/config/dimension_config_factory.py +65 -0
  36. dsgrid/config/dimension_mapping_base.py +350 -0
  37. dsgrid/config/dimension_mappings_config.py +48 -0
  38. dsgrid/config/dimensions.py +1025 -0
  39. dsgrid/config/dimensions_config.py +71 -0
  40. dsgrid/config/file_schema.py +190 -0
  41. dsgrid/config/index_time_dimension_config.py +80 -0
  42. dsgrid/config/input_dataset_requirements.py +31 -0
  43. dsgrid/config/mapping_tables.py +209 -0
  44. dsgrid/config/noop_time_dimension_config.py +42 -0
  45. dsgrid/config/project_config.py +1462 -0
  46. dsgrid/config/registration_models.py +188 -0
  47. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  48. dsgrid/config/simple_models.py +49 -0
  49. dsgrid/config/supplemental_dimension.py +29 -0
  50. dsgrid/config/time_dimension_base_config.py +192 -0
  51. dsgrid/data_models.py +155 -0
  52. dsgrid/dataset/__init__.py +0 -0
  53. dsgrid/dataset/dataset.py +123 -0
  54. dsgrid/dataset/dataset_expression_handler.py +86 -0
  55. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  56. dsgrid/dataset/dataset_schema_handler_base.py +945 -0
  57. dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
  58. dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
  59. dsgrid/dataset/growth_rates.py +162 -0
  60. dsgrid/dataset/models.py +51 -0
  61. dsgrid/dataset/table_format_handler_base.py +257 -0
  62. dsgrid/dataset/table_format_handler_factory.py +17 -0
  63. dsgrid/dataset/unpivoted_table.py +121 -0
  64. dsgrid/dimension/__init__.py +0 -0
  65. dsgrid/dimension/base_models.py +230 -0
  66. dsgrid/dimension/dimension_filters.py +308 -0
  67. dsgrid/dimension/standard.py +252 -0
  68. dsgrid/dimension/time.py +352 -0
  69. dsgrid/dimension/time_utils.py +103 -0
  70. dsgrid/dsgrid_rc.py +88 -0
  71. dsgrid/exceptions.py +105 -0
  72. dsgrid/filesystem/__init__.py +0 -0
  73. dsgrid/filesystem/cloud_filesystem.py +32 -0
  74. dsgrid/filesystem/factory.py +32 -0
  75. dsgrid/filesystem/filesystem_interface.py +136 -0
  76. dsgrid/filesystem/local_filesystem.py +74 -0
  77. dsgrid/filesystem/s3_filesystem.py +118 -0
  78. dsgrid/loggers.py +132 -0
  79. dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
  80. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
  81. dsgrid/notebooks/registration.ipynb +48 -0
  82. dsgrid/notebooks/start_notebook.sh +11 -0
  83. dsgrid/project.py +451 -0
  84. dsgrid/query/__init__.py +0 -0
  85. dsgrid/query/dataset_mapping_plan.py +142 -0
  86. dsgrid/query/derived_dataset.py +388 -0
  87. dsgrid/query/models.py +728 -0
  88. dsgrid/query/query_context.py +287 -0
  89. dsgrid/query/query_submitter.py +994 -0
  90. dsgrid/query/report_factory.py +19 -0
  91. dsgrid/query/report_peak_load.py +70 -0
  92. dsgrid/query/reports_base.py +20 -0
  93. dsgrid/registry/__init__.py +0 -0
  94. dsgrid/registry/bulk_register.py +165 -0
  95. dsgrid/registry/common.py +287 -0
  96. dsgrid/registry/config_update_checker_base.py +63 -0
  97. dsgrid/registry/data_store_factory.py +34 -0
  98. dsgrid/registry/data_store_interface.py +74 -0
  99. dsgrid/registry/dataset_config_generator.py +158 -0
  100. dsgrid/registry/dataset_registry_manager.py +950 -0
  101. dsgrid/registry/dataset_update_checker.py +16 -0
  102. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  103. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  104. dsgrid/registry/dimension_registry_manager.py +413 -0
  105. dsgrid/registry/dimension_update_checker.py +16 -0
  106. dsgrid/registry/duckdb_data_store.py +207 -0
  107. dsgrid/registry/filesystem_data_store.py +150 -0
  108. dsgrid/registry/filter_registry_manager.py +123 -0
  109. dsgrid/registry/project_config_generator.py +57 -0
  110. dsgrid/registry/project_registry_manager.py +1623 -0
  111. dsgrid/registry/project_update_checker.py +48 -0
  112. dsgrid/registry/registration_context.py +223 -0
  113. dsgrid/registry/registry_auto_updater.py +316 -0
  114. dsgrid/registry/registry_database.py +667 -0
  115. dsgrid/registry/registry_interface.py +446 -0
  116. dsgrid/registry/registry_manager.py +558 -0
  117. dsgrid/registry/registry_manager_base.py +367 -0
  118. dsgrid/registry/versioning.py +92 -0
  119. dsgrid/rust_ext/__init__.py +14 -0
  120. dsgrid/rust_ext/find_minimal_patterns.py +129 -0
  121. dsgrid/spark/__init__.py +0 -0
  122. dsgrid/spark/functions.py +589 -0
  123. dsgrid/spark/types.py +110 -0
  124. dsgrid/tests/__init__.py +0 -0
  125. dsgrid/tests/common.py +140 -0
  126. dsgrid/tests/make_us_data_registry.py +265 -0
  127. dsgrid/tests/register_derived_datasets.py +103 -0
  128. dsgrid/tests/utils.py +25 -0
  129. dsgrid/time/__init__.py +0 -0
  130. dsgrid/time/time_conversions.py +80 -0
  131. dsgrid/time/types.py +67 -0
  132. dsgrid/units/__init__.py +0 -0
  133. dsgrid/units/constants.py +113 -0
  134. dsgrid/units/convert.py +71 -0
  135. dsgrid/units/energy.py +145 -0
  136. dsgrid/units/power.py +87 -0
  137. dsgrid/utils/__init__.py +0 -0
  138. dsgrid/utils/dataset.py +830 -0
  139. dsgrid/utils/files.py +179 -0
  140. dsgrid/utils/filters.py +125 -0
  141. dsgrid/utils/id_remappings.py +100 -0
  142. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  143. dsgrid/utils/py_expression_eval/README.md +8 -0
  144. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  145. dsgrid/utils/py_expression_eval/tests.py +283 -0
  146. dsgrid/utils/run_command.py +70 -0
  147. dsgrid/utils/scratch_dir_context.py +65 -0
  148. dsgrid/utils/spark.py +918 -0
  149. dsgrid/utils/spark_partition.py +98 -0
  150. dsgrid/utils/timing.py +239 -0
  151. dsgrid/utils/utilities.py +221 -0
  152. dsgrid/utils/versioning.py +36 -0
  153. dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
  154. dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
  155. dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
  156. dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
  157. dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
dsgrid/config/registration_models.py
@@ -0,0 +1,188 @@
+ """Contains data models to control bulk registration of projects and datasets."""
+
+ from pathlib import Path
+ from typing import Any, Iterable
+
+ from pydantic import Field, ValidationInfo, field_validator, model_validator
+
+ from dsgrid.data_models import DSGBaseModel
+ from dsgrid.dimension.base_models import DimensionType
+ from dsgrid.utils.files import load_data
+
+
+ class ProjectRegistrationModel(DSGBaseModel):
+     """Defines a project to be registered."""
+
+     project_id: str = Field(description="Project ID")
+     config_file: Path = Field(description="Path to project.json5")
+     log_message: str | None = Field(
+         default=None,
+         description="Log message to use when registering the project. Defaults to an auto-generated message.",
+     )
+
+     @model_validator(mode="before")
+     @classmethod
+     def fix_paths(cls, data: dict[str, Any]) -> dict[str, Any]:
+         _fix_paths(data, ("config_file",))
+         return data
+
+     @field_validator("log_message")
+     def fix_log_message(cls, log_message: str | None, info: ValidationInfo) -> str | None:
+         if log_message is None and "project_id" in info.data:
+             log_message = f"Register project {info.data['project_id']}"
+         return log_message
+
+
+ class DatasetRegistrationModel(DSGBaseModel):
+     """Defines a dataset to be registered."""
+
+     dataset_id: str = Field(description="Dataset ID")
+     config_file: Path = Field(description="Path to dataset.json5")
+     replace_dimension_names_with_ids: bool = Field(
+         description="Replace the dimension entries with IDs of dimensions in the database "
+         "with matching names. Typically only useful for tests.",
+         default=False,
+     )
+     log_message: str | None = Field(
+         default=None,
+         description="Log message to use when registering the dataset. Defaults to an auto-generated message.",
+     )
+
+     @field_validator("log_message")
+     def fix_log_message(cls, log_message: str | None, info: ValidationInfo) -> str | None:
+         if log_message is None and "dataset_id" in info.data:
+             log_message = f"Register dataset {info.data['dataset_id']}"
+         return log_message
+
+     @model_validator(mode="before")
+     @classmethod
+     def fix_paths(cls, data: dict[str, Any]) -> dict[str, Any]:
+         _fix_paths(data, ("config_file",))
+         return data
+
+
+ class DatasetSubmissionModel(DSGBaseModel):
+     """Defines how a dataset should be submitted to a project."""
+
+     dataset_id: str
+     project_id: str
+     dimension_mapping_file: Path | None = Field(
+         description="Path to file containing mappings of dataset-to-project dimensions",
+         default=None,
+     )
+     dimension_mapping_references_file: Path | None = Field(
+         description="Path to file containing references to mappings of dataset-to-project dimensions",
+         default=None,
+     )
+     replace_dimension_mapping_names_with_ids: bool = Field(
+         description="Replace the dimension mapping entries with IDs of dimension mappings "
+         "in the database with matching names. Typically only useful for tests.",
+         default=False,
+     )
+     autogen_reverse_supplemental_mappings: set[DimensionType] = Field(
+         description="Dimensions on which to attempt to create reverse mappings from supplemental dimensions.",
+         default=set(),
+     )
+     log_message: str | None = Field(
+         default=None,
+         description="Log message to use when submitting the dataset. Defaults to an auto-generated message.",
+     )
+
+     @model_validator(mode="before")
+     @classmethod
+     def fix_autogen_reverse_supplemental_mappings(cls, data: dict[str, Any]) -> dict[str, Any]:
+         if "autogen_reverse_supplemental_mappings" in data:
+             data["autogen_reverse_supplemental_mappings"] = {
+                 DimensionType(x) for x in data["autogen_reverse_supplemental_mappings"]
+             }
+         return data
+
+     @field_validator("log_message")
+     def fix_log_message(cls, log_message: str | None, info: ValidationInfo) -> str | None:
+         if log_message is None and "dataset_id" in info.data:
+             log_message = (
+                 f"Submit dataset {info.data['dataset_id']} to project {info.data['project_id']}"
+             )
+         return log_message
+
+
+ class SubmittedDatasetsJournal(DSGBaseModel):
+     """Defines a dataset that was successfully submitted to a project."""
+
+     dataset_id: str
+     project_id: str
+
+
+ class RegistrationJournal(DSGBaseModel):
+     """Defines projects and datasets that were successfully registered."""
+
+     registered_projects: list[str] = []
+     registered_datasets: list[str] = []
+     submitted_datasets: list[SubmittedDatasetsJournal] = []
+
+     def add_dataset(self, dataset_id: str) -> None:
+         assert dataset_id not in self.registered_datasets, dataset_id
+         self.registered_datasets.append(dataset_id)
+
+     def add_project(self, project_id: str) -> None:
+         assert project_id not in self.registered_projects, project_id
+         self.registered_projects.append(project_id)
+
+     def add_submitted_dataset(self, dataset_id: str, project_id: str) -> None:
+         entry = SubmittedDatasetsJournal(dataset_id=dataset_id, project_id=project_id)
+         assert entry not in self.submitted_datasets, entry
+         self.submitted_datasets.append(entry)
+
+     def has_entries(self) -> bool:
+         return (
+             bool(self.registered_projects)
+             or bool(self.registered_datasets)
+             or bool(self.submitted_datasets)
+         )
+
+
+ class RegistrationModel(DSGBaseModel):
+     """Defines a list of projects and datasets to be registered."""
+
+     projects: list[ProjectRegistrationModel] = Field(description="List of projects to register.")
+     datasets: list[DatasetRegistrationModel] = Field(description="List of datasets to register.")
+     dataset_submissions: list[DatasetSubmissionModel] = Field(
+         description="List of datasets to be submitted to projects."
+     )
+
+     def filter_by_journal(self, journal: RegistrationJournal) -> "RegistrationModel":
+         """Return a new instance of RegistrationModel by filtering an existing instance with
+         a journal.
+         """
+         projects = list(
+             filter(lambda x: x.project_id not in journal.registered_projects, self.projects)
+         )
+         datasets = list(
+             filter(lambda x: x.dataset_id not in journal.registered_datasets, self.datasets)
+         )
+         dataset_submissions = list(
+             filter(
+                 lambda x: SubmittedDatasetsJournal(
+                     dataset_id=x.dataset_id, project_id=x.project_id
+                 )
+                 not in journal.submitted_datasets,
+                 self.dataset_submissions,
+             )
+         )
+         return RegistrationModel(
+             projects=projects,
+             datasets=datasets,
+             dataset_submissions=dataset_submissions,
+         )
+
+
+ def _fix_paths(data: dict[str, Any], fields: Iterable[str]) -> None:
+     for field in fields:
+         val = data.get(field)
+         if isinstance(val, str):
+             data[field] = Path(val)
+
+
+ def create_registration(input_file: Path):
+     """Create registration inputs."""
+     return RegistrationModel(**load_data(input_file))
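
For orientation, here is a minimal sketch of how these models fit together in a resumable bulk-registration flow. The json5 file name and the IDs are hypothetical; create_registration, RegistrationJournal, and filter_by_journal are the definitions above.

# Sketch only: "registration.json5" and the IDs below are hypothetical.
from pathlib import Path

from dsgrid.config.registration_models import RegistrationJournal, create_registration

# The json5 file supplies the projects, datasets, and dataset_submissions
# fields of RegistrationModel.
registration = create_registration(Path("registration.json5"))

# Record each success in a journal as the bulk run progresses.
journal = RegistrationJournal()
journal.add_project("my_project")
journal.add_dataset("my_dataset")
journal.add_submitted_dataset("my_dataset", "my_project")

# On a retry, drop everything the journal says already succeeded.
if journal.has_entries():
    remaining = registration.filter_by_journal(journal)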
dsgrid/config/representative_period_time_dimension_config.py
@@ -0,0 +1,194 @@
+ import abc
+ import logging
+ from datetime import timedelta
+ from typing import Type, Any, Union
+
+ import chronify
+
+ from dsgrid.dimension.time import RepresentativePeriodFormat, TimeIntervalType
+ from dsgrid.time.types import (
+     OneWeekPerMonthByHourType,
+     OneWeekdayDayAndOneWeekendDayPerMonthByHourType,
+ )
+ from .dimensions import RepresentativePeriodTimeDimensionModel
+ from .time_dimension_base_config import TimeDimensionBaseConfig
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class RepresentativePeriodTimeDimensionConfig(TimeDimensionBaseConfig):
+     """Provides an interface to a RepresentativePeriodTimeDimensionModel."""
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         # We expect the list of required formats to grow.
+         # It's possible that one function (or set of functions) can handle all permutations
+         # of parameters. We can make that determination once we have requirements for more
+         # formats.
+         match self.model.format:
+             case RepresentativePeriodFormat.ONE_WEEK_PER_MONTH_BY_HOUR:
+                 self._format_handler = OneWeekPerMonthByHourHandler()
+             case RepresentativePeriodFormat.ONE_WEEKDAY_DAY_AND_ONE_WEEKEND_DAY_PER_MONTH_BY_HOUR:
+                 self._format_handler = OneWeekdayDayAndWeekendDayPerMonthByHourHandler()
+             case _:
+                 msg = self.model.format.value
+                 raise NotImplementedError(msg)
+
+     def supports_chronify(self) -> bool:
+         return True
+
+     def to_chronify(
+         self,
+     ) -> Union[chronify.RepresentativePeriodTimeTZ, chronify.RepresentativePeriodTimeNTZ]:
+         if len(self._model.ranges) != 1:
+             msg = (
+                 "Mapping RepresentativePeriodTime with chronify is only supported with one range: "
+                 f"{self._model.ranges}"
+             )
+             raise NotImplementedError(msg)
+         range_ = self._model.ranges[0]
+         if range_.start != 1 or range_.end != 12:
+             msg = (
+                 "Mapping RepresentativePeriodTime with chronify is only supported with a full year: "
+                 f"{range_}"
+             )
+             raise NotImplementedError(msg)
+         # RepresentativePeriodTimeDimensionModel does not map to NTZ at the moment
+         if isinstance(self._format_handler, OneWeekPerMonthByHourHandler) or isinstance(
+             self._format_handler, OneWeekdayDayAndWeekendDayPerMonthByHourHandler
+         ):
+             return chronify.RepresentativePeriodTimeTZ(
+                 measurement_type=self._model.measurement_type,
+                 interval_type=self._model.time_interval_type,
+                 time_format=chronify.RepresentativePeriodFormat(self._model.format.value),
+                 time_zone_column="time_zone",
+             )
+
+         msg = f"Cannot chronify time_config for {self._format_handler}"
+         raise NotImplementedError(msg)
+
+     @staticmethod
+     def model_class() -> Type[RepresentativePeriodTimeDimensionModel]:
+         return RepresentativePeriodTimeDimensionModel
+
+     def get_frequency(self) -> timedelta:
+         return self._format_handler.get_frequency()
+
+     def get_start_times(self) -> list[Any]:
+         return self._format_handler.get_start_times(self.model.ranges)
+
+     def get_lengths(self) -> list[int]:
+         return self._format_handler.get_lengths(self.model.ranges)
+
+     def get_load_data_time_columns(self) -> list[str]:
+         return self._format_handler.get_load_data_time_columns()
+
+     def get_time_zone(self) -> None:
+         return None
+
+     def get_tzinfo(self) -> None:
+         return None
+
+     def get_time_interval_type(self) -> TimeIntervalType:
+         return self.model.time_interval_type
+
+
+ class RepresentativeTimeFormatHandlerBase(abc.ABC):
+     """Provides implementations for different representative time formats."""
+
+     @staticmethod
+     @abc.abstractmethod
+     def get_representative_time_type() -> Type:
+         """Return the time type representing the data."""
+
+     @abc.abstractmethod
+     def get_frequency(self):
+         """Return the frequency.
+
+         Returns
+         -------
+         timedelta
+
+         """
+
+     @staticmethod
+     @abc.abstractmethod
+     def get_load_data_time_columns():
+         """Return the required timestamp columns in the load data table.
+
+         Returns
+         -------
+         list
+
+         """
+
+
+ class OneWeekPerMonthByHourHandler(RepresentativeTimeFormatHandlerBase):
+     """Handler for format with hourly data that includes one week per month."""
+
+     @staticmethod
+     def get_representative_time_type() -> Type[OneWeekPerMonthByHourType]:
+         return OneWeekPerMonthByHourType
+
+     def get_frequency(self) -> timedelta:
+         return timedelta(hours=1)
+
+     @staticmethod
+     def get_start_times(ranges) -> list[OneWeekPerMonthByHourType]:
+         """Get the starting combination of (month, day_of_week, hour) based on sorted order."""
+         start_times = []
+         for model in ranges:
+             start_times.append(OneWeekPerMonthByHourType(month=model.start, day_of_week=0, hour=0))
+         return start_times
+
+     @staticmethod
+     def get_lengths(ranges) -> list[int]:
+         """Get the number of unique combinations of (month, day_of_week, hour)."""
+         lengths = []
+         for model in ranges:
+             n_months = model.end - model.start + 1
+             lengths.append(n_months * 7 * 24)
+         return lengths
+
+     @staticmethod
+     def get_load_data_time_columns() -> list[str]:
+         return list(OneWeekPerMonthByHourType._fields)
+
+
+ class OneWeekdayDayAndWeekendDayPerMonthByHourHandler(RepresentativeTimeFormatHandlerBase):
+     """Handler for format with hourly data that includes one weekday day and one weekend day
+     per month.
+     """
+
+     @staticmethod
+     def get_representative_time_type() -> Type[OneWeekdayDayAndOneWeekendDayPerMonthByHourType]:
+         return OneWeekdayDayAndOneWeekendDayPerMonthByHourType
+
+     def get_frequency(self) -> timedelta:
+         return timedelta(hours=1)
+
+     @staticmethod
+     def get_start_times(ranges) -> list[OneWeekdayDayAndOneWeekendDayPerMonthByHourType]:
+         """Get the starting combination of (month, hour, is_weekday) based on sorted order."""
+         start_times = []
+         for model in ranges:
+             start_times.append(
+                 OneWeekdayDayAndOneWeekendDayPerMonthByHourType(
+                     month=model.start, hour=0, is_weekday=False
+                 )
+             )
+         return start_times
+
+     @staticmethod
+     def get_lengths(ranges) -> list[int]:
+         """Get the number of unique combinations of (month, hour, is_weekday)."""
+         lengths = []
+         for model in ranges:
+             n_months = model.end - model.start + 1
+             lengths.append(n_months * 24 * 2)
+         return lengths
+
+     @staticmethod
+     def get_load_data_time_columns() -> list[str]:
+         return list(OneWeekdayDayAndOneWeekendDayPerMonthByHourType._fields)
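
The length arithmetic in the two handlers can be checked standalone; since get_lengths only reads start and end from each range, a hypothetical SimpleNamespace stands in for dsgrid's range model in this sketch.

# Sketch only: SimpleNamespace stands in for dsgrid's range model.
from types import SimpleNamespace

from dsgrid.config.representative_period_time_dimension_config import (
    OneWeekdayDayAndWeekendDayPerMonthByHourHandler,
    OneWeekPerMonthByHourHandler,
)

full_year = SimpleNamespace(start=1, end=12)  # months 1 through 12

# One week per month by hour: 12 months * 7 days * 24 hours = 2016 steps.
assert OneWeekPerMonthByHourHandler.get_lengths([full_year]) == [2016]

# One weekday day and one weekend day per month by hour:
# 12 months * 24 hours * 2 day types = 576 steps.
assert OneWeekdayDayAndWeekendDayPerMonthByHourHandler.get_lengths([full_year]) == [576]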
dsgrid/config/simple_models.py
@@ -0,0 +1,49 @@
+ """Defines simplified data models for testing and filtering."""
+
+ from pydantic import field_validator, model_validator, Field
+
+ from dsgrid.data_models import DSGBaseModel
+ from dsgrid.dimension.base_models import DimensionType
+
+
+ class DimensionSimpleModel(DSGBaseModel):
+     dimension_type: DimensionType
+     dimension_name: str | None = None
+     record_ids: list[str]
+
+
+ class DimensionsSimpleModel(DSGBaseModel):
+     base_dimensions: list[DimensionSimpleModel]
+     supplemental_dimensions: list[DimensionSimpleModel] = Field(default=[])
+
+     @field_validator("base_dimensions")
+     @classmethod
+     def check_base_dimensions(cls, base_dimensions):
+         dimension_types = {x.dimension_type for x in base_dimensions}
+         if len(dimension_types) != len(base_dimensions):
+             msg = "base_dimensions cannot contain duplicate dimension types"
+             raise ValueError(msg)
+         return base_dimensions
+
+     @model_validator(mode="after")
+     def check_supplemental_dimensions(self) -> "DimensionsSimpleModel":
+         for dim in self.supplemental_dimensions:
+             if dim.dimension_name is None:
+                 msg = f"supplemental dimensions must define dimension_name: {dim}"
+                 raise ValueError(msg)
+         return self
+
+
+ class DatasetSimpleModel(DSGBaseModel):
+     dataset_id: str
+     dimensions: list[DimensionSimpleModel]
+
+
+ class ProjectSimpleModel(DSGBaseModel):
+     project_id: str
+     dimensions: DimensionsSimpleModel
+
+
+ class RegistrySimpleModel(DSGBaseModel):
+     projects: list[ProjectSimpleModel]
+     datasets: list[DatasetSimpleModel]
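
A short sketch of how the validators above behave; the GEOGRAPHY member and the record IDs are assumptions for illustration.

# Sketch only: DimensionType.GEOGRAPHY and the record IDs are assumed.
from dsgrid.config.simple_models import DimensionSimpleModel, DimensionsSimpleModel
from dsgrid.dimension.base_models import DimensionType

model = DimensionsSimpleModel(
    base_dimensions=[
        DimensionSimpleModel(dimension_type=DimensionType.GEOGRAPHY, record_ids=["06037"]),
    ],
    supplemental_dimensions=[
        DimensionSimpleModel(
            dimension_type=DimensionType.GEOGRAPHY,
            dimension_name="state",  # required: check_supplemental_dimensions rejects None
            record_ids=["CA"],
        ),
    ],
)

# Duplicate dimension types in base_dimensions, or a supplemental dimension
# without dimension_name, raise a ValueError via the validators above.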
dsgrid/config/supplemental_dimension.py
@@ -0,0 +1,29 @@
+ """Defines a supplemental dimension."""
+
+ from typing import Annotated
+ from pydantic import Field
+
+
+ from dsgrid.data_models import DSGBaseModel
+ from .dimensions import DimensionModel
+ from .mapping_tables import MappingTableByNameModel
+
+
+ class SupplementalDimensionModel(DimensionModel):
+     """Defines a supplemental dimension."""
+
+     mapping: MappingTableByNameModel = Field(
+         description="Defines how the supplemental dimension will be mapped to the project's base "
+         "dimension.",
+         title="mapping",
+     )
+
+
+ class SupplementalDimensionsListModel(DSGBaseModel):
+     """Defines a list of supplemental dimensions."""
+
+     supplemental_dimensions: Annotated[
+         list[SupplementalDimensionModel], Field(min_length=1)
+     ] = Field(
+         description="List of supplemental dimensions and mappings to be registered",
+     )
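
The min_length=1 constraint means an empty list is rejected at validation time; a brief sketch:

# Sketch only: demonstrates the min_length=1 constraint rejecting an empty list.
from pydantic import ValidationError

from dsgrid.config.supplemental_dimension import SupplementalDimensionsListModel

try:
    SupplementalDimensionsListModel(supplemental_dimensions=[])
except ValidationError as exc:
    print(exc.errors()[0]["type"])  # "too_short" in pydantic v2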
dsgrid/config/time_dimension_base_config.py
@@ -0,0 +1,192 @@
+ import abc
+ import logging
+ from datetime import tzinfo
+ from typing import Any
+ import pandas as pd
+
+ import chronify
+
+ from .dimension_config import DimensionBaseConfigWithoutFiles
+ from dsgrid.dimension.time import (
+     TimeIntervalType,
+     TimeBasedDataAdjustmentModel,
+ )
+ from dsgrid.dimension.time_utils import (
+     build_time_ranges,
+ )
+ from dsgrid.config.dimensions import TimeRangeModel
+
+ from dsgrid.spark.types import (
+     DataFrame,
+ )
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class TimeDimensionBaseConfig(DimensionBaseConfigWithoutFiles, abc.ABC):
+     """Base class for all time dimension configs."""
+
+     def supports_chronify(self) -> bool:
+         """Return True if the config can be converted to chronify."""
+         return False
+
+     # @abc.abstractmethod
+     def to_chronify(self) -> chronify.TimeBaseModel:
+         """Return the chronify version of the time model."""
+         # This is likely temporary until we can use chronify models directly.
+         msg = f"{type(self)}.to_chronify"
+         raise NotImplementedError(msg)
+
+     def check_dataset_time_consistency(self, load_data_df, time_columns: list[str]) -> None:
+         """Check consistency of the load data with the time dimension.
+
+         Parameters
+         ----------
+         load_data_df : pyspark.sql.DataFrame
+         time_columns : list[str]
+
+         Raises
+         ------
+         DSGInvalidDataset
+             Raised if the dataset is inconsistent with the time dimension.
+         """
+         msg = f"{type(self)}.check_dataset_time_consistency is not implemented"
+         raise NotImplementedError(msg)
+
+     def build_time_dataframe(self) -> DataFrame:
+         """Build time dimension as specified in config in a spark dataframe.
+
+         Returns
+         -------
+         pyspark.sql.DataFrame
+         """
+         msg = f"{self.__class__.__name__}.build_time_dataframe is not implemented"
+         raise NotImplementedError(msg)
+
+     @abc.abstractmethod
+     def get_load_data_time_columns(self) -> list[str]:
+         """Return the required timestamp columns in the load data table.
+
+         Returns
+         -------
+         list
+         """
+
+     def list_load_data_columns_for_query_name(self) -> list[str]:
+         """Return the time columns expected in the load data table for this dimension's query name.
+
+         Returns
+         -------
+         list[str]
+         """
+         # This may need to be re-implemented by child classes.
+         return [self.model.name]
+
+     def map_timestamp_load_data_columns_for_query_name(self, df) -> DataFrame:
+         """Map the timestamp columns in the load data table to those specified by the query name.
+
+         Parameters
+         ----------
+         df : pyspark.sql.DataFrame
+
+         Returns
+         -------
+         pyspark.sql.DataFrame
+         """
+         time_cols = self.get_load_data_time_columns()
+         if len(time_cols) > 1:
+             msg = (
+                 "Handling of multiple time columns needs to be implemented in the child class: "
+                 f"{type(self)}: {time_cols=}"
+             )
+             raise NotImplementedError(msg)
+
+         time_col = time_cols[0]
+         if time_col not in df.columns:
+             return df
+         return df.withColumnRenamed(time_col, self.model.name)
+
+     def get_time_ranges(self) -> list[Any]:
+         """Return time ranges with time_zone applied.
+
+         Returns
+         -------
+         list
+             list of DatetimeRange
+         """
+         msg = f"{type(self)}.get_time_ranges is not implemented"
+         raise NotImplementedError(msg)
+
+     @abc.abstractmethod
+     def get_start_times(self) -> list[Any]:
+         """Return the list of starting timestamps (with tzinfo) for this dimension.
+         One per time range.
+
+         Returns
+         -------
+         list[Any]
+         """
+
+     @abc.abstractmethod
+     def get_lengths(self) -> list[int]:
+         """Return the list of time range lengths (number of time steps) for this dimension.
+         One per time range.
+
+         Returns
+         -------
+         list[int]
+         """
+
+     @abc.abstractmethod
+     def get_time_zone(self) -> str | None:
+         """Return the time zone for this dimension, if any."""
+
+     def get_time_zones(self) -> list[str]:
+         """Return a list of time zones for this dimension."""
+         if self.get_time_zone():
+             return [self.get_time_zone()]
+         return []
+
+     @abc.abstractmethod
+     def get_tzinfo(self) -> tzinfo | None:
+         """Return a tzinfo instance for this dimension.
+
+         Returns
+         -------
+         tzinfo | None
+         """
+
+     @abc.abstractmethod
+     def get_time_interval_type(self) -> TimeIntervalType:
+         """Return the time interval type for this dimension.
+
+         Returns
+         -------
+         TimeIntervalType
+         """
+
+     def list_expected_dataset_timestamps(
+         self,
+         time_based_data_adjustment: TimeBasedDataAdjustmentModel | None = None,
+     ) -> list[tuple]:
+         """Return a list of the timestamps expected in the load_data table.
+
+         Parameters
+         ----------
+         time_based_data_adjustment : TimeBasedDataAdjustmentModel | None
+
+         Returns
+         -------
+         list
+             List of tuples of columns representing time in the load_data table.
+         """
+         msg = f"{type(self)}.list_expected_dataset_timestamps is not implemented"
+         raise NotImplementedError(msg)
+
+     def _build_time_ranges(
+         self,
+         time_ranges: list[TimeRangeModel],
+         tz: str | None = None,
+     ) -> list[tuple[pd.Timestamp, pd.Timestamp, pd.Timedelta]]:
+         return build_time_ranges(time_ranges, tz=tz)
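
To make the contract concrete, here is a hypothetical minimal subclass that fills in only the abstract methods declared above; it assumes TimeIntervalType has a PERIOD_BEGINNING member and ignores any abstract members inherited from DimensionBaseConfigWithoutFiles.

# Sketch only: a hypothetical subclass satisfying this module's abstract methods.
from datetime import tzinfo
from typing import Any

from dsgrid.config.time_dimension_base_config import TimeDimensionBaseConfig
from dsgrid.dimension.time import TimeIntervalType


class MinimalTimeConfig(TimeDimensionBaseConfig):
    """Smallest surface that satisfies the abstract interface above."""

    def get_load_data_time_columns(self) -> list[str]:
        return ["timestamp"]

    def get_start_times(self) -> list[Any]:
        return []  # one entry per time range

    def get_lengths(self) -> list[int]:
        return []  # one entry per time range

    def get_time_zone(self) -> str | None:
        return None

    def get_tzinfo(self) -> tzinfo | None:
        return None

    def get_time_interval_type(self) -> TimeIntervalType:
        return TimeIntervalType.PERIOD_BEGINNING  # assumed member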