dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backend.py +93 -0
- dsgrid/__init__.py +22 -0
- dsgrid/api/__init__.py +0 -0
- dsgrid/api/api_manager.py +179 -0
- dsgrid/api/app.py +419 -0
- dsgrid/api/models.py +60 -0
- dsgrid/api/response_models.py +116 -0
- dsgrid/apps/__init__.py +0 -0
- dsgrid/apps/project_viewer/app.py +216 -0
- dsgrid/apps/registration_gui.py +444 -0
- dsgrid/chronify.py +32 -0
- dsgrid/cli/__init__.py +0 -0
- dsgrid/cli/common.py +120 -0
- dsgrid/cli/config.py +176 -0
- dsgrid/cli/download.py +13 -0
- dsgrid/cli/dsgrid.py +157 -0
- dsgrid/cli/dsgrid_admin.py +92 -0
- dsgrid/cli/install_notebooks.py +62 -0
- dsgrid/cli/query.py +729 -0
- dsgrid/cli/registry.py +1862 -0
- dsgrid/cloud/__init__.py +0 -0
- dsgrid/cloud/cloud_storage_interface.py +140 -0
- dsgrid/cloud/factory.py +31 -0
- dsgrid/cloud/fake_storage_interface.py +37 -0
- dsgrid/cloud/s3_storage_interface.py +156 -0
- dsgrid/common.py +36 -0
- dsgrid/config/__init__.py +0 -0
- dsgrid/config/annual_time_dimension_config.py +194 -0
- dsgrid/config/common.py +142 -0
- dsgrid/config/config_base.py +148 -0
- dsgrid/config/dataset_config.py +907 -0
- dsgrid/config/dataset_schema_handler_factory.py +46 -0
- dsgrid/config/date_time_dimension_config.py +136 -0
- dsgrid/config/dimension_config.py +54 -0
- dsgrid/config/dimension_config_factory.py +65 -0
- dsgrid/config/dimension_mapping_base.py +350 -0
- dsgrid/config/dimension_mappings_config.py +48 -0
- dsgrid/config/dimensions.py +1025 -0
- dsgrid/config/dimensions_config.py +71 -0
- dsgrid/config/file_schema.py +190 -0
- dsgrid/config/index_time_dimension_config.py +80 -0
- dsgrid/config/input_dataset_requirements.py +31 -0
- dsgrid/config/mapping_tables.py +209 -0
- dsgrid/config/noop_time_dimension_config.py +42 -0
- dsgrid/config/project_config.py +1462 -0
- dsgrid/config/registration_models.py +188 -0
- dsgrid/config/representative_period_time_dimension_config.py +194 -0
- dsgrid/config/simple_models.py +49 -0
- dsgrid/config/supplemental_dimension.py +29 -0
- dsgrid/config/time_dimension_base_config.py +192 -0
- dsgrid/data_models.py +155 -0
- dsgrid/dataset/__init__.py +0 -0
- dsgrid/dataset/dataset.py +123 -0
- dsgrid/dataset/dataset_expression_handler.py +86 -0
- dsgrid/dataset/dataset_mapping_manager.py +121 -0
- dsgrid/dataset/dataset_schema_handler_base.py +945 -0
- dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
- dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
- dsgrid/dataset/growth_rates.py +162 -0
- dsgrid/dataset/models.py +51 -0
- dsgrid/dataset/table_format_handler_base.py +257 -0
- dsgrid/dataset/table_format_handler_factory.py +17 -0
- dsgrid/dataset/unpivoted_table.py +121 -0
- dsgrid/dimension/__init__.py +0 -0
- dsgrid/dimension/base_models.py +230 -0
- dsgrid/dimension/dimension_filters.py +308 -0
- dsgrid/dimension/standard.py +252 -0
- dsgrid/dimension/time.py +352 -0
- dsgrid/dimension/time_utils.py +103 -0
- dsgrid/dsgrid_rc.py +88 -0
- dsgrid/exceptions.py +105 -0
- dsgrid/filesystem/__init__.py +0 -0
- dsgrid/filesystem/cloud_filesystem.py +32 -0
- dsgrid/filesystem/factory.py +32 -0
- dsgrid/filesystem/filesystem_interface.py +136 -0
- dsgrid/filesystem/local_filesystem.py +74 -0
- dsgrid/filesystem/s3_filesystem.py +118 -0
- dsgrid/loggers.py +132 -0
- dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
- dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
- dsgrid/notebooks/registration.ipynb +48 -0
- dsgrid/notebooks/start_notebook.sh +11 -0
- dsgrid/project.py +451 -0
- dsgrid/query/__init__.py +0 -0
- dsgrid/query/dataset_mapping_plan.py +142 -0
- dsgrid/query/derived_dataset.py +388 -0
- dsgrid/query/models.py +728 -0
- dsgrid/query/query_context.py +287 -0
- dsgrid/query/query_submitter.py +994 -0
- dsgrid/query/report_factory.py +19 -0
- dsgrid/query/report_peak_load.py +70 -0
- dsgrid/query/reports_base.py +20 -0
- dsgrid/registry/__init__.py +0 -0
- dsgrid/registry/bulk_register.py +165 -0
- dsgrid/registry/common.py +287 -0
- dsgrid/registry/config_update_checker_base.py +63 -0
- dsgrid/registry/data_store_factory.py +34 -0
- dsgrid/registry/data_store_interface.py +74 -0
- dsgrid/registry/dataset_config_generator.py +158 -0
- dsgrid/registry/dataset_registry_manager.py +950 -0
- dsgrid/registry/dataset_update_checker.py +16 -0
- dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
- dsgrid/registry/dimension_mapping_update_checker.py +16 -0
- dsgrid/registry/dimension_registry_manager.py +413 -0
- dsgrid/registry/dimension_update_checker.py +16 -0
- dsgrid/registry/duckdb_data_store.py +207 -0
- dsgrid/registry/filesystem_data_store.py +150 -0
- dsgrid/registry/filter_registry_manager.py +123 -0
- dsgrid/registry/project_config_generator.py +57 -0
- dsgrid/registry/project_registry_manager.py +1623 -0
- dsgrid/registry/project_update_checker.py +48 -0
- dsgrid/registry/registration_context.py +223 -0
- dsgrid/registry/registry_auto_updater.py +316 -0
- dsgrid/registry/registry_database.py +667 -0
- dsgrid/registry/registry_interface.py +446 -0
- dsgrid/registry/registry_manager.py +558 -0
- dsgrid/registry/registry_manager_base.py +367 -0
- dsgrid/registry/versioning.py +92 -0
- dsgrid/rust_ext/__init__.py +14 -0
- dsgrid/rust_ext/find_minimal_patterns.py +129 -0
- dsgrid/spark/__init__.py +0 -0
- dsgrid/spark/functions.py +589 -0
- dsgrid/spark/types.py +110 -0
- dsgrid/tests/__init__.py +0 -0
- dsgrid/tests/common.py +140 -0
- dsgrid/tests/make_us_data_registry.py +265 -0
- dsgrid/tests/register_derived_datasets.py +103 -0
- dsgrid/tests/utils.py +25 -0
- dsgrid/time/__init__.py +0 -0
- dsgrid/time/time_conversions.py +80 -0
- dsgrid/time/types.py +67 -0
- dsgrid/units/__init__.py +0 -0
- dsgrid/units/constants.py +113 -0
- dsgrid/units/convert.py +71 -0
- dsgrid/units/energy.py +145 -0
- dsgrid/units/power.py +87 -0
- dsgrid/utils/__init__.py +0 -0
- dsgrid/utils/dataset.py +830 -0
- dsgrid/utils/files.py +179 -0
- dsgrid/utils/filters.py +125 -0
- dsgrid/utils/id_remappings.py +100 -0
- dsgrid/utils/py_expression_eval/LICENSE +19 -0
- dsgrid/utils/py_expression_eval/README.md +8 -0
- dsgrid/utils/py_expression_eval/__init__.py +847 -0
- dsgrid/utils/py_expression_eval/tests.py +283 -0
- dsgrid/utils/run_command.py +70 -0
- dsgrid/utils/scratch_dir_context.py +65 -0
- dsgrid/utils/spark.py +918 -0
- dsgrid/utils/spark_partition.py +98 -0
- dsgrid/utils/timing.py +239 -0
- dsgrid/utils/utilities.py +221 -0
- dsgrid/utils/versioning.py +36 -0
- dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
- dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
- dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
- dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
- dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
dsgrid/dimension/time.py
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
"""Dimensions related to time"""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime, timedelta
|
|
4
|
+
from zoneinfo import ZoneInfo
|
|
5
|
+
import logging
|
|
6
|
+
from pydantic import Field
|
|
7
|
+
from enum import Enum
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
from dsgrid.data_models import DSGEnum, EnumValue, DSGBaseModel
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class TimeDimensionType(DSGEnum):
    """Defines the supported time formats in the load data.

    Each value selects a different time-dimension config handler elsewhere in
    the package; ``noop`` means the dataset carries no time dimension.
    """

    DATETIME = "datetime"
    ANNUAL = "annual"
    REPRESENTATIVE_PERIOD = "representative_period"
    # NOTE(review): presumably timestamps whose time zone is supplied
    # externally rather than embedded in the data -- confirm against the
    # datetime dimension config handlers.
    DATETIME_EXTERNAL_TZ = "datetime_external_tz"
    INDEX = "index"  # time encoded as integer indexes (see IndexTimeRange)
    NOOP = "noop"  # no time dimension
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class TimeZoneFormat(str, Enum):
    """Defines the time format of the datetime config model.

    Subclassing ``str`` makes members compare equal to (and serialize as)
    their plain string values.
    """

    # All timestamps refer to the same absolute instants.
    ALIGNED_IN_ABSOLUTE_TIME = "aligned_in_absolute_time"
    # All timestamps share the same wall-clock readings.
    ALIGNED_IN_CLOCK_TIME = "aligned_in_clock_time"
    # Local times carried as plain strings.
    LOCAL_AS_STRINGS = "local_as_strings"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class RepresentativePeriodFormat(DSGEnum):
    """Defines the supported formats for representative period data.

    Each member names the set of load_data columns that encode time for that
    format.
    """

    # All instances of this Enum must declare frequency.
    # This Enum may be replaced by a generic implementation in order to support a large
    # number of permutations (seasons, weekend day vs week day, sub-hour time, etc).

    ONE_WEEK_PER_MONTH_BY_HOUR = EnumValue(
        value="one_week_per_month_by_hour",
        frequency=timedelta(hours=1),
        description="load_data columns use 'month', 'day_of_week', 'hour' to specify time",
    )
    ONE_WEEKDAY_DAY_AND_ONE_WEEKEND_DAY_PER_MONTH_BY_HOUR = EnumValue(
        value="one_weekday_day_and_one_weekend_day_per_month_by_hour",
        frequency=timedelta(hours=1),
        description="load_data columns use 'month', 'hour', 'is_weekday' to specify time",
    )
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class LeapDayAdjustmentType(DSGEnum):
    """Leap day adjustment enum types.

    Controls which day, if any, is dropped from a leap year so that a leap
    year contains the same number of timestamps as a non-leap year.
    Adjustments apply to leap years only (see TimeBasedDataAdjustmentModel).
    """

    DROP_DEC31 = EnumValue(
        value="drop_dec31",
        description="To adjust for leap years, December 31st timestamps and data get dropped.",
    )
    DROP_FEB29 = EnumValue(
        value="drop_feb29",
        # Bug fix: corrected the misspelling "Feburary" in the user-facing
        # description.
        description="February 29th timestamps and data are dropped. Currently not yet supported by dsgrid.",
    )
    DROP_JAN1 = EnumValue(
        value="drop_jan1",
        description="To adjust for leap years, January 1st timestamps and data get dropped.",
    )
    NONE = EnumValue(value="none", description="No leap day adjustment made.")
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class DaylightSavingSpringForwardType(DSGEnum):
    """Daylight saving spring forward adjustment enum types.

    Determines what happens to data at the nonexistent spring-forward hour
    when converting standard-time profiles to clock time.
    """

    DROP = EnumValue(
        value="drop",
        description="Drop timestamp(s) and associated data for the spring forward hour (2AM in March)",
    )
    NONE = EnumValue(value="none", description="No daylight saving adjustment for data.")
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class DaylightSavingFallBackType(DSGEnum):
    """Daylight saving fall back adjustment enum types.

    Determines how data is filled for the repeated fall-back hour when
    converting standard-time profiles to clock time.
    """

    INTERPOLATE = EnumValue(
        value="interpolate",
        description="Fill data by interpolating between the left and right edges of the dataframe.",
    )
    DUPLICATE = EnumValue(
        value="duplicate",
        description="Fill data by duplicating the fall-back hour (1AM in November)",
    )
    NONE = EnumValue(value="none", description="No daylight saving adjustment for data.")
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class TimeIntervalType(DSGEnum):
    """Time interval enum types.

    Describes whether a timestamp labels the start, end, or instant of the
    period it represents.
    """

    # TODO: R2PD uses a different set; do we want to align?
    # https://github.com/Smart-DS/R2PD/blob/master/R2PD/tshelpers.py#L15

    PERIOD_ENDING = EnumValue(
        value="period_ending",
        description="A time interval that is period ending is coded by the end time. E.g., 2pm (with"
        " freq=1h) represents a period of time between 1-2pm.",
    )
    PERIOD_BEGINNING = EnumValue(
        value="period_beginning",
        description="A time interval that is period beginning is coded by the beginning time. E.g.,"
        " 2pm (with freq=01:00:00) represents a period of time between 2-3pm. This is the dsgrid"
        " default.",
    )
    INSTANTANEOUS = EnumValue(
        value="instantaneous",
        description="The time record value represents measured, instantaneous time",
    )
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class MeasurementType(DSGEnum):
    """Time value measurement enum types.

    Describes how a data value summarizes the time range it is reported for.
    """

    MEAN = EnumValue(
        value="mean",
        description="Data values represent the average value in a time range",
    )
    MIN = EnumValue(
        value="min",
        description="Data values represent the minimum value in a time range",
    )
    MAX = EnumValue(
        value="max",
        description="Data values represent the maximum value in a time range",
    )
    MEASURED = EnumValue(
        value="measured",
        description="Data values represent the measured value at that reported time",
    )
    TOTAL = EnumValue(
        value="total",
        description="Data values represent the sum of values in a time range",
    )
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
class DaylightSavingAdjustmentModel(DSGBaseModel):
    """Defines how to drop and add data along with timestamps to convert standard time
    load profiles to clock time."""

    spring_forward_hour: DaylightSavingSpringForwardType = Field(
        title="spring_forward_hour",
        description="Data adjustment for spring forward hour (at 2AM in March)",
        default=DaylightSavingSpringForwardType.NONE,
        json_schema_extra={
            "options": DaylightSavingSpringForwardType.format_descriptions_for_docs(),
        },
    )

    fall_back_hour: DaylightSavingFallBackType = Field(
        title="fall_back_hour",
        # Bug fix: the description was a copy/paste of the spring-forward
        # field ("spring forward hour (a 2AM in November)"); this field
        # adjusts the repeated fall-back hour.
        description="Data adjustment for fall back hour (at 1AM in November)",
        default=DaylightSavingFallBackType.NONE,
        json_schema_extra={
            "options": DaylightSavingFallBackType.format_descriptions_for_docs(),
        },
    )
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class TimeBasedDataAdjustmentModel(DSGBaseModel):
    """Defines how data needs to be adjusted with respect to time.
    For leap day adjustment, up to one full day of timestamps and data are dropped.
    For daylight savings, the dataframe is adjusted alongside the timestamps.
    This is useful when the load profiles are modeled in standard time and
    need to be converted to get clock time load profiles.
    """

    leap_day_adjustment: LeapDayAdjustmentType = Field(
        default=LeapDayAdjustmentType.NONE,
        title="leap_day_adjustment",
        description="Leap day adjustment method applied to time data. The dsgrid default is None, "
        "i.e., no adjustment made to leap years. Adjustments are made to leap years only.",
    )
    # NOTE: pydantic v2 copies model-typed defaults per instance, so sharing
    # this default model instance across configs is safe.
    daylight_saving_adjustment: DaylightSavingAdjustmentModel = Field(
        title="daylight_saving_adjustment",
        description="Daylight saving adjustment method applied to time data",
        default=DaylightSavingAdjustmentModel(
            spring_forward_hour=DaylightSavingSpringForwardType.NONE,
            fall_back_hour=DaylightSavingFallBackType.NONE,
        ),
    )
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
class DatetimeRange:
    """An inclusive range of timestamps with optional time-based data adjustments.

    Parameters
    ----------
    start : pd.Timestamp
        First timestamp (inclusive); its tzinfo defines the iteration time zone.
    end : pd.Timestamp
        Last timestamp (inclusive).
    frequency : timedelta
        Step between consecutive timestamps.
    time_based_data_adjustment : TimeBasedDataAdjustmentModel | None
        Leap-day and daylight-saving adjustments; defaults to no adjustment.
    """

    def __init__(
        self,
        start,
        end,
        frequency,
        time_based_data_adjustment: TimeBasedDataAdjustmentModel | None = None,
    ):
        if time_based_data_adjustment is None:
            time_based_data_adjustment = TimeBasedDataAdjustmentModel()
        self.start = start
        self.end = end
        self.tzinfo = start.tzinfo
        self.frequency = frequency
        self.leap_day_adjustment = time_based_data_adjustment.leap_day_adjustment
        self.dls_springforward_adjustment = (
            time_based_data_adjustment.daylight_saving_adjustment.spring_forward_hour
        )
        self.dls_fallback_adjustment = (
            time_based_data_adjustment.daylight_saving_adjustment.fall_back_hour
        )

    def __repr__(self):
        # Bug fix: the repr previously ended with "." and never closed the
        # opening parenthesis after the class name.
        return (
            self.__class__.__qualname__
            + f"(start={self.start}, end={self.end}, frequency={self.frequency}, "
            + f"leap_day_adjustment={self.leap_day_adjustment}, "
            + f"dls_springforward_adjustment={self.dls_springforward_adjustment}, "
            + f"dls_fallback_adjustment={self.dls_fallback_adjustment})"
        )

    def __str__(self):
        return self.show_range()

    def show_range(self, n_show=5):
        """Return a string with the first and last ``n_show`` timestamps."""
        output = self.list_time_range()
        n_show = min(len(output) // 2, n_show)
        n_head = ", ".join([str(x) for x in output[:n_show]])
        n_tail = ", ".join([str(x) for x in output[-n_show:]])
        return n_head + ",\n ... , \n" + n_tail

    def _is_dropped(self, month, day):
        """Return True if (month, day) is removed by the leap-day adjustment."""
        adj = self.leap_day_adjustment
        return (
            (adj == LeapDayAdjustmentType.DROP_FEB29 and month == 2 and day == 29)
            or (adj == LeapDayAdjustmentType.DROP_DEC31 and month == 12 and day == 31)
            or (adj == LeapDayAdjustmentType.DROP_JAN1 and month == 1 and day == 1)
        )

    def _iter_timestamps(self):
        """Return a generator of datetimes for a time range ('start' and 'end' times are inclusive).
        There could be duplicates.

        TODO: for future-selves, test functionality of LeapDayAdjustmentType in relation to TimeIntervalType to make sure drop behavior is expected.

        Yields
        ------
        datetime
        """
        # Iterate in UTC so DST transitions do not skip or repeat steps, then
        # convert each timestamp back to the range's own time zone.
        cur = self.start.to_pydatetime().astimezone(ZoneInfo("UTC"))
        end = self.end.to_pydatetime().astimezone(ZoneInfo("UTC")) + self.frequency

        while cur < end:
            cur_tz = cur.astimezone(self.tzinfo)
            cur_tz = adjust_timestamp_by_dst_offset(cur_tz, self.frequency)
            if not self._is_dropped(cur_tz.month, cur_tz.day):
                yield cur_tz
            cur += self.frequency

    def list_time_range(self):
        """Return a list of timestamps for a time range.

        Returns
        -------
        list[datetime]
        """
        return list(self._iter_timestamps())
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
class AnnualTimeRange(DatetimeRange):
    # Specialization for annual data: `frequency` is an integer year step,
    # not a timedelta.
    def _iter_timestamps(self):
        """
        Return a list of years (datetime obj) on Jan 1st
        Might be okay to not convert to UTC for iteration, since it's annual

        """
        start = self.start.to_pydatetime()
        end = self.end.to_pydatetime()
        tz = self.tzinfo
        # Annual ranges store an integer year step in `frequency`.
        assert isinstance(self.frequency, int)
        # `end.year + self.frequency` makes the end year inclusive.
        for year in range(start.year, end.year + self.frequency, self.frequency):
            yield datetime(year=year, month=1, day=1, tzinfo=tz)
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
class IndexTimeRange(DatetimeRange):
    """DatetimeRange variant that yields integer indexes instead of timestamps.

    ``start_index`` is the index assigned to the first timestamp; each
    subsequent step increments the index, and indexes whose timestamps are
    removed by the leap-day adjustment are skipped.
    """

    def __init__(
        self,
        start,
        end,
        frequency,
        start_index,
        time_based_data_adjustment: TimeBasedDataAdjustmentModel | None = None,
    ):
        super().__init__(
            start, end, frequency, time_based_data_adjustment=time_based_data_adjustment
        )
        self.start_index = start_index

    def _iter_timestamps(self):
        """Yield the index of every timestamp kept by the leap-day adjustment."""
        utc = ZoneInfo("UTC")
        current = self.start.to_pydatetime().astimezone(utc)
        index = self.start_index
        # Add one frequency so the end timestamp itself is included.
        stop = self.end.to_pydatetime().astimezone(utc) + self.frequency

        while current < stop:
            local = current.astimezone(self.tzinfo)
            local = adjust_timestamp_by_dst_offset(local, self.frequency)
            adj = self.leap_day_adjustment
            dropped = (
                (adj == LeapDayAdjustmentType.DROP_FEB29 and (local.month, local.day) == (2, 29))
                or (adj == LeapDayAdjustmentType.DROP_DEC31 and (local.month, local.day) == (12, 31))
                or (adj == LeapDayAdjustmentType.DROP_JAN1 and (local.month, local.day) == (1, 1))
            )
            if not dropped:
                yield index
            current += self.frequency
            index += 1
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def adjust_timestamp_by_dst_offset(timestamp, frequency):
    """Shift timestamps inside the daylight-saving window back by their DST offset.

    Ensures that a time series at daily (or lower) frequency returns each day
    at the same timestamp in prevailing time, an expected behavior in most
    standard libraries (e.g., 2018-03-11 00:00, 2018-03-12 00:00, ...
    instead of 2018-03-11 00:00, 2018-03-12 01:00, ...).
    """
    # Sub-daily series are left untouched.
    if frequency < timedelta(hours=24):
        return timestamp

    # dst() is None for naive timestamps and zero outside the DST window;
    # in both cases there is nothing to subtract.
    dst_offset = timestamp.dst()
    if not dst_offset:
        return timestamp
    return timestamp - dst_offset
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""Functions related to time"""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from dsgrid.dimension.time import (
|
|
11
|
+
DatetimeRange,
|
|
12
|
+
TimeBasedDataAdjustmentModel,
|
|
13
|
+
TimeDimensionType,
|
|
14
|
+
)
|
|
15
|
+
from dsgrid.config.dimensions import TimeRangeModel, AnnualRangeModel
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def build_annual_ranges(
    time_ranges: list[AnnualRangeModel],
    tz: str | None = None,
) -> list[tuple[pd.Timestamp, pd.Timestamp, int]]:
    """Parse annual range models into (start, end, year-step) tuples.

    The returned list is sorted by start timestamp; ``tz`` (possibly None for
    naive timestamps) is applied to both endpoints.
    """
    parsed = []
    for model in time_ranges:
        begin = datetime.strptime(model.start, model.str_format)
        finish = datetime.strptime(model.end, model.str_format)
        # Annual frequencies are integer year steps, not timedeltas.
        assert isinstance(model.frequency, int)
        parsed.append(
            (pd.Timestamp(begin, tz=tz), pd.Timestamp(finish, tz=tz), model.frequency)
        )

    parsed.sort(key=lambda item: item[0])
    return parsed
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def build_time_ranges(
    time_ranges: list[TimeRangeModel],
    tz: str | None = None,
) -> list[tuple[pd.Timestamp, pd.Timestamp, pd.Timedelta]]:
    """Parse time range models into (start, end, frequency) tuples.

    Any tzinfo parsed from the strings is discarded so that ``tz`` (possibly
    None for naive timestamps) is applied uniformly to both endpoints.
    The returned list is sorted by start timestamp.
    """
    parsed = []
    for model in time_ranges:
        # Rebuild the datetimes without tzinfo before localizing to `tz`.
        begin = datetime.strptime(model.start, model.str_format).replace(tzinfo=None)
        finish = datetime.strptime(model.end, model.str_format).replace(tzinfo=None)
        step = pd.Timedelta(model.frequency)
        parsed.append((pd.Timestamp(begin, tz=tz), pd.Timestamp(finish, tz=tz), step))

    parsed.sort(key=lambda item: item[0])
    return parsed
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def get_time_ranges(
    time_dimension_config,  #: DateTimeDimensionConfig,
    time_zone: str | None = None,
    time_based_data_adjustment: TimeBasedDataAdjustmentModel | None = None,
):
    """Build DatetimeRange objects from a time dimension config.

    Parameters
    ----------
    time_dimension_config
        Config object with a ``.model`` attribute; only DATETIME and INDEX
        time types are supported.
    time_zone : str | None
        Target time zone; defaults to the config's own tzinfo.
        NOTE(review): ``get_tzinfo()`` presumably returns a tzinfo object
        rather than a str as the annotation suggests -- confirm.
    time_based_data_adjustment : TimeBasedDataAdjustmentModel | None
        Optional leap-day/DST adjustments forwarded to each DatetimeRange.

    Raises
    ------
    ValueError
        If the config's time type is not DATETIME or INDEX.
    """
    dim_model = time_dimension_config.model
    if time_zone is None:
        time_zone = time_dimension_config.get_tzinfo()

    if dim_model.time_type == TimeDimensionType.DATETIME:
        dt_ranges = dim_model.ranges
    elif dim_model.time_type == TimeDimensionType.INDEX:
        # Index time is first converted to the datetime ranges it represents.
        dt_ranges = time_dimension_config._create_represented_time_ranges()
    else:
        msg = f"Cannot support time_dimension_config model of time_type {dim_model.time_type}."
        raise ValueError(msg)

    ranges = []
    for start, end, freq in build_time_ranges(dt_ranges, tz=time_zone):
        ranges.append(
            DatetimeRange(
                start=start,
                end=end,
                frequency=freq,
                time_based_data_adjustment=time_based_data_adjustment,
            )
        )

    return ranges
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def is_leap_year(year: int) -> bool:
    """Return True if the year is a leap year."""
    # Gregorian rule: divisible by 400 -> leap; other centuries -> not leap;
    # otherwise divisible by 4 -> leap.
    if year % 400 == 0:
        return True
    if year % 100 == 0:
        return False
    return year % 4 == 0
|
dsgrid/dsgrid_rc.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Manages the dsgrid runtime configuration file"""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
from warnings import warn
|
|
9
|
+
|
|
10
|
+
import json5
|
|
11
|
+
from pydantic import model_validator
|
|
12
|
+
|
|
13
|
+
from dsgrid.common import BackendEngine, DEFAULT_DB_PASSWORD, DEFAULT_SCRATCH_DIR
|
|
14
|
+
from dsgrid.data_models import DSGBaseModel
|
|
15
|
+
|
|
16
|
+
RC_FILENAME = ".dsgrid.json5"
|
|
17
|
+
DEFAULT_BACKEND = BackendEngine.DUCKDB
|
|
18
|
+
DEFAULT_THRIFT_SERVER_URL = "hive://localhost:10000/default"
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class DsgridRuntimeConfig(DSGBaseModel):
    """Defines the runtime config that can be stored in users' home directories.

    Read from ``~/.dsgrid.json5``; environment variables may override selected
    fields (see ``environment_overrides``).
    """

    database_url: str | None = None
    database_user: str = "root"
    database_password: str = DEFAULT_DB_PASSWORD
    offline: bool = True
    backend_engine: BackendEngine = DEFAULT_BACKEND
    thrift_server_url: str = DEFAULT_THRIFT_SERVER_URL
    use_hive_metastore: bool = False
    console_level: str = "info"
    file_level: str = "info"
    timings: bool = False
    reraise_exceptions: bool = False
    scratch_dir: None | Path = None

    @model_validator(mode="before")
    @classmethod
    def environment_overrides(cls, values: dict[str, Any]) -> dict[str, Any]:
        """Let environment variables override values from the config file."""
        for env, field in (
            ("DSGRID_BACKEND_ENGINE", "backend_engine"),
            ("THRIFT_SERVER_URL", "thrift_server_url"),
        ):
            if env in os.environ:
                values[field] = os.environ[env]
        return values

    @model_validator(mode="before")
    @classmethod
    def remove_legacy_fields(cls, data: dict[str, Any]) -> dict[str, Any]:
        """Drop deprecated fields with a warning instead of failing validation."""
        for field in ("database_name",):
            res = data.pop(field, None)
            if res is not None:
                warn(
                    f"The dsgrid runtime config field {field} is deprecated. Please remove it. "
                    "This will cause an error in a future release.",
                )
        return data

    @classmethod
    def load(cls) -> "DsgridRuntimeConfig":
        """Load the dsgrid runtime config if it exists or one with default values."""
        rc_file = cls.path()
        if rc_file.exists():
            # utf-8-sig tolerates a BOM written by some Windows editors.
            data = json5.loads(rc_file.read_text(encoding="utf-8-sig"))
            return cls(**data)
        return cls()

    def dump(self) -> None:
        """Dump the config to the user's home directory."""
        path = self.path()
        data = self.model_dump()
        # Credentials are intentionally never persisted to disk.
        data.pop("database_user")
        data.pop("database_password")
        with open(path, "w") as f_out:
            json5.dump(data, f_out, indent=2)
        print(f"Wrote dsgrid config to {path}", file=sys.stderr)

    @staticmethod
    def path() -> Path:
        """Return the path to the config file."""
        return Path.home() / RC_FILENAME

    def get_scratch_dir(self) -> Path:
        """Return the scratch_dir to use (configured value or the default)."""
        return (self.scratch_dir or Path(DEFAULT_SCRATCH_DIR)).resolve()
|
dsgrid/exceptions.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""dsgrid exceptions and warnings"""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class DSGBaseException(Exception):
    """Base class for all dsgrid exceptions."""


class DSGBaseWarning(Warning):
    """Base class for all dsgrid warnings."""


class DSGInvalidDataset(DSGBaseException):
    """Raised if a dataset is invalid."""


class DSGInvalidField(DSGBaseException):
    """Raised if a field is missing or invalid."""


class DSGInvalidParameter(DSGBaseException):
    """Raised if a parameter is invalid."""


class DSGInvalidFile(DSGBaseException):
    """Raised if a file cannot be read. Possible reason is that the write operation failed."""


class DSGInvalidDimension(DSGBaseException):
    """Raised if a type is not stored or is invalid."""


class DSGInvalidDimensionAssociation(DSGBaseException):
    """Raised if an association is not stored or is invalid."""


class DSGInvalidDimensionMapping(DSGBaseException):
    """Raised if a mapping is not stored or is invalid."""


class DSGInvalidQuery(DSGBaseException):
    """Raised if a query is invalid."""


class DSGMissingDimensionMapping(DSGBaseException):
    """Raised if a mapping is not provided."""


class DSGInvalidOperation(DSGBaseException):
    """Raised if a requested user operation is invalid."""


class DSGRuntimeError(DSGBaseException):
    """Raised if there was a generic runtime error."""


class DSGProjectConfigError(DSGBaseException):
    """Error for bad project configuration inputs"""


class DSGDatasetConfigError(DSGBaseException):
    """Error for bad dataset configuration inputs"""


# Consistency fix: previously inherited from plain Warning; all other dsgrid
# warnings derive from DSGBaseWarning (which is itself a Warning, so existing
# filters/handlers keep working).
class DSGDuplicateValueRegistered(DSGBaseWarning):
    """Issued if the user attempts to register a duplicate value."""


class DSGValueNotRegistered(DSGBaseException):
    """Raised if a value is not registered."""


class DSGValueNotStored(DSGBaseException):
    """Raised if a value is not stored."""


class DSGConfigWarning(DSGBaseWarning):
    """Warning for unclear or default configuration inputs"""


class DSGFileInputError(DSGBaseException):
    """Error during input file checks."""


class DSGFileInputWarning(DSGBaseWarning):
    """Warning during input file checks."""


class DSGJSONError(DSGBaseException):
    """Error with JSON file"""


class DSGFilesystemInterfaceError(DSGBaseException):
    """Error with FileSystemInterface command"""


class DSGRegistryLockError(DSGBaseException):
    """Error with a locked registry"""


class DSGMakeLockError(DSGBaseException):
    """Error when making registry lock"""


class DSGInvalidRegistryState(DSGBaseException):
    """Invalid Registry State"""
|
|
File without changes
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Abstract implementation for a cloud filesystem"""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import abc
|
|
5
|
+
|
|
6
|
+
from .filesystem_interface import FilesystemInterface
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class CloudFilesystemInterface(FilesystemInterface, abc.ABC):
    """Interface to access and edit directories and files on remote cloud filesystem.

    Extends FilesystemInterface with version-aware operations; concrete
    subclasses (e.g., an S3 implementation) must implement both methods.
    """

    @abc.abstractmethod
    def check_versions(self, directory):
        """Check for multiple versions and versioning expectations of files.

        Parameters
        ----------
        directory : str
            Directory path
        """

    @abc.abstractmethod
    def list_versions(self, path):
        """List all versions of an S3 file object. Only possible in versioned buckets.

        Parameters
        ----------
        path : str
            Path
        """
|