dsgrid-toolkit 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dsgrid-toolkit might be problematic. Click here for more details.

Files changed (152) hide show
  1. dsgrid/__init__.py +22 -0
  2. dsgrid/api/__init__.py +0 -0
  3. dsgrid/api/api_manager.py +179 -0
  4. dsgrid/api/app.py +420 -0
  5. dsgrid/api/models.py +60 -0
  6. dsgrid/api/response_models.py +116 -0
  7. dsgrid/apps/__init__.py +0 -0
  8. dsgrid/apps/project_viewer/app.py +216 -0
  9. dsgrid/apps/registration_gui.py +444 -0
  10. dsgrid/chronify.py +22 -0
  11. dsgrid/cli/__init__.py +0 -0
  12. dsgrid/cli/common.py +120 -0
  13. dsgrid/cli/config.py +177 -0
  14. dsgrid/cli/download.py +13 -0
  15. dsgrid/cli/dsgrid.py +142 -0
  16. dsgrid/cli/dsgrid_admin.py +349 -0
  17. dsgrid/cli/install_notebooks.py +62 -0
  18. dsgrid/cli/query.py +711 -0
  19. dsgrid/cli/registry.py +1773 -0
  20. dsgrid/cloud/__init__.py +0 -0
  21. dsgrid/cloud/cloud_storage_interface.py +140 -0
  22. dsgrid/cloud/factory.py +31 -0
  23. dsgrid/cloud/fake_storage_interface.py +37 -0
  24. dsgrid/cloud/s3_storage_interface.py +156 -0
  25. dsgrid/common.py +35 -0
  26. dsgrid/config/__init__.py +0 -0
  27. dsgrid/config/annual_time_dimension_config.py +187 -0
  28. dsgrid/config/common.py +131 -0
  29. dsgrid/config/config_base.py +148 -0
  30. dsgrid/config/dataset_config.py +684 -0
  31. dsgrid/config/dataset_schema_handler_factory.py +41 -0
  32. dsgrid/config/date_time_dimension_config.py +108 -0
  33. dsgrid/config/dimension_config.py +54 -0
  34. dsgrid/config/dimension_config_factory.py +65 -0
  35. dsgrid/config/dimension_mapping_base.py +349 -0
  36. dsgrid/config/dimension_mappings_config.py +48 -0
  37. dsgrid/config/dimensions.py +775 -0
  38. dsgrid/config/dimensions_config.py +71 -0
  39. dsgrid/config/index_time_dimension_config.py +76 -0
  40. dsgrid/config/input_dataset_requirements.py +31 -0
  41. dsgrid/config/mapping_tables.py +209 -0
  42. dsgrid/config/noop_time_dimension_config.py +42 -0
  43. dsgrid/config/project_config.py +1457 -0
  44. dsgrid/config/registration_models.py +199 -0
  45. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  46. dsgrid/config/simple_models.py +49 -0
  47. dsgrid/config/supplemental_dimension.py +29 -0
  48. dsgrid/config/time_dimension_base_config.py +200 -0
  49. dsgrid/data_models.py +155 -0
  50. dsgrid/dataset/__init__.py +0 -0
  51. dsgrid/dataset/dataset.py +123 -0
  52. dsgrid/dataset/dataset_expression_handler.py +86 -0
  53. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  54. dsgrid/dataset/dataset_schema_handler_base.py +899 -0
  55. dsgrid/dataset/dataset_schema_handler_one_table.py +196 -0
  56. dsgrid/dataset/dataset_schema_handler_standard.py +303 -0
  57. dsgrid/dataset/growth_rates.py +162 -0
  58. dsgrid/dataset/models.py +44 -0
  59. dsgrid/dataset/table_format_handler_base.py +257 -0
  60. dsgrid/dataset/table_format_handler_factory.py +17 -0
  61. dsgrid/dataset/unpivoted_table.py +121 -0
  62. dsgrid/dimension/__init__.py +0 -0
  63. dsgrid/dimension/base_models.py +218 -0
  64. dsgrid/dimension/dimension_filters.py +308 -0
  65. dsgrid/dimension/standard.py +213 -0
  66. dsgrid/dimension/time.py +531 -0
  67. dsgrid/dimension/time_utils.py +88 -0
  68. dsgrid/dsgrid_rc.py +88 -0
  69. dsgrid/exceptions.py +105 -0
  70. dsgrid/filesystem/__init__.py +0 -0
  71. dsgrid/filesystem/cloud_filesystem.py +32 -0
  72. dsgrid/filesystem/factory.py +32 -0
  73. dsgrid/filesystem/filesystem_interface.py +136 -0
  74. dsgrid/filesystem/local_filesystem.py +74 -0
  75. dsgrid/filesystem/s3_filesystem.py +118 -0
  76. dsgrid/loggers.py +132 -0
  77. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +950 -0
  78. dsgrid/notebooks/registration.ipynb +48 -0
  79. dsgrid/notebooks/start_notebook.sh +11 -0
  80. dsgrid/project.py +451 -0
  81. dsgrid/query/__init__.py +0 -0
  82. dsgrid/query/dataset_mapping_plan.py +142 -0
  83. dsgrid/query/derived_dataset.py +384 -0
  84. dsgrid/query/models.py +726 -0
  85. dsgrid/query/query_context.py +287 -0
  86. dsgrid/query/query_submitter.py +847 -0
  87. dsgrid/query/report_factory.py +19 -0
  88. dsgrid/query/report_peak_load.py +70 -0
  89. dsgrid/query/reports_base.py +20 -0
  90. dsgrid/registry/__init__.py +0 -0
  91. dsgrid/registry/bulk_register.py +161 -0
  92. dsgrid/registry/common.py +287 -0
  93. dsgrid/registry/config_update_checker_base.py +63 -0
  94. dsgrid/registry/data_store_factory.py +34 -0
  95. dsgrid/registry/data_store_interface.py +69 -0
  96. dsgrid/registry/dataset_config_generator.py +156 -0
  97. dsgrid/registry/dataset_registry_manager.py +734 -0
  98. dsgrid/registry/dataset_update_checker.py +16 -0
  99. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  100. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  101. dsgrid/registry/dimension_registry_manager.py +413 -0
  102. dsgrid/registry/dimension_update_checker.py +16 -0
  103. dsgrid/registry/duckdb_data_store.py +185 -0
  104. dsgrid/registry/filesystem_data_store.py +141 -0
  105. dsgrid/registry/filter_registry_manager.py +123 -0
  106. dsgrid/registry/project_config_generator.py +57 -0
  107. dsgrid/registry/project_registry_manager.py +1616 -0
  108. dsgrid/registry/project_update_checker.py +48 -0
  109. dsgrid/registry/registration_context.py +223 -0
  110. dsgrid/registry/registry_auto_updater.py +316 -0
  111. dsgrid/registry/registry_database.py +662 -0
  112. dsgrid/registry/registry_interface.py +446 -0
  113. dsgrid/registry/registry_manager.py +544 -0
  114. dsgrid/registry/registry_manager_base.py +367 -0
  115. dsgrid/registry/versioning.py +92 -0
  116. dsgrid/spark/__init__.py +0 -0
  117. dsgrid/spark/functions.py +545 -0
  118. dsgrid/spark/types.py +50 -0
  119. dsgrid/tests/__init__.py +0 -0
  120. dsgrid/tests/common.py +139 -0
  121. dsgrid/tests/make_us_data_registry.py +204 -0
  122. dsgrid/tests/register_derived_datasets.py +103 -0
  123. dsgrid/tests/utils.py +25 -0
  124. dsgrid/time/__init__.py +0 -0
  125. dsgrid/time/time_conversions.py +80 -0
  126. dsgrid/time/types.py +67 -0
  127. dsgrid/units/__init__.py +0 -0
  128. dsgrid/units/constants.py +113 -0
  129. dsgrid/units/convert.py +71 -0
  130. dsgrid/units/energy.py +145 -0
  131. dsgrid/units/power.py +87 -0
  132. dsgrid/utils/__init__.py +0 -0
  133. dsgrid/utils/dataset.py +612 -0
  134. dsgrid/utils/files.py +179 -0
  135. dsgrid/utils/filters.py +125 -0
  136. dsgrid/utils/id_remappings.py +100 -0
  137. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  138. dsgrid/utils/py_expression_eval/README.md +8 -0
  139. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  140. dsgrid/utils/py_expression_eval/tests.py +283 -0
  141. dsgrid/utils/run_command.py +70 -0
  142. dsgrid/utils/scratch_dir_context.py +64 -0
  143. dsgrid/utils/spark.py +918 -0
  144. dsgrid/utils/spark_partition.py +98 -0
  145. dsgrid/utils/timing.py +239 -0
  146. dsgrid/utils/utilities.py +184 -0
  147. dsgrid/utils/versioning.py +36 -0
  148. dsgrid_toolkit-0.2.0.dist-info/METADATA +216 -0
  149. dsgrid_toolkit-0.2.0.dist-info/RECORD +152 -0
  150. dsgrid_toolkit-0.2.0.dist-info/WHEEL +4 -0
  151. dsgrid_toolkit-0.2.0.dist-info/entry_points.txt +4 -0
  152. dsgrid_toolkit-0.2.0.dist-info/licenses/LICENSE +29 -0
@@ -0,0 +1,531 @@
1
+ """Dimensions related to time"""
2
+
3
+ from datetime import datetime, timedelta
4
+ from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
5
+ import logging
6
+ from pydantic import Field
7
+ from enum import Enum
8
+
9
+
10
+ from dsgrid.data_models import DSGEnum, EnumValue, DSGBaseModel
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class TimeDimensionType(DSGEnum):
    """Enumerates the time formats that load data is allowed to use."""

    # Member order is significant for iteration; keep it stable.
    DATETIME = "datetime"
    ANNUAL = "annual"
    REPRESENTATIVE_PERIOD = "representative_period"
    INDEX = "index"
    NOOP = "noop"
23
+
24
+
25
class DatetimeFormat(str, Enum):
    """Enumerates the time formats understood by the datetime config model.

    Members are also ``str`` instances, so they compare equal to their raw values.
    """

    ALIGNED = "aligned"
    LOCAL = "local"
    LOCAL_AS_STRINGS = "local_as_strings"
31
+
32
+
33
class RepresentativePeriodFormat(DSGEnum):
    """Enumerates the layouts accepted for representative-period time data."""

    # Every member of this enum has to declare a ``frequency``.
    # A generic implementation may replace this enum later so that many more
    # permutations (seasons, weekend day vs week day, sub-hour time, etc.) can be covered.

    ONE_WEEK_PER_MONTH_BY_HOUR = EnumValue(
        value="one_week_per_month_by_hour",
        frequency=timedelta(hours=1),
        description="load_data columns use 'month', 'day_of_week', 'hour' to specify time",
    )
    ONE_WEEKDAY_DAY_AND_ONE_WEEKEND_DAY_PER_MONTH_BY_HOUR = EnumValue(
        value="one_weekday_day_and_one_weekend_day_per_month_by_hour",
        frequency=timedelta(hours=1),
        description="load_data columns use 'month', 'hour', 'is_weekday' to specify time",
    )
50
+
51
+
52
class LeapDayAdjustmentType(DSGEnum):
    """Enumerates which calendar day, if any, is dropped to adjust for leap years."""

    DROP_DEC31 = EnumValue(
        value="drop_dec31",
        description=(
            "To adjust for leap years, December 31st timestamps and data get dropped."
        ),
    )
    DROP_FEB29 = EnumValue(
        value="drop_feb29",
        description=(
            "Feburary 29th timestamps and data are dropped. "
            "Currently not yet supported by dsgrid."
        ),
    )
    DROP_JAN1 = EnumValue(
        value="drop_jan1",
        description=(
            "To adjust for leap years, January 1st timestamps and data get dropped."
        ),
    )
    NONE = EnumValue(value="none", description="No leap day adjustment made.")
68
+
69
+
70
class DaylightSavingSpringForwardType(DSGEnum):
    """Enumerates the adjustments available for the daylight saving spring-forward hour."""

    DROP = EnumValue(
        value="drop",
        description=(
            "Drop timestamp(s) and associated data for the spring forward hour "
            "(2AM in March)"
        ),
    )
    NONE = EnumValue(value="none", description="No daylight saving adjustment for data.")
78
+
79
+
80
class DaylightSavingFallBackType(DSGEnum):
    """Enumerates the adjustments available for the daylight saving fall-back hour."""

    INTERPOLATE = EnumValue(
        value="interpolate",
        description=(
            "Fill data by interpolating between the left and right edges of the dataframe."
        ),
    )
    DUPLICATE = EnumValue(
        value="duplicate",
        description="Fill data by duplicating the fall-back hour (1AM in November)",
    )
    NONE = EnumValue(value="none", description="No daylight saving adjustment for data.")
92
+
93
+
94
class TimeIntervalType(DSGEnum):
    """Enumerates how a timestamp relates to the interval of time it labels."""

    # TODO: R2PD uses a different set of interval types; decide whether to align with
    # https://github.com/Smart-DS/R2PD/blob/master/R2PD/tshelpers.py#L15

    PERIOD_ENDING = EnumValue(
        value="period_ending",
        description=(
            "A time interval that is period ending is coded by the end time. E.g., 2pm (with"
            " freq=1h) represents a period of time between 1-2pm."
        ),
    )
    PERIOD_BEGINNING = EnumValue(
        value="period_beginning",
        description=(
            "A time interval that is period beginning is coded by the beginning time. E.g.,"
            " 2pm (with freq=01:00:00) represents a period of time between 2-3pm. This is the dsgrid"
            " default."
        ),
    )
    INSTANTANEOUS = EnumValue(
        value="instantaneous",
        description="The time record value represents measured, instantaneous time",
    )
115
+
116
+
117
class MeasurementType(DSGEnum):
    """Enumerates what a data value represents with respect to its time range."""

    MEAN = EnumValue(
        value="mean", description="Data values represent the average value in a time range"
    )
    MIN = EnumValue(
        value="min", description="Data values represent the minimum value in a time range"
    )
    MAX = EnumValue(
        value="max", description="Data values represent the maximum value in a time range"
    )
    MEASURED = EnumValue(
        value="measured",
        description="Data values represent the measured value at that reported time",
    )
    TOTAL = EnumValue(
        value="total", description="Data values represent the sum of values in a time range"
    )
140
+
141
+
142
class TimeZone(DSGEnum):
    """Enumerates the time zones supported by dsgrid.

    Each member carries two extra attributes:

    - tz: a ``zoneinfo.ZoneInfo`` instance (see ``zoneinfo.available_timezones()``)
    - tz_name: the zone name in the form used with Spark, which relies on Java time zones:
      https://jenkov.com/tutorials/java-date-time/java-util-timezone.html
    """

    UTC = EnumValue(
        value="UTC",
        description="Coordinated Universal Time",
        tz=ZoneInfo("UTC"),
        tz_name="UTC",
    )
    HST = EnumValue(
        value="HawaiiAleutianStandard",
        description="Hawaii Standard Time (UTC=-10). No daylight saving shifts.",
        tz=ZoneInfo("US/Hawaii"),
        tz_name="Etc/GMT+10",
    )
    AST = EnumValue(
        value="AlaskaStandard",
        description="Alaskan Standard Time (UTC=-9). No daylight saving shifts.",
        tz=ZoneInfo("Etc/GMT+9"),
        tz_name="Etc/GMT+9",
    )
    APT = EnumValue(
        value="AlaskaPrevailing",
        description="Alaska Prevailing Time. Commonly called Alaska Local Time. "
        "Includes daylight saving.",
        tz=ZoneInfo("US/Alaska"),
        tz_name="US/Alaska",
    )
    PST = EnumValue(
        value="PacificStandard",
        description="Pacific Standard Time (UTC=-8). No daylight saving shifts.",
        tz=ZoneInfo("Etc/GMT+8"),
        tz_name="Etc/GMT+8",
    )
    PPT = EnumValue(
        value="PacificPrevailing",
        description="Pacific Prevailing Time. Commonly called Pacific Local Time. "
        "Includes daylight saving.",
        tz=ZoneInfo("US/Pacific"),
        tz_name="US/Pacific",
    )
    MST = EnumValue(
        value="MountainStandard",
        description="Mountain Standard Time (UTC=-7). No daylight saving shifts.",
        tz=ZoneInfo("Etc/GMT+7"),
        tz_name="Etc/GMT+7",
    )
    MPT = EnumValue(
        value="MountainPrevailing",
        description="Mountain Prevailing Time. Commonly called Mountain Local Time. "
        "Includes daylight saving.",
        tz=ZoneInfo("US/Mountain"),
        tz_name="US/Mountain",
    )
    CST = EnumValue(
        value="CentralStandard",
        description="Central Standard Time (UTC=-6). No daylight saving shifts.",
        tz=ZoneInfo("Etc/GMT+6"),
        tz_name="Etc/GMT+6",
    )
    CPT = EnumValue(
        value="CentralPrevailing",
        description="Central Prevailing Time. Commonly called Central Local Time. "
        "Includes daylight saving.",
        tz=ZoneInfo("US/Central"),
        tz_name="US/Central",
    )
    EST = EnumValue(
        value="EasternStandard",
        description="Eastern Standard Time (UTC=-5). No daylight saving shifts.",
        tz=ZoneInfo("Etc/GMT+5"),
        tz_name="Etc/GMT+5",
    )
    EPT = EnumValue(
        value="EasternPrevailing",
        description="Eastern Prevailing Time. Commonly called Eastern Local Time. "
        "Includes daylight saving.",
        tz=ZoneInfo("US/Eastern"),
        tz_name="US/Eastern",
    )
    ARIZONA = EnumValue(
        value="USArizona",
        description="US/Arizona = Mountain Standard Time (UTC=-7). No daylight saving shifts. "
        "For Arizona state except Navajo County",
        tz=ZoneInfo("US/Arizona"),
        tz_name="US/Arizona",
    )

    def get_standard_time(self):
        """Return the standard-time member that corresponds to this zone.

        Raises
        ------
        NotImplementedError
            If this member is not handled by any case below.
        """
        # These zones are returned unchanged.
        for unchanged in (TimeZone.UTC, TimeZone.HST, TimeZone.ARIZONA):
            if self == unchanged:
                return unchanged

        # (standard, prevailing) pairs: either member resolves to the standard one.
        pairs = (
            (TimeZone.AST, TimeZone.APT),
            (TimeZone.PST, TimeZone.PPT),
            (TimeZone.MST, TimeZone.MPT),
            (TimeZone.CST, TimeZone.CPT),
            (TimeZone.EST, TimeZone.EPT),
        )
        for standard, prevailing in pairs:
            if self in (standard, prevailing):
                return standard

        msg = f"BUG: case not covered: {self}"
        raise NotImplementedError(msg)

    def get_prevailing_time(self):
        """Return the prevailing-time member that corresponds to this zone.

        Raises
        ------
        NotImplementedError
            If this member is not handled by any case below.
        """
        # These zones are returned unchanged.
        for unchanged in (TimeZone.UTC, TimeZone.HST, TimeZone.ARIZONA):
            if self == unchanged:
                return unchanged

        # (standard, prevailing) pairs: either member resolves to the prevailing one.
        pairs = (
            (TimeZone.AST, TimeZone.APT),
            (TimeZone.PST, TimeZone.PPT),
            (TimeZone.MST, TimeZone.MPT),
            (TimeZone.CST, TimeZone.CPT),
            (TimeZone.EST, TimeZone.EPT),
        )
        for standard, prevailing in pairs:
            if self in (standard, prevailing):
                return prevailing

        msg = f"BUG: case not covered: {self}"
        raise NotImplementedError(msg)

    def is_standard(self):
        """Return True if this zone is one of the standard-time members.

        ARIZONA is reported as both standard and prevailing.
        """
        return self in (
            TimeZone.UTC,
            TimeZone.HST,
            TimeZone.AST,
            TimeZone.PST,
            TimeZone.MST,
            TimeZone.CST,
            TimeZone.EST,
            TimeZone.ARIZONA,
        )

    def is_prevailing(self):
        """Return True if this zone is one of the prevailing-time members.

        ARIZONA is reported as both standard and prevailing.
        """
        return self in (
            TimeZone.APT,
            TimeZone.PPT,
            TimeZone.MPT,
            TimeZone.CPT,
            TimeZone.EPT,
            TimeZone.ARIZONA,
        )
302
+
303
+
304
# Lookup table keyed by each TimeZone member's tz_name, giving that member's ZoneInfo.
_TIME_ZONE_NAME_TO_ZONE_INFO = {member.tz_name: member.tz for member in TimeZone}

# Sanity check: two members sharing a tz_name would silently collapse into one entry.
assert len(_TIME_ZONE_NAME_TO_ZONE_INFO) == len(TimeZone)


def get_zone_info_from_tz_name(tz_name: str) -> ZoneInfo:
    """Return the ZoneInfo of the TimeZone member whose tz_name equals ``tz_name``.

    Raises
    ------
    KeyError
        If no TimeZone member uses ``tz_name``.
    """
    zone_info = _TIME_ZONE_NAME_TO_ZONE_INFO[tz_name]
    return zone_info
312
+
313
+
314
def get_zone_info_from_spark_session(tz_name: str) -> ZoneInfo:
    """Return the ZoneInfo for a time zone name that was read from the Spark session.

    The name is first tried directly as a ZoneInfo key. If ZoneInfo does not know it, the
    name is looked up among the TimeZone ``tz_name`` values instead.

    Raises
    ------
    KeyError
        If the name is neither a ZoneInfo key nor a TimeZone ``tz_name``.
    """
    try:
        # The Spark session time zone is set from a tz_name, which may not be a key
        # that ZoneInfo accepts.
        zone_info = ZoneInfo(key=tz_name)
    except ZoneInfoNotFoundError:
        zone_info = get_zone_info_from_tz_name(tz_name)
    return zone_info
321
+
322
+
323
class DaylightSavingAdjustmentModel(DSGBaseModel):
    """Defines how to drop and add data along with timestamps to convert standard time
    load profiles to clock time.

    Both fields default to ``NONE``, i.e. no adjustment is made.
    """

    # Adjustment applied at the spring-forward transition.
    spring_forward_hour: DaylightSavingSpringForwardType = Field(
        title="spring_forward_hour",
        description="Data adjustment for spring forward hour (a 2AM in March)",
        default=DaylightSavingSpringForwardType.NONE,
        json_schema_extra={
            "options": DaylightSavingSpringForwardType.format_descriptions_for_docs(),
        },
    )

    # Adjustment applied at the fall-back transition.
    # The description previously repeated the spring-forward wording by mistake.
    fall_back_hour: DaylightSavingFallBackType = Field(
        title="fall_back_hour",
        description="Data adjustment for fall back hour (a 2AM in November)",
        default=DaylightSavingFallBackType.NONE,
        json_schema_extra={
            "options": DaylightSavingFallBackType.format_descriptions_for_docs(),
        },
    )
344
+
345
+
346
class TimeBasedDataAdjustmentModel(DSGBaseModel):
    """Describes the time-related adjustments to apply to data.

    Leap day adjustment drops up to one full day of timestamps and data.
    Daylight saving adjustment changes the dataframe together with its timestamps, which
    is useful when load profiles were modeled in standard time and have to be converted
    to clock time load profiles.
    """

    # Which day, if any, is dropped in leap years.
    leap_day_adjustment: LeapDayAdjustmentType = Field(
        title="leap_day_adjustment",
        description=(
            "Leap day adjustment method applied to time data. The dsgrid default is None, "
            "i.e., no adjustment made to leap years. Adjustments are made to leap years only."
        ),
        default=LeapDayAdjustmentType.NONE,
    )
    # Handling of the spring-forward and fall-back hours; no adjustment by default.
    daylight_saving_adjustment: DaylightSavingAdjustmentModel = Field(
        title="daylight_saving_adjustment",
        description="Daylight saving adjustment method applied to time data",
        default=DaylightSavingAdjustmentModel(
            spring_forward_hour=DaylightSavingSpringForwardType.NONE,
            fall_back_hour=DaylightSavingFallBackType.NONE,
        ),
    )
368
+
369
+
370
class DatetimeRange:
    """A range of timestamps between two inclusive endpoints at a fixed frequency.

    Parameters
    ----------
    start, end
        Inclusive endpoints. Both must provide ``to_pydatetime()`` and ``start`` must
        provide ``tzinfo`` (e.g. ``pandas.Timestamp``).
    frequency : timedelta
        Step between consecutive timestamps.
    time_based_data_adjustment : TimeBasedDataAdjustmentModel | None
        Leap day and daylight saving adjustments. A default-constructed model is used
        when None.
    """

    def __init__(
        self,
        start,
        end,
        frequency,
        time_based_data_adjustment: TimeBasedDataAdjustmentModel | None = None,
    ):
        if time_based_data_adjustment is None:
            time_based_data_adjustment = TimeBasedDataAdjustmentModel()
        self.start = start
        self.end = end
        # Generated timestamps are expressed in the time zone of ``start``.
        self.tzinfo = start.tzinfo
        self.frequency = frequency
        self.leap_day_adjustment = time_based_data_adjustment.leap_day_adjustment
        self.dls_springforward_adjustment = (
            time_based_data_adjustment.daylight_saving_adjustment.spring_forward_hour
        )
        self.dls_fallback_adjustment = (
            time_based_data_adjustment.daylight_saving_adjustment.fall_back_hour
        )

    def __repr__(self):
        return (
            f"{self.__class__.__qualname__}(start={self.start}, end={self.end}, "
            f"frequency={self.frequency}, "
            f"leap_day_adjustment={self.leap_day_adjustment}, "
            f"dls_springforward_adjustment={self.dls_springforward_adjustment}, "
            f"dls_fallback_adjustment={self.dls_fallback_adjustment})"
        )

    def __str__(self):
        return self.show_range()

    def show_range(self, n_show=5):
        """Return a string with the first and last ``n_show`` timestamps of the range.

        At most half of the timestamps are shown on each side. When that leaves nothing
        to show per side (zero or one timestamp in the range), every timestamp is listed
        without an ellipsis.
        """
        output = self.list_time_range()
        n_show = min(len(output) // 2, n_show)
        if n_show < 1:
            # output[-0:] would return the whole list behind an empty head and an ellipsis.
            return ", ".join(str(x) for x in output)
        n_head = ", ".join(str(x) for x in output[:n_show])
        n_tail = ", ".join(str(x) for x in output[-n_show:])
        return n_head + ",\n ... , \n" + n_tail

    def _iter_timestamps(self):
        """Return a generator of datetimes for a time range ('start' and 'end' times are
        inclusive). There could be duplicates.

        Iteration advances in UTC; each value is converted to ``self.tzinfo`` and passed
        through ``adjust_timestamp_by_dst_offset``. Timestamps falling on the day selected
        by ``self.leap_day_adjustment`` are skipped. Note that the month/day test is
        applied in every year, not only in leap years.

        TODO: for future-selves, test functionality of LeapDayAdjustmentType in relation
        to TimeIntervalType to make sure drop behavior is expected.

        Yields
        ------
        datetime

        """
        utc = ZoneInfo("UTC")
        cur = self.start.to_pydatetime().astimezone(utc)
        # Adding one step makes the end time inclusive.
        end = self.end.to_pydatetime().astimezone(utc) + self.frequency

        while cur < end:
            cur_tz = cur.astimezone(self.tzinfo)
            cur_tz = adjust_timestamp_by_dst_offset(cur_tz, self.frequency)
            month_day = (cur_tz.month, cur_tz.day)
            adjustment = self.leap_day_adjustment
            dropped = (
                (adjustment == LeapDayAdjustmentType.DROP_FEB29 and month_day == (2, 29))
                or (adjustment == LeapDayAdjustmentType.DROP_DEC31 and month_day == (12, 31))
                or (adjustment == LeapDayAdjustmentType.DROP_JAN1 and month_day == (1, 1))
            )
            if not dropped:
                yield cur_tz

            cur += self.frequency

    def list_time_range(self):
        """Return a list of timestamps for a time range.

        Returns
        -------
        list[datetime]
        """
        return list(self._iter_timestamps())
457
+
458
+
459
class AnnualTimeRange(DatetimeRange):
    """Datetime range that produces one timestamp per year."""

    def _iter_timestamps(self):
        """Yield January 1st of every year from the start year through the end year.

        Unlike the base class, iteration is not performed in UTC; that is presumably
        fine because the values are annual.

        Yields
        ------
        datetime
            Midnight on January 1st, carrying the range's ``tzinfo``.
        """
        first_year = self.start.to_pydatetime().year
        last_year = self.end.to_pydatetime().year
        tz = self.tzinfo
        year = first_year
        while year <= last_year:
            yield datetime(year=year, month=1, day=1, tzinfo=tz)
            year += 1
471
+
472
+
473
class IndexTimeRange(DatetimeRange):
    """Datetime range whose iteration produces integer indexes instead of timestamps.

    Parameters
    ----------
    start, end, frequency, time_based_data_adjustment
        Forwarded unchanged to ``DatetimeRange``.
    start_index
        Index assigned to the first time step.
    """

    def __init__(
        self,
        start,
        end,
        frequency,
        start_index,
        time_based_data_adjustment: TimeBasedDataAdjustmentModel | None = None,
    ):
        super().__init__(
            start, end, frequency, time_based_data_adjustment=time_based_data_adjustment
        )
        self.start_index = start_index

    def _iter_timestamps(self):
        """Yield the index of every time step that is not dropped.

        The index advances on every step, including dropped ones, so dropping a day
        leaves a gap in the yielded indexes.
        """
        utc = ZoneInfo("UTC")
        position = self.start.to_pydatetime().astimezone(utc)
        index = self.start_index
        # Adding one step makes the end time inclusive.
        stop = self.end.to_pydatetime().astimezone(utc) + self.frequency

        while position < stop:
            local = adjust_timestamp_by_dst_offset(
                position.astimezone(self.tzinfo), self.frequency
            )
            month_day = (local.month, local.day)
            adjustment = self.leap_day_adjustment
            dropped = (
                (adjustment == LeapDayAdjustmentType.DROP_FEB29 and month_day == (2, 29))
                or (adjustment == LeapDayAdjustmentType.DROP_DEC31 and month_day == (12, 31))
                or (adjustment == LeapDayAdjustmentType.DROP_JAN1 and month_day == (1, 1))
            )
            if not dropped:
                yield index
            position += self.frequency
            index += 1
517
+
518
+
519
def adjust_timestamp_by_dst_offset(timestamp, frequency):
    """Subtract the daylight saving offset from a timestamp when stepping by a day or more.

    This keeps a time series with a step of 24 hours or longer on the same wall-clock
    time in prevailing time, as most standard libraries do
    (e.g., 2018-03-11 00:00, 2018-03-12 00:00... rather than
    2018-03-11 00:00, 2018-03-12 01:00...).

    Timestamps are returned unchanged when the frequency is shorter than 24 hours or
    when ``timestamp.dst()`` reports no offset.
    """
    if frequency < timedelta(hours=24):
        return timestamp

    dst_offset = timestamp.dst()
    if not dst_offset:
        # dst() returns None for naive values and a zero timedelta outside daylight saving.
        dst_offset = timedelta(hours=0)
    return timestamp - dst_offset
@@ -0,0 +1,88 @@
1
+ """Functions related to time"""
2
+
3
+ from datetime import datetime
4
+
5
+ import logging
6
+
7
+
8
+ import pandas as pd
9
+
10
+ from dsgrid.dimension.time import (
11
+ DatetimeRange,
12
+ TimeZone,
13
+ TimeBasedDataAdjustmentModel,
14
+ TimeDimensionType,
15
+ )
16
+ from dsgrid.config.dimensions import TimeRangeModel
17
+
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def build_time_ranges(
    time_ranges: TimeRangeModel,
    str_format: str,
    tz: TimeZone | None = None,
):
    """Parse string time ranges into (start, end) pairs of ``pandas.Timestamp``.

    Parameters
    ----------
    time_ranges
        Iterable of objects with string ``start`` and ``end`` attributes.
    str_format
        ``datetime.strptime`` format used to parse both attributes.
    tz
        Passed as ``tz`` to ``pandas.Timestamp``. Any UTC offset parsed from the strings
        is discarded first, so the parsed clock time is interpreted in ``tz``.

    Returns
    -------
    list[tuple[pandas.Timestamp, pandas.Timestamp]]
        Pairs sorted by their start timestamp.
    """

    def _parse_naive(text):
        # Keep only the calendar and clock fields of the parsed value.
        return datetime.strptime(text, str_format).replace(tzinfo=None, fold=0)

    def _to_pair(item):
        begin = _parse_naive(item.start)
        finish = _parse_naive(item.end)
        return pd.Timestamp(begin, tz=tz), pd.Timestamp(finish, tz=tz)

    pairs = [_to_pair(item) for item in time_ranges]
    return sorted(pairs, key=lambda pair: pair[0])
53
+
54
+
55
def get_time_ranges(
    time_dimension_config,  #: DateTimeDimensionConfig,
    timezone: TimeZone = None,
    time_based_data_adjustment: TimeBasedDataAdjustmentModel = None,
):
    """Build the DatetimeRange objects described by a time dimension config.

    Parameters
    ----------
    time_dimension_config
        Config whose model has time type DATETIME or INDEX.
    timezone
        Time zone applied to the range endpoints. When None, the value of
        ``time_dimension_config.get_tzinfo()`` is used.
    time_based_data_adjustment
        Forwarded to every DatetimeRange.

    Returns
    -------
    list[DatetimeRange]

    Raises
    ------
    ValueError
        If the model's time type is neither DATETIME nor INDEX.
    """
    model = time_dimension_config.model
    if timezone is None:
        timezone = time_dimension_config.get_tzinfo()

    if model.time_type == TimeDimensionType.DATETIME:
        dt_ranges = model.ranges
    elif model.time_type == TimeDimensionType.INDEX:
        # Index-based dimensions supply the datetime ranges that their indexes represent.
        dt_ranges = time_dimension_config._create_represented_time_ranges()
    else:
        msg = f"Cannot support time_dimension_config model of time_typ {model.time_type}."
        raise ValueError(msg)

    endpoints = build_time_ranges(dt_ranges, model.str_format, tz=timezone)
    return [
        DatetimeRange(
            start=start,
            end=end,
            frequency=model.frequency,
            time_based_data_adjustment=time_based_data_adjustment,
        )
        for start, end in endpoints
    ]
84
+
85
+
86
def is_leap_year(year: int) -> bool:
    """Return True if ``year`` is a leap year in the Gregorian calendar."""
    if year % 400 == 0:
        return True
    if year % 100 == 0:
        # Century years are leap years only when divisible by 400.
        return False
    return year % 4 == 0
dsgrid/dsgrid_rc.py ADDED
@@ -0,0 +1,88 @@
1
+ """Manages the dsgrid runtime configuration file"""
2
+
3
+ import logging
4
+ import os
5
+ import sys
6
+ from pathlib import Path
7
+ from typing import Any
8
+ from warnings import warn
9
+
10
+ import json5
11
+ from pydantic import model_validator
12
+
13
+ from dsgrid.common import BackendEngine, DEFAULT_DB_PASSWORD, DEFAULT_SCRATCH_DIR
14
+ from dsgrid.data_models import DSGBaseModel
15
+
16
+ RC_FILENAME = ".dsgrid.json5"
17
+ DEFAULT_BACKEND = BackendEngine.DUCKDB
18
+ DEFAULT_THRIFT_SERVER_URL = "hive://localhost:10000/default"
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class DsgridRuntimeConfig(DSGBaseModel):
    """Defines the runtime config that can be stored in users' home directories.

    The config file is ``RC_FILENAME`` inside the user's home directory (see ``path``).
    The environment variables ``DSGRID_BACKEND_ENGINE`` and ``THRIFT_SERVER_URL``, when
    set, override ``backend_engine`` and ``thrift_server_url`` respectively.
    """

    database_url: str | None = None
    # The two credential fields are never written to disk by ``dump``.
    database_user: str = "root"
    database_password: str = DEFAULT_DB_PASSWORD
    offline: bool = True
    backend_engine: BackendEngine = DEFAULT_BACKEND
    thrift_server_url: str = DEFAULT_THRIFT_SERVER_URL
    use_hive_metastore: bool = False
    console_level: str = "info"
    file_level: str = "info"
    timings: bool = False
    reraise_exceptions: bool = False
    # When None, ``get_scratch_dir`` falls back to DEFAULT_SCRATCH_DIR.
    scratch_dir: None | Path = None

    @model_validator(mode="before")
    @classmethod
    def environment_overrides(cls, values: dict[str, Any]) -> dict[str, Any]:
        """Replace field values with those of the matching environment variables."""
        for env, field in (
            ("DSGRID_BACKEND_ENGINE", "backend_engine"),
            ("THRIFT_SERVER_URL", "thrift_server_url"),
        ):
            if env in os.environ:
                values[field] = os.environ[env]
        return values

    @model_validator(mode="before")
    @classmethod
    def remove_legacy_fields(cls, data: dict[str, Any]) -> dict[str, Any]:
        """Drop deprecated fields from the input and warn about each one that was set."""
        for field in ("database_name",):
            res = data.pop(field, None)
            if res is not None:
                warn(
                    f"The dsgrid runtime config field {field} is deprecated. Please remove it. "
                    "This will cause an error in a future release.",
                )
        return data

    @classmethod
    def load(cls) -> "DsgridRuntimeConfig":
        """Load the dsgrid runtime config if it exists or one with default values."""
        rc_file = cls.path()
        if rc_file.exists():
            # utf-8-sig also accepts files that start with a byte order mark.
            data = json5.loads(rc_file.read_text(encoding="utf-8-sig"))
            return cls(**data)
        return cls()

    def dump(self) -> None:
        """Dump the config to the user's home directory.

        ``database_user`` and ``database_password`` are removed before writing. A
        confirmation message is printed to stderr.
        """
        path = self.path()
        # NOTE(review): model_dump() is called without mode="json"; confirm that
        # DSGBaseModel/json5 can serialize a Path scratch_dir and the BackendEngine value.
        data = self.model_dump()
        data.pop("database_user")
        data.pop("database_password")
        # Write UTF-8 explicitly so the file does not depend on the locale's default
        # encoding; ``load`` reads it back as UTF-8.
        with open(path, "w", encoding="utf-8") as f_out:
            json5.dump(data, f_out, indent=2)
        print(f"Wrote dsgrid config to {path}", file=sys.stderr)

    @staticmethod
    def path() -> Path:
        """Return the path to the config file."""
        return Path.home() / RC_FILENAME

    def get_scratch_dir(self) -> Path:
        """Return the scratch_dir to use."""
        return self.scratch_dir or Path(DEFAULT_SCRATCH_DIR)