dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. build_backend.py +93 -0
  2. dsgrid/__init__.py +22 -0
  3. dsgrid/api/__init__.py +0 -0
  4. dsgrid/api/api_manager.py +179 -0
  5. dsgrid/api/app.py +419 -0
  6. dsgrid/api/models.py +60 -0
  7. dsgrid/api/response_models.py +116 -0
  8. dsgrid/apps/__init__.py +0 -0
  9. dsgrid/apps/project_viewer/app.py +216 -0
  10. dsgrid/apps/registration_gui.py +444 -0
  11. dsgrid/chronify.py +32 -0
  12. dsgrid/cli/__init__.py +0 -0
  13. dsgrid/cli/common.py +120 -0
  14. dsgrid/cli/config.py +176 -0
  15. dsgrid/cli/download.py +13 -0
  16. dsgrid/cli/dsgrid.py +157 -0
  17. dsgrid/cli/dsgrid_admin.py +92 -0
  18. dsgrid/cli/install_notebooks.py +62 -0
  19. dsgrid/cli/query.py +729 -0
  20. dsgrid/cli/registry.py +1862 -0
  21. dsgrid/cloud/__init__.py +0 -0
  22. dsgrid/cloud/cloud_storage_interface.py +140 -0
  23. dsgrid/cloud/factory.py +31 -0
  24. dsgrid/cloud/fake_storage_interface.py +37 -0
  25. dsgrid/cloud/s3_storage_interface.py +156 -0
  26. dsgrid/common.py +36 -0
  27. dsgrid/config/__init__.py +0 -0
  28. dsgrid/config/annual_time_dimension_config.py +194 -0
  29. dsgrid/config/common.py +142 -0
  30. dsgrid/config/config_base.py +148 -0
  31. dsgrid/config/dataset_config.py +907 -0
  32. dsgrid/config/dataset_schema_handler_factory.py +46 -0
  33. dsgrid/config/date_time_dimension_config.py +136 -0
  34. dsgrid/config/dimension_config.py +54 -0
  35. dsgrid/config/dimension_config_factory.py +65 -0
  36. dsgrid/config/dimension_mapping_base.py +350 -0
  37. dsgrid/config/dimension_mappings_config.py +48 -0
  38. dsgrid/config/dimensions.py +1025 -0
  39. dsgrid/config/dimensions_config.py +71 -0
  40. dsgrid/config/file_schema.py +190 -0
  41. dsgrid/config/index_time_dimension_config.py +80 -0
  42. dsgrid/config/input_dataset_requirements.py +31 -0
  43. dsgrid/config/mapping_tables.py +209 -0
  44. dsgrid/config/noop_time_dimension_config.py +42 -0
  45. dsgrid/config/project_config.py +1462 -0
  46. dsgrid/config/registration_models.py +188 -0
  47. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  48. dsgrid/config/simple_models.py +49 -0
  49. dsgrid/config/supplemental_dimension.py +29 -0
  50. dsgrid/config/time_dimension_base_config.py +192 -0
  51. dsgrid/data_models.py +155 -0
  52. dsgrid/dataset/__init__.py +0 -0
  53. dsgrid/dataset/dataset.py +123 -0
  54. dsgrid/dataset/dataset_expression_handler.py +86 -0
  55. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  56. dsgrid/dataset/dataset_schema_handler_base.py +945 -0
  57. dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
  58. dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
  59. dsgrid/dataset/growth_rates.py +162 -0
  60. dsgrid/dataset/models.py +51 -0
  61. dsgrid/dataset/table_format_handler_base.py +257 -0
  62. dsgrid/dataset/table_format_handler_factory.py +17 -0
  63. dsgrid/dataset/unpivoted_table.py +121 -0
  64. dsgrid/dimension/__init__.py +0 -0
  65. dsgrid/dimension/base_models.py +230 -0
  66. dsgrid/dimension/dimension_filters.py +308 -0
  67. dsgrid/dimension/standard.py +252 -0
  68. dsgrid/dimension/time.py +352 -0
  69. dsgrid/dimension/time_utils.py +103 -0
  70. dsgrid/dsgrid_rc.py +88 -0
  71. dsgrid/exceptions.py +105 -0
  72. dsgrid/filesystem/__init__.py +0 -0
  73. dsgrid/filesystem/cloud_filesystem.py +32 -0
  74. dsgrid/filesystem/factory.py +32 -0
  75. dsgrid/filesystem/filesystem_interface.py +136 -0
  76. dsgrid/filesystem/local_filesystem.py +74 -0
  77. dsgrid/filesystem/s3_filesystem.py +118 -0
  78. dsgrid/loggers.py +132 -0
  79. dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
  80. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
  81. dsgrid/notebooks/registration.ipynb +48 -0
  82. dsgrid/notebooks/start_notebook.sh +11 -0
  83. dsgrid/project.py +451 -0
  84. dsgrid/query/__init__.py +0 -0
  85. dsgrid/query/dataset_mapping_plan.py +142 -0
  86. dsgrid/query/derived_dataset.py +388 -0
  87. dsgrid/query/models.py +728 -0
  88. dsgrid/query/query_context.py +287 -0
  89. dsgrid/query/query_submitter.py +994 -0
  90. dsgrid/query/report_factory.py +19 -0
  91. dsgrid/query/report_peak_load.py +70 -0
  92. dsgrid/query/reports_base.py +20 -0
  93. dsgrid/registry/__init__.py +0 -0
  94. dsgrid/registry/bulk_register.py +165 -0
  95. dsgrid/registry/common.py +287 -0
  96. dsgrid/registry/config_update_checker_base.py +63 -0
  97. dsgrid/registry/data_store_factory.py +34 -0
  98. dsgrid/registry/data_store_interface.py +74 -0
  99. dsgrid/registry/dataset_config_generator.py +158 -0
  100. dsgrid/registry/dataset_registry_manager.py +950 -0
  101. dsgrid/registry/dataset_update_checker.py +16 -0
  102. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  103. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  104. dsgrid/registry/dimension_registry_manager.py +413 -0
  105. dsgrid/registry/dimension_update_checker.py +16 -0
  106. dsgrid/registry/duckdb_data_store.py +207 -0
  107. dsgrid/registry/filesystem_data_store.py +150 -0
  108. dsgrid/registry/filter_registry_manager.py +123 -0
  109. dsgrid/registry/project_config_generator.py +57 -0
  110. dsgrid/registry/project_registry_manager.py +1623 -0
  111. dsgrid/registry/project_update_checker.py +48 -0
  112. dsgrid/registry/registration_context.py +223 -0
  113. dsgrid/registry/registry_auto_updater.py +316 -0
  114. dsgrid/registry/registry_database.py +667 -0
  115. dsgrid/registry/registry_interface.py +446 -0
  116. dsgrid/registry/registry_manager.py +558 -0
  117. dsgrid/registry/registry_manager_base.py +367 -0
  118. dsgrid/registry/versioning.py +92 -0
  119. dsgrid/rust_ext/__init__.py +14 -0
  120. dsgrid/rust_ext/find_minimal_patterns.py +129 -0
  121. dsgrid/spark/__init__.py +0 -0
  122. dsgrid/spark/functions.py +589 -0
  123. dsgrid/spark/types.py +110 -0
  124. dsgrid/tests/__init__.py +0 -0
  125. dsgrid/tests/common.py +140 -0
  126. dsgrid/tests/make_us_data_registry.py +265 -0
  127. dsgrid/tests/register_derived_datasets.py +103 -0
  128. dsgrid/tests/utils.py +25 -0
  129. dsgrid/time/__init__.py +0 -0
  130. dsgrid/time/time_conversions.py +80 -0
  131. dsgrid/time/types.py +67 -0
  132. dsgrid/units/__init__.py +0 -0
  133. dsgrid/units/constants.py +113 -0
  134. dsgrid/units/convert.py +71 -0
  135. dsgrid/units/energy.py +145 -0
  136. dsgrid/units/power.py +87 -0
  137. dsgrid/utils/__init__.py +0 -0
  138. dsgrid/utils/dataset.py +830 -0
  139. dsgrid/utils/files.py +179 -0
  140. dsgrid/utils/filters.py +125 -0
  141. dsgrid/utils/id_remappings.py +100 -0
  142. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  143. dsgrid/utils/py_expression_eval/README.md +8 -0
  144. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  145. dsgrid/utils/py_expression_eval/tests.py +283 -0
  146. dsgrid/utils/run_command.py +70 -0
  147. dsgrid/utils/scratch_dir_context.py +65 -0
  148. dsgrid/utils/spark.py +918 -0
  149. dsgrid/utils/spark_partition.py +98 -0
  150. dsgrid/utils/timing.py +239 -0
  151. dsgrid/utils/utilities.py +221 -0
  152. dsgrid/utils/versioning.py +36 -0
  153. dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
  154. dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
  155. dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
  156. dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
  157. dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
@@ -0,0 +1,352 @@
1
+ """Dimensions related to time"""
2
+
3
+ from datetime import datetime, timedelta
4
+ from zoneinfo import ZoneInfo
5
+ import logging
6
+ from pydantic import Field
7
+ from enum import Enum
8
+
9
+
10
+ from dsgrid.data_models import DSGEnum, EnumValue, DSGBaseModel
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class TimeDimensionType(DSGEnum):
    """Defines the supported time formats in the load data."""

    DATETIME = "datetime"  # explicit timestamp ranges (see DatetimeRange below)
    ANNUAL = "annual"  # one value per year (see AnnualTimeRange below)
    REPRESENTATIVE_PERIOD = "representative_period"  # e.g., one week per month (see RepresentativePeriodFormat)
    DATETIME_EXTERNAL_TZ = "datetime_external_tz"  # presumably timestamps with time zone supplied externally -- TODO confirm
    INDEX = "index"  # integer indexes mapped to represented timestamps (see IndexTimeRange below)
    NOOP = "noop"  # no time handling applied to the dataset -- TODO confirm
24
+
25
+
26
class TimeZoneFormat(str, Enum):
    """Defines the time format of the datetime config model"""

    # NOTE(review): member semantics are inferred from the names only --
    # confirm against the datetime dimension config documentation.
    ALIGNED_IN_ABSOLUTE_TIME = "aligned_in_absolute_time"
    ALIGNED_IN_CLOCK_TIME = "aligned_in_clock_time"
    LOCAL_AS_STRINGS = "local_as_strings"
32
+
33
+
34
class RepresentativePeriodFormat(DSGEnum):
    """Defines the supported formats for representative period data."""

    # All instances of this Enum must declare frequency.
    # This Enum may be replaced by a generic implementation in order to support a large
    # number of permutations (seasons, weekend day vs week day, sub-hour time, etc).

    ONE_WEEK_PER_MONTH_BY_HOUR = EnumValue(
        value="one_week_per_month_by_hour",
        frequency=timedelta(hours=1),
        description="load_data columns use 'month', 'day_of_week', 'hour' to specify time",
    )
    ONE_WEEKDAY_DAY_AND_ONE_WEEKEND_DAY_PER_MONTH_BY_HOUR = EnumValue(
        value="one_weekday_day_and_one_weekend_day_per_month_by_hour",
        frequency=timedelta(hours=1),
        description="load_data columns use 'month', 'hour', 'is_weekday' to specify time",
    )
51
+
52
+
53
class LeapDayAdjustmentType(DSGEnum):
    """Leap day adjustment enum types.

    Defines which single day (if any) is dropped from a leap year so that the
    year has 365 days of data. Adjustments are made to leap years only.
    """

    DROP_DEC31 = EnumValue(
        value="drop_dec31",
        description="To adjust for leap years, December 31st timestamps and data get dropped.",
    )
    DROP_FEB29 = EnumValue(
        value="drop_feb29",
        # Fixed typo in user-facing description: "Feburary" -> "February".
        description="February 29th timestamps and data are dropped. Currently not yet supported by dsgrid.",
    )
    DROP_JAN1 = EnumValue(
        value="drop_jan1",
        description="To adjust for leap years, January 1st timestamps and data get dropped.",
    )
    NONE = EnumValue(value="none", description="No leap day adjustment made.")
69
+
70
+
71
class DaylightSavingSpringForwardType(DSGEnum):
    """Daylight saving spring forward adjustment enum types.

    Controls how the non-existent spring-forward hour is handled when
    converting standard-time load profiles to clock time.
    """

    DROP = EnumValue(
        value="drop",
        description="Drop timestamp(s) and associated data for the spring forward hour (2AM in March)",
    )
    NONE = EnumValue(value="none", description="No daylight saving adjustment for data.")
79
+
80
+
81
class DaylightSavingFallBackType(DSGEnum):
    """Daylight saving fall back adjustment enum types.

    Controls how data is filled for the repeated fall-back hour when
    converting standard-time load profiles to clock time.
    """

    INTERPOLATE = EnumValue(
        value="interpolate",
        description="Fill data by interpolating between the left and right edges of the dataframe.",
    )
    DUPLICATE = EnumValue(
        value="duplicate",
        description="Fill data by duplicating the fall-back hour (1AM in November)",
    )
    NONE = EnumValue(value="none", description="No daylight saving adjustment for data.")
93
+
94
+
95
class TimeIntervalType(DSGEnum):
    """Time interval enum types.

    Declares whether a timestamp labels the beginning or the end of the
    interval it measures, or an instantaneous reading.
    """

    # TODO: R2PD uses a different set; do we want to align?
    # https://github.com/Smart-DS/R2PD/blob/master/R2PD/tshelpers.py#L15

    PERIOD_ENDING = EnumValue(
        value="period_ending",
        description="A time interval that is period ending is coded by the end time. E.g., 2pm (with"
        " freq=1h) represents a period of time between 1-2pm.",
    )
    PERIOD_BEGINNING = EnumValue(
        value="period_beginning",
        description="A time interval that is period beginning is coded by the beginning time. E.g.,"
        " 2pm (with freq=01:00:00) represents a period of time between 2-3pm. This is the dsgrid"
        " default.",
    )
    INSTANTANEOUS = EnumValue(
        value="instantaneous",
        description="The time record value represents measured, instantaneous time",
    )
116
+
117
+
118
class MeasurementType(DSGEnum):
    """Time value measurement enum types.

    Declares what a data value represents over its time interval
    (aggregate statistic vs. point measurement).
    """

    MEAN = EnumValue(
        value="mean",
        description="Data values represent the average value in a time range",
    )
    MIN = EnumValue(
        value="min",
        description="Data values represent the minimum value in a time range",
    )
    MAX = EnumValue(
        value="max",
        description="Data values represent the maximum value in a time range",
    )
    MEASURED = EnumValue(
        value="measured",
        description="Data values represent the measured value at that reported time",
    )
    TOTAL = EnumValue(
        value="total",
        description="Data values represent the sum of values in a time range",
    )
141
+
142
+
143
class DaylightSavingAdjustmentModel(DSGBaseModel):
    """Defines how to drop and add data along with timestamps to convert standard time
    load profiles to clock time"""

    spring_forward_hour: DaylightSavingSpringForwardType = Field(
        title="spring_forward_hour",
        description="Data adjustment for spring forward hour (a 2AM in March)",
        default=DaylightSavingSpringForwardType.NONE,
        json_schema_extra={
            "options": DaylightSavingSpringForwardType.format_descriptions_for_docs(),
        },
    )

    fall_back_hour: DaylightSavingFallBackType = Field(
        title="fall_back_hour",
        # Fixed copy-paste error: this description previously read
        # "spring forward hour (a 2AM in November)".
        description="Data adjustment for fall back hour (a 1AM in November)",
        default=DaylightSavingFallBackType.NONE,
        json_schema_extra={
            "options": DaylightSavingFallBackType.format_descriptions_for_docs(),
        },
    )
164
+
165
+
166
class TimeBasedDataAdjustmentModel(DSGBaseModel):
    """Defines how data needs to be adjusted with respect to time.
    For leap day adjustment, up to one full day of timestamps and data are dropped.
    For daylight savings, the dataframe is adjusted alongside the timestamps.
    This is useful when the load profiles are modeled in standard time and
    need to be converted to get clock time load profiles.
    """

    leap_day_adjustment: LeapDayAdjustmentType = Field(
        default=LeapDayAdjustmentType.NONE,
        title="leap_day_adjustment",
        description="Leap day adjustment method applied to time data. The dsgrid default is None, "
        "i.e., no adjustment made to leap years. Adjustments are made to leap years only.",
    )
    # Default is a fully no-op adjustment (pydantic copies the default per
    # instance, so the shared default instance is not mutated).
    daylight_saving_adjustment: DaylightSavingAdjustmentModel = Field(
        title="daylight_saving_adjustment",
        description="Daylight saving adjustment method applied to time data",
        default=DaylightSavingAdjustmentModel(
            spring_forward_hour=DaylightSavingSpringForwardType.NONE,
            fall_back_hour=DaylightSavingFallBackType.NONE,
        ),
    )
188
+
189
+
190
class DatetimeRange:
    """Iterable range of tz-aware timestamps with optional leap-day and
    daylight-saving data adjustments applied during iteration."""

    def __init__(
        self,
        start,
        end,
        frequency,
        time_based_data_adjustment: TimeBasedDataAdjustmentModel | None = None,
    ):
        """
        Parameters
        ----------
        start, end : pandas.Timestamp
            Inclusive bounds of the range. ``start.tzinfo`` defines the time
            zone in which timestamps are yielded.
        frequency : timedelta
            Step between consecutive timestamps.
        time_based_data_adjustment : TimeBasedDataAdjustmentModel | None
            Defaults to a no-op adjustment model.
        """
        if time_based_data_adjustment is None:
            time_based_data_adjustment = TimeBasedDataAdjustmentModel()
        self.start = start
        self.end = end
        self.tzinfo = start.tzinfo
        self.frequency = frequency
        self.leap_day_adjustment = time_based_data_adjustment.leap_day_adjustment
        self.dls_springforward_adjustment = (
            time_based_data_adjustment.daylight_saving_adjustment.spring_forward_hour
        )
        self.dls_fallback_adjustment = (
            time_based_data_adjustment.daylight_saving_adjustment.fall_back_hour
        )

    def __repr__(self):
        # Bug fix: the repr previously ended with a stray period and no
        # closing parenthesis.
        return (
            self.__class__.__qualname__
            + f"(start={self.start}, end={self.end}, frequency={self.frequency}, "
            + f"leap_day_adjustment={self.leap_day_adjustment}, "
            + f"dls_springforward_adjustment={self.dls_springforward_adjustment}, "
            + f"dls_fallback_adjustment={self.dls_fallback_adjustment})"
        )

    def __str__(self):
        return self.show_range()

    def show_range(self, n_show=5):
        """Return a string showing the first and last ``n_show`` timestamps."""
        output = self.list_time_range()
        n_show = min(len(output) // 2, n_show)
        n_head = ", ".join(str(x) for x in output[:n_show])
        n_tail = ", ".join(str(x) for x in output[-n_show:])
        return n_head + ",\n ... , \n" + n_tail

    def _is_dropped(self, timestamp):
        """Return True if the leap-day adjustment drops this timestamp's day."""
        month = timestamp.month
        day = timestamp.day
        adjustment = self.leap_day_adjustment
        return (
            (adjustment == LeapDayAdjustmentType.DROP_FEB29 and month == 2 and day == 29)
            or (adjustment == LeapDayAdjustmentType.DROP_DEC31 and month == 12 and day == 31)
            or (adjustment == LeapDayAdjustmentType.DROP_JAN1 and month == 1 and day == 1)
        )

    def _iter_timestamps(self):
        """Return a generator of datetimes for a time range ('start' and 'end' times are inclusive).
        There could be duplicates.

        TODO: for future-selves, test functionality of LeapDayAdjustmentType in relation to TimeIntervalType to make sure drop behavior is expected.

        Yields
        ------
        datetime

        """
        # Iterate in UTC so the step is uniform across DST transitions, then
        # convert each timestamp back to the range's own time zone.
        cur = self.start.to_pydatetime().astimezone(ZoneInfo("UTC"))
        end = self.end.to_pydatetime().astimezone(ZoneInfo("UTC")) + self.frequency

        while cur < end:
            cur_tz = cur.astimezone(self.tzinfo)
            cur_tz = adjust_timestamp_by_dst_offset(cur_tz, self.frequency)
            if not self._is_dropped(cur_tz):
                yield cur_tz
            cur += self.frequency

    def list_time_range(self):
        """Return a list of timestamps for a time range.

        Returns
        -------
        list[datetime]
        """
        return list(self._iter_timestamps())
277
+
278
+
279
class AnnualTimeRange(DatetimeRange):
    """DatetimeRange specialization for annual (one value per year) time."""

    def _iter_timestamps(self):
        """Yield one tz-aware datetime per year, on January 1st.

        ``self.frequency`` must be an integer number of years (asserted below).
        Iteration stays in the range's own time zone; no UTC conversion is
        needed at annual resolution.

        NOTE(review): the stop bound ``end.year + frequency`` makes the end
        year inclusive when the span is aligned to the frequency, but it can
        yield a year beyond ``end`` for unaligned spans -- confirm intended.
        """
        start = self.start.to_pydatetime()
        end = self.end.to_pydatetime()
        tz = self.tzinfo
        assert isinstance(self.frequency, int)
        for year in range(start.year, end.year + self.frequency, self.frequency):
            yield datetime(year=year, month=1, day=1, tzinfo=tz)
292
+
293
+
294
class IndexTimeRange(DatetimeRange):
    """DatetimeRange variant that yields integer indexes instead of timestamps."""

    def __init__(
        self,
        start,
        end,
        frequency,
        start_index,
        time_based_data_adjustment: TimeBasedDataAdjustmentModel | None = None,
    ):
        super().__init__(
            start, end, frequency, time_based_data_adjustment=time_based_data_adjustment
        )
        # Index assigned to the first timestamp in the range.
        self.start_index = start_index

    def _iter_timestamps(self):
        """Yield one integer index per timestamp ('start' and 'end' inclusive).

        Indexes for timestamps on a dropped day are skipped but still counted,
        i.e. the counter advances for every timestamp in the range.
        """
        cur = self.start.to_pydatetime().astimezone(ZoneInfo("UTC"))
        cur_idx = self.start_index
        end = (
            self.end.to_pydatetime().astimezone(ZoneInfo("UTC")) + self.frequency
        )  # to make end time inclusive

        while cur < end:
            cur_tz = cur.astimezone(self.tzinfo)
            cur_tz = adjust_timestamp_by_dst_offset(cur_tz, self.frequency)
            month = cur_tz.month
            day = cur_tz.day
            if not (
                self.leap_day_adjustment == LeapDayAdjustmentType.DROP_FEB29
                and month == 2
                and day == 29
            ):
                if not (
                    self.leap_day_adjustment == LeapDayAdjustmentType.DROP_DEC31
                    and month == 12
                    and day == 31
                ):
                    if not (
                        self.leap_day_adjustment == LeapDayAdjustmentType.DROP_JAN1
                        and month == 1
                        and day == 1
                    ):
                        yield cur_idx
            cur += self.frequency
            cur_idx += 1
338
+
339
+
340
def adjust_timestamp_by_dst_offset(timestamp, frequency):
    """Shift a timestamp back by its DST offset at daily-or-coarser frequency.

    Ensures that a time series at daily (or lower) frequency returns each day
    at the same clock time in prevailing time, matching the behavior of most
    standard libraries (e.g., 2018-03-11 00:00, 2018-03-12 00:00... rather
    than 2018-03-11 00:00, 2018-03-12 01:00...).

    Parameters
    ----------
    timestamp : datetime
        A tz-aware datetime.
    frequency : timedelta
        Series frequency; sub-daily frequencies are returned unchanged.
    """
    # Sub-daily series keep their absolute-time spacing untouched.
    if frequency < timedelta(hours=24):
        return timestamp

    dst_offset = timestamp.dst()
    if not dst_offset:
        return timestamp
    return timestamp - dst_offset
@@ -0,0 +1,103 @@
1
+ """Functions related to time"""
2
+
3
+ from datetime import datetime
4
+
5
+ import logging
6
+
7
+
8
+ import pandas as pd
9
+
10
+ from dsgrid.dimension.time import (
11
+ DatetimeRange,
12
+ TimeBasedDataAdjustmentModel,
13
+ TimeDimensionType,
14
+ )
15
+ from dsgrid.config.dimensions import TimeRangeModel, AnnualRangeModel
16
+
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
def build_annual_ranges(
    time_ranges: list[AnnualRangeModel],
    tz: str | None = None,
) -> list[tuple[pd.Timestamp, pd.Timestamp, int]]:
    """Convert annual range models into (start, end, frequency) tuples.

    The returned tuples are sorted by start timestamp; frequency is an
    integer number of years.
    """

    def _convert(model):
        # Parse the year strings with the model's own format, then localize.
        first = datetime.strptime(model.start, model.str_format)
        last = datetime.strptime(model.end, model.str_format)
        assert isinstance(model.frequency, int)
        return (pd.Timestamp(first, tz=tz), pd.Timestamp(last, tz=tz), model.frequency)

    return sorted((_convert(model) for model in time_ranges), key=lambda rng: rng[0])
35
+
36
+
37
def build_time_ranges(
    time_ranges: list[TimeRangeModel],
    tz: str | None = None,
) -> list[tuple[pd.Timestamp, pd.Timestamp, pd.Timedelta]]:
    """Convert time range models into (start, end, frequency) tuples.

    Each parsed datetime is rebuilt from its components so that any tzinfo
    produced by ``str_format`` is discarded before localizing to ``tz``.
    The returned tuples are sorted by start timestamp.
    """

    def _rebuild_naive(value: datetime) -> datetime:
        # Reconstruct from fields: strips tzinfo parsed via str_format.
        return datetime(
            value.year,
            value.month,
            value.day,
            value.hour,
            value.minute,
            value.second,
            value.microsecond,
        )

    converted = []
    for model in time_ranges:
        begin = _rebuild_naive(datetime.strptime(model.start, model.str_format))
        finish = _rebuild_naive(datetime.strptime(model.end, model.str_format))
        step = pd.Timedelta(model.frequency)
        converted.append((pd.Timestamp(begin, tz=tz), pd.Timestamp(finish, tz=tz), step))

    converted.sort(key=lambda rng: rng[0])
    return converted
68
+
69
+
70
def get_time_ranges(
    time_dimension_config,  #: DateTimeDimensionConfig,
    time_zone: str | None = None,
    time_based_data_adjustment: TimeBasedDataAdjustmentModel | None = None,
) -> list[DatetimeRange]:
    """Build DatetimeRange objects for a datetime or index time dimension.

    Parameters
    ----------
    time_dimension_config
        A DateTimeDimensionConfig (type kept as a comment, presumably to
        avoid a circular import).
    time_zone
        Target time zone. If None, falls back to
        ``time_dimension_config.get_tzinfo()`` -- NOTE(review): that appears
        to return a tzinfo object rather than a str; confirm intended type.
    time_based_data_adjustment
        Optional adjustments forwarded to each DatetimeRange.

    Raises
    ------
    ValueError
        If the dimension's time_type is not DATETIME or INDEX.
    """
    dim_model = time_dimension_config.model
    if time_zone is None:
        time_zone = time_dimension_config.get_tzinfo()

    if dim_model.time_type == TimeDimensionType.DATETIME:
        dt_ranges = dim_model.ranges
    elif dim_model.time_type == TimeDimensionType.INDEX:
        # NOTE(review): reaches into a private method of the config object.
        dt_ranges = time_dimension_config._create_represented_time_ranges()
    else:
        msg = f"Cannot support time_dimension_config model of time_type {dim_model.time_type}."
        raise ValueError(msg)

    ranges = []
    for start, end, freq in build_time_ranges(dt_ranges, tz=time_zone):
        ranges.append(
            DatetimeRange(
                start=start,
                end=end,
                frequency=freq,
                time_based_data_adjustment=time_based_data_adjustment,
            )
        )

    return ranges
99
+
100
+
101
def is_leap_year(year: int) -> bool:
    """Return True if the year is a leap year (Gregorian rules)."""
    # Century years are leap only when divisible by 400.
    if year % 400 == 0:
        return True
    return year % 4 == 0 and year % 100 != 0
dsgrid/dsgrid_rc.py ADDED
@@ -0,0 +1,88 @@
1
+ """Manages the dsgrid runtime configuration file"""
2
+
3
+ import logging
4
+ import os
5
+ import sys
6
+ from pathlib import Path
7
+ from typing import Any
8
+ from warnings import warn
9
+
10
+ import json5
11
+ from pydantic import model_validator
12
+
13
+ from dsgrid.common import BackendEngine, DEFAULT_DB_PASSWORD, DEFAULT_SCRATCH_DIR
14
+ from dsgrid.data_models import DSGBaseModel
15
+
16
+ RC_FILENAME = ".dsgrid.json5"
17
+ DEFAULT_BACKEND = BackendEngine.DUCKDB
18
+ DEFAULT_THRIFT_SERVER_URL = "hive://localhost:10000/default"
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class DsgridRuntimeConfig(DSGBaseModel):
    """Defines the runtime config that can be stored in users' home directories.

    The file lives at ``~/.dsgrid.json5`` (see :meth:`path`). Selected fields
    can be overridden by environment variables; see ``environment_overrides``.
    """

    # Registry database connection settings. Credentials are intentionally
    # not written back to disk by dump().
    database_url: str | None = None
    database_user: str = "root"
    database_password: str = DEFAULT_DB_PASSWORD
    offline: bool = True
    # Compute backend; DuckDB by default.
    backend_engine: BackendEngine = DEFAULT_BACKEND
    thrift_server_url: str = DEFAULT_THRIFT_SERVER_URL
    use_hive_metastore: bool = False
    # Logging levels for the console and file handlers.
    console_level: str = "info"
    file_level: str = "info"
    # Enable timing instrumentation -- presumably consumed by dsgrid.utils.timing; confirm.
    timings: bool = False
    reraise_exceptions: bool = False
    scratch_dir: None | Path = None

    @model_validator(mode="before")
    @classmethod
    def environment_overrides(cls, values: dict[str, Any]) -> dict[str, Any]:
        """Apply environment-variable overrides before field validation."""
        for env, field in (
            ("DSGRID_BACKEND_ENGINE", "backend_engine"),
            ("THRIFT_SERVER_URL", "thrift_server_url"),
        ):
            if env in os.environ:
                values[field] = os.environ[env]
        return values

    @model_validator(mode="before")
    @classmethod
    def remove_legacy_fields(cls, data: dict[str, Any]) -> dict[str, Any]:
        """Strip deprecated fields from loaded data, warning the user."""
        for field in ("database_name",):
            res = data.pop(field, None)
            if res is not None:
                warn(
                    f"The dsgrid runtime config field {field} is deprecated. Please remove it. "
                    "This will cause an error in a future release.",
                )
        return data

    @classmethod
    def load(cls) -> "DsgridRuntimeConfig":
        """Load the dsgrid runtime config if it exists or one with default values."""
        rc_file = cls.path()
        if rc_file.exists():
            # utf-8-sig tolerates a BOM written by some Windows editors.
            data = json5.loads(rc_file.read_text(encoding="utf-8-sig"))
            return cls(**data)
        return cls()

    def dump(self) -> None:
        """Dump the config to the user's home directory."""
        path = self.path()
        data = self.model_dump()
        # Never persist credentials to disk.
        data.pop("database_user")
        data.pop("database_password")
        with open(path, "w") as f_out:
            json5.dump(data, f_out, indent=2)
        print(f"Wrote dsgrid config to {path}", file=sys.stderr)

    @staticmethod
    def path() -> Path:
        """Return the path to the config file."""
        return Path.home() / RC_FILENAME

    def get_scratch_dir(self) -> Path:
        """Return the scratch_dir to use."""
        return (self.scratch_dir or Path(DEFAULT_SCRATCH_DIR)).resolve()
dsgrid/exceptions.py ADDED
@@ -0,0 +1,105 @@
1
+ """dsgrid exceptions and warnings"""
2
+
3
+
4
class DSGBaseException(Exception):
    """Base class for all dsgrid exceptions."""


class DSGBaseWarning(Warning):
    """Base class for all dsgrid warnings."""


class DSGInvalidDataset(DSGBaseException):
    """Raised if a dataset is invalid."""


class DSGInvalidField(DSGBaseException):
    """Raised if a field is missing or invalid."""


class DSGInvalidParameter(DSGBaseException):
    """Raised if a parameter is invalid."""


class DSGInvalidFile(DSGBaseException):
    """Raised if a file cannot be read. Possible reason is that the write operation failed."""


class DSGInvalidDimension(DSGBaseException):
    """Raised if a type is not stored or is invalid."""


class DSGInvalidDimensionAssociation(DSGBaseException):
    """Raised if an association is not stored or is invalid."""


class DSGInvalidDimensionMapping(DSGBaseException):
    """Raised if a mapping is not stored or is invalid."""


class DSGInvalidQuery(DSGBaseException):
    """Raised if a query is invalid."""


class DSGMissingDimensionMapping(DSGBaseException):
    """Raised if a mapping is not provided."""


class DSGInvalidOperation(DSGBaseException):
    """Raised if a requested user operation is invalid."""


class DSGRuntimeError(DSGBaseException):
    """Raised if there was a generic runtime error."""


class DSGProjectConfigError(DSGBaseException):
    """Error for bad project configuration inputs"""


class DSGDatasetConfigError(DSGBaseException):
    """Error for bad dataset configuration inputs"""


# Consistency fix: derive from DSGBaseWarning (like DSGConfigWarning and
# DSGFileInputWarning) instead of bare Warning. Backward compatible: it is
# still a Warning subclass.
class DSGDuplicateValueRegistered(DSGBaseWarning):
    """Warning issued if the user attempts to register a duplicate value."""


class DSGValueNotRegistered(DSGBaseException):
    """Raised if a value is not registered."""


class DSGValueNotStored(DSGBaseException):
    """Raised if a value is not stored."""


class DSGConfigWarning(DSGBaseWarning):
    """Warning for unclear or default configuration inputs"""


class DSGFileInputError(DSGBaseException):
    """Error during input file checks."""


class DSGFileInputWarning(DSGBaseWarning):
    """Warning during input file checks."""


class DSGJSONError(DSGBaseException):
    """Error with JSON file"""


class DSGFilesystemInterfaceError(DSGBaseException):
    """Error with FileSystemInterface command"""


class DSGRegistryLockError(DSGBaseException):
    """Error with a locked registry"""


class DSGMakeLockError(DSGBaseException):
    """Error when making registry lock"""


class DSGInvalidRegistryState(DSGBaseException):
    """Invalid Registry State"""
File without changes
@@ -0,0 +1,32 @@
1
+ """Abstract implementation for a cloud filesystem"""
2
+
3
+ import logging
4
+ import abc
5
+
6
+ from .filesystem_interface import FilesystemInterface
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class CloudFilesystemInterface(FilesystemInterface, abc.ABC):
    """Interface to access and edit directories and files on remote cloud filesystem.

    Extends FilesystemInterface with version-related operations; concrete
    subclasses must implement both abstract methods below.
    """

    @abc.abstractmethod
    def check_versions(self, directory):
        """Check for multiple versions and versioning expectations of files.

        Parameters
        ----------
        directory : str
            Directory path
        """

    @abc.abstractmethod
    def list_versions(self, path):
        """List all versions of an S3 file object. Only possible in versioned buckets.

        Parameters
        ----------
        path : str
            Path
        """