dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. build_backend.py +93 -0
  2. dsgrid/__init__.py +22 -0
  3. dsgrid/api/__init__.py +0 -0
  4. dsgrid/api/api_manager.py +179 -0
  5. dsgrid/api/app.py +419 -0
  6. dsgrid/api/models.py +60 -0
  7. dsgrid/api/response_models.py +116 -0
  8. dsgrid/apps/__init__.py +0 -0
  9. dsgrid/apps/project_viewer/app.py +216 -0
  10. dsgrid/apps/registration_gui.py +444 -0
  11. dsgrid/chronify.py +32 -0
  12. dsgrid/cli/__init__.py +0 -0
  13. dsgrid/cli/common.py +120 -0
  14. dsgrid/cli/config.py +176 -0
  15. dsgrid/cli/download.py +13 -0
  16. dsgrid/cli/dsgrid.py +157 -0
  17. dsgrid/cli/dsgrid_admin.py +92 -0
  18. dsgrid/cli/install_notebooks.py +62 -0
  19. dsgrid/cli/query.py +729 -0
  20. dsgrid/cli/registry.py +1862 -0
  21. dsgrid/cloud/__init__.py +0 -0
  22. dsgrid/cloud/cloud_storage_interface.py +140 -0
  23. dsgrid/cloud/factory.py +31 -0
  24. dsgrid/cloud/fake_storage_interface.py +37 -0
  25. dsgrid/cloud/s3_storage_interface.py +156 -0
  26. dsgrid/common.py +36 -0
  27. dsgrid/config/__init__.py +0 -0
  28. dsgrid/config/annual_time_dimension_config.py +194 -0
  29. dsgrid/config/common.py +142 -0
  30. dsgrid/config/config_base.py +148 -0
  31. dsgrid/config/dataset_config.py +907 -0
  32. dsgrid/config/dataset_schema_handler_factory.py +46 -0
  33. dsgrid/config/date_time_dimension_config.py +136 -0
  34. dsgrid/config/dimension_config.py +54 -0
  35. dsgrid/config/dimension_config_factory.py +65 -0
  36. dsgrid/config/dimension_mapping_base.py +350 -0
  37. dsgrid/config/dimension_mappings_config.py +48 -0
  38. dsgrid/config/dimensions.py +1025 -0
  39. dsgrid/config/dimensions_config.py +71 -0
  40. dsgrid/config/file_schema.py +190 -0
  41. dsgrid/config/index_time_dimension_config.py +80 -0
  42. dsgrid/config/input_dataset_requirements.py +31 -0
  43. dsgrid/config/mapping_tables.py +209 -0
  44. dsgrid/config/noop_time_dimension_config.py +42 -0
  45. dsgrid/config/project_config.py +1462 -0
  46. dsgrid/config/registration_models.py +188 -0
  47. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  48. dsgrid/config/simple_models.py +49 -0
  49. dsgrid/config/supplemental_dimension.py +29 -0
  50. dsgrid/config/time_dimension_base_config.py +192 -0
  51. dsgrid/data_models.py +155 -0
  52. dsgrid/dataset/__init__.py +0 -0
  53. dsgrid/dataset/dataset.py +123 -0
  54. dsgrid/dataset/dataset_expression_handler.py +86 -0
  55. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  56. dsgrid/dataset/dataset_schema_handler_base.py +945 -0
  57. dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
  58. dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
  59. dsgrid/dataset/growth_rates.py +162 -0
  60. dsgrid/dataset/models.py +51 -0
  61. dsgrid/dataset/table_format_handler_base.py +257 -0
  62. dsgrid/dataset/table_format_handler_factory.py +17 -0
  63. dsgrid/dataset/unpivoted_table.py +121 -0
  64. dsgrid/dimension/__init__.py +0 -0
  65. dsgrid/dimension/base_models.py +230 -0
  66. dsgrid/dimension/dimension_filters.py +308 -0
  67. dsgrid/dimension/standard.py +252 -0
  68. dsgrid/dimension/time.py +352 -0
  69. dsgrid/dimension/time_utils.py +103 -0
  70. dsgrid/dsgrid_rc.py +88 -0
  71. dsgrid/exceptions.py +105 -0
  72. dsgrid/filesystem/__init__.py +0 -0
  73. dsgrid/filesystem/cloud_filesystem.py +32 -0
  74. dsgrid/filesystem/factory.py +32 -0
  75. dsgrid/filesystem/filesystem_interface.py +136 -0
  76. dsgrid/filesystem/local_filesystem.py +74 -0
  77. dsgrid/filesystem/s3_filesystem.py +118 -0
  78. dsgrid/loggers.py +132 -0
  79. dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
  80. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
  81. dsgrid/notebooks/registration.ipynb +48 -0
  82. dsgrid/notebooks/start_notebook.sh +11 -0
  83. dsgrid/project.py +451 -0
  84. dsgrid/query/__init__.py +0 -0
  85. dsgrid/query/dataset_mapping_plan.py +142 -0
  86. dsgrid/query/derived_dataset.py +388 -0
  87. dsgrid/query/models.py +728 -0
  88. dsgrid/query/query_context.py +287 -0
  89. dsgrid/query/query_submitter.py +994 -0
  90. dsgrid/query/report_factory.py +19 -0
  91. dsgrid/query/report_peak_load.py +70 -0
  92. dsgrid/query/reports_base.py +20 -0
  93. dsgrid/registry/__init__.py +0 -0
  94. dsgrid/registry/bulk_register.py +165 -0
  95. dsgrid/registry/common.py +287 -0
  96. dsgrid/registry/config_update_checker_base.py +63 -0
  97. dsgrid/registry/data_store_factory.py +34 -0
  98. dsgrid/registry/data_store_interface.py +74 -0
  99. dsgrid/registry/dataset_config_generator.py +158 -0
  100. dsgrid/registry/dataset_registry_manager.py +950 -0
  101. dsgrid/registry/dataset_update_checker.py +16 -0
  102. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  103. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  104. dsgrid/registry/dimension_registry_manager.py +413 -0
  105. dsgrid/registry/dimension_update_checker.py +16 -0
  106. dsgrid/registry/duckdb_data_store.py +207 -0
  107. dsgrid/registry/filesystem_data_store.py +150 -0
  108. dsgrid/registry/filter_registry_manager.py +123 -0
  109. dsgrid/registry/project_config_generator.py +57 -0
  110. dsgrid/registry/project_registry_manager.py +1623 -0
  111. dsgrid/registry/project_update_checker.py +48 -0
  112. dsgrid/registry/registration_context.py +223 -0
  113. dsgrid/registry/registry_auto_updater.py +316 -0
  114. dsgrid/registry/registry_database.py +667 -0
  115. dsgrid/registry/registry_interface.py +446 -0
  116. dsgrid/registry/registry_manager.py +558 -0
  117. dsgrid/registry/registry_manager_base.py +367 -0
  118. dsgrid/registry/versioning.py +92 -0
  119. dsgrid/rust_ext/__init__.py +14 -0
  120. dsgrid/rust_ext/find_minimal_patterns.py +129 -0
  121. dsgrid/spark/__init__.py +0 -0
  122. dsgrid/spark/functions.py +589 -0
  123. dsgrid/spark/types.py +110 -0
  124. dsgrid/tests/__init__.py +0 -0
  125. dsgrid/tests/common.py +140 -0
  126. dsgrid/tests/make_us_data_registry.py +265 -0
  127. dsgrid/tests/register_derived_datasets.py +103 -0
  128. dsgrid/tests/utils.py +25 -0
  129. dsgrid/time/__init__.py +0 -0
  130. dsgrid/time/time_conversions.py +80 -0
  131. dsgrid/time/types.py +67 -0
  132. dsgrid/units/__init__.py +0 -0
  133. dsgrid/units/constants.py +113 -0
  134. dsgrid/units/convert.py +71 -0
  135. dsgrid/units/energy.py +145 -0
  136. dsgrid/units/power.py +87 -0
  137. dsgrid/utils/__init__.py +0 -0
  138. dsgrid/utils/dataset.py +830 -0
  139. dsgrid/utils/files.py +179 -0
  140. dsgrid/utils/filters.py +125 -0
  141. dsgrid/utils/id_remappings.py +100 -0
  142. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  143. dsgrid/utils/py_expression_eval/README.md +8 -0
  144. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  145. dsgrid/utils/py_expression_eval/tests.py +283 -0
  146. dsgrid/utils/run_command.py +70 -0
  147. dsgrid/utils/scratch_dir_context.py +65 -0
  148. dsgrid/utils/spark.py +918 -0
  149. dsgrid/utils/spark_partition.py +98 -0
  150. dsgrid/utils/timing.py +239 -0
  151. dsgrid/utils/utilities.py +221 -0
  152. dsgrid/utils/versioning.py +36 -0
  153. dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
  154. dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
  155. dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
  156. dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
  157. dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
@@ -0,0 +1,287 @@
1
+ import logging
2
+ from contextlib import contextmanager
3
+ from pathlib import Path
4
+ from typing import Generator
5
+
6
+ from dsgrid.dataset.models import (
7
+ ValueFormat,
8
+ StackedTableFormatModel,
9
+ PivotedTableFormatModel,
10
+ )
11
+ from dsgrid.common import VALUE_COLUMN
12
+ from dsgrid.dimension.base_models import DimensionType
13
+ from dsgrid.config.project_config import DatasetBaseDimensionNamesModel
14
+ from dsgrid.dataset.dataset_mapping_manager import DatasetMappingManager
15
+ from dsgrid.query.dataset_mapping_plan import DatasetMappingPlan, MapOperationCheckpoint
16
+ from dsgrid.spark.functions import drop_temp_tables_and_views
17
+ from dsgrid.spark.types import DataFrame
18
+ from dsgrid.utils.spark import get_spark_session
19
+ from dsgrid.utils.scratch_dir_context import ScratchDirContext
20
+ from .models import ColumnType, DatasetMetadataModel, DimensionMetadataModel, QueryBaseModel
21
+
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
class QueryContext:
    """Maintains context of the query as it is processed through the stack.

    Holds the query model, metadata for the overall result table, per-dataset
    metadata keyed by dataset ID, a scratch-directory context, cached dimension
    record IDs, and an optional checkpoint used to resume dataset mapping.
    """

    def __init__(
        self,
        model: QueryBaseModel,
        base_dimension_names: DatasetBaseDimensionNamesModel,
        scratch_dir_context: ScratchDirContext,
        checkpoint: MapOperationCheckpoint | None = None,
    ) -> None:
        self._model = model
        # Record IDs are stored as plain tuples rather than DataFrames; see the
        # note in set_record_ids_by_dimension_type for why.
        self._record_ids_by_dimension_type: dict[DimensionType, list[tuple[str]]] = {}
        # Metadata for the overall query result. Per-dataset metadata is kept
        # separately in self._dataset_metadata, keyed by dataset ID.
        self._metadata = DatasetMetadataModel(
            table_format=self.model.result.table_format,
            base_dimension_names=base_dimension_names,
        )
        self._dataset_metadata: dict[str, DatasetMetadataModel] = {}
        self._scratch_dir_context = scratch_dir_context
        self._checkpoint = checkpoint

    @property
    def metadata(self) -> DatasetMetadataModel:
        """Return the metadata for the overall query result."""
        return self._metadata

    @metadata.setter
    def metadata(self, val: DatasetMetadataModel) -> None:
        """Replace the metadata for the overall query result."""
        self._metadata = val

    @property
    def model(self) -> QueryBaseModel:
        """Return the query model being processed."""
        return self._model

    @property
    def base_dimension_names(self) -> DatasetBaseDimensionNamesModel:
        """Return the base dimension names recorded in the result metadata."""
        return self._metadata.base_dimension_names

    @property
    def scratch_dir_context(self) -> ScratchDirContext:
        """Return the context for managing scratch directories."""
        return self._scratch_dir_context

    def consolidate_dataset_metadata(self) -> None:
        """Rebuild the main dimension metadata from all per-dataset metadata.

        Clears the main metadata for each dimension type and re-populates it
        with the union of per-dataset entries, de-duplicated by make_key().
        """
        for dim_type in DimensionType:
            main_metadata = self._metadata.dimensions.get_metadata(dim_type)
            main_metadata.clear()
            keys = set()
            for dataset_metadata in self._dataset_metadata.values():
                for metadata in dataset_metadata.dimensions.get_metadata(dim_type):
                    key = metadata.make_key()
                    if key not in keys:
                        main_metadata.append(metadata)
                        keys.add(key)

    def finalize(self) -> None:
        """Perform cleanup by dropping temporary tables and views."""
        drop_temp_tables_and_views()

    def get_value_columns(self) -> set[str]:
        """Return the value columns in the final dataset.

        For a pivoted table these are the pivoted dimension's columns; for a
        stacked table it is the single shared value column.
        """
        match self.get_value_format():
            case ValueFormat.PIVOTED:
                return self.get_pivoted_columns()
            case ValueFormat.STACKED:
                return {VALUE_COLUMN}
            case _:
                msg = str(self.get_value_format())
                raise NotImplementedError(msg)

    def get_pivoted_columns(self) -> set[str]:
        """Return the column names of the pivoted dimension.

        Raises if the result table is not in pivoted format.
        """
        if self.get_value_format() != ValueFormat.PIVOTED:
            msg = "Bug: get_pivoted_columns is only supported on a pivoted table"
            raise Exception(msg)
        metadata = self._get_metadata()
        assert isinstance(metadata.table_format, PivotedTableFormatModel)
        return self.get_dimension_column_names(metadata.table_format.pivoted_dimension_type)

    def get_pivoted_dimension_type(self) -> DimensionType:
        """Return the dimension type that is pivoted in the result table.

        Raises if the result table is not in pivoted format.
        """
        if self.get_value_format() != ValueFormat.PIVOTED:
            msg = "Bug: get_pivoted_dimension_type is only supported on a pivoted table"
            raise Exception(msg)
        metadata = self._get_metadata()
        assert isinstance(metadata.table_format, PivotedTableFormatModel)
        return metadata.table_format.pivoted_dimension_type

    def get_value_format(self, dataset_id: str | None = None) -> ValueFormat:
        """Return the table value format, coerced to a ValueFormat enum.

        If dataset_id is None, uses the overall result metadata; otherwise the
        named dataset's metadata.
        """
        val = self._get_metadata(dataset_id).table_format.format_type
        # format_type may be a raw string (e.g. after deserialization); coerce.
        if not isinstance(val, ValueFormat):
            val = ValueFormat(val)
        return val

    def set_value_format(self, val: ValueFormat) -> None:
        """Set the value format on the overall result metadata."""
        if not isinstance(val, ValueFormat):
            val = ValueFormat(val)
        self._metadata.table_format.format_type = val

    def get_dimension_column_names(
        self, dimension_type: DimensionType, dataset_id: str | None = None
    ) -> set[str]:
        """Return the load data column names for the dimension."""
        return self._get_metadata(dataset_id).dimensions.get_column_names(dimension_type)

    def get_all_dimension_column_names(
        self, dataset_id: str | None = None, exclude: set[DimensionType] | None = None
    ) -> set[str]:
        """Return load data column names across all dimension types.

        Dimension types listed in exclude are skipped.
        """
        names = set()
        for dimension_type in DimensionType:
            if exclude is not None and dimension_type in exclude:
                continue
            names.update(self.get_dimension_column_names(dimension_type, dataset_id=dataset_id))
        return names

    def get_dimension_names(
        self, dimension_type: DimensionType, dataset_id: str | None = None
    ) -> set[str]:
        """Return the dimension names recorded for the given dimension type."""
        return self._get_metadata(dataset_id).dimensions.get_dimension_names(dimension_type)

    def get_all_dimension_names(
        self, dataset_id: str | None = None, exclude: set[DimensionType] | None = None
    ) -> set[str]:
        """Return dimension names across all dimension types.

        Dimension types listed in exclude are skipped.
        """
        names = set()
        for dimension_type in DimensionType:
            if exclude is not None and dimension_type in exclude:
                continue
            names.update(self.get_dimension_names(dimension_type, dataset_id=dataset_id))
        return names

    def set_dataset_metadata(
        self,
        dataset_id: str,
        column_type: ColumnType,
        mapped_time_columns: list[str],
    ) -> None:
        """Initialize metadata for a dataset in stacked format.

        Records one DimensionMetadataModel per dimension type, choosing column
        names by the dataset's column type: time columns come from
        mapped_time_columns, otherwise either the dimension name or the
        dimension type's value is used as the single column name.
        """
        table_format = StackedTableFormatModel()
        self._dataset_metadata[dataset_id] = DatasetMetadataModel(table_format=table_format)
        base_dimension_names = self.base_dimension_names
        for dim_type in DimensionType:
            name = getattr(base_dimension_names, dim_type.value)
            assert name is not None
            match (column_type, dim_type):
                case (ColumnType.DIMENSION_TYPES, DimensionType.TIME):
                    column_names = mapped_time_columns
                case (ColumnType.DIMENSION_NAMES, _):
                    column_names = [name]
                case (ColumnType.DIMENSION_TYPES, _):
                    column_names = [dim_type.value]
                case _:
                    msg = f"Bug: need to support {column_type=} {dim_type=}"
                    raise NotImplementedError(msg)
            self.add_dimension_metadata(
                dim_type,
                DimensionMetadataModel(dimension_name=name, column_names=column_names),
                dataset_id=dataset_id,
            )

    def convert_to_pivoted(self) -> str:
        """Switch the result metadata to pivoted format.

        Returns the single column name of the pivoted dimension.
        """
        assert isinstance(self.model.result.table_format, PivotedTableFormatModel)
        pivoted_dimension_type = self.model.result.table_format.pivoted_dimension_type
        self.set_value_format(ValueFormat.PIVOTED)
        columns = self.get_dimension_column_names(pivoted_dimension_type)
        names = self.get_dimension_names(pivoted_dimension_type)
        if len(columns) != 1 or len(names) != 1:
            # This is checked in the query model and so this should never happen.
            msg = (
                "Bug: The pivoted dimension can only have 1 column and 1 name: "
                f"{columns=} {names=}"
            )
            raise Exception(msg)
        return next(iter(columns))

    def serialize_dataset_metadata_to_file(self, dataset_id: str, filename: Path) -> None:
        """Write the dataset's metadata to filename as indented JSON."""
        filename.write_text(self._dataset_metadata[dataset_id].model_dump_json(indent=2))

    def set_dataset_metadata_from_file(self, dataset_id: str, filename: Path) -> None:
        """Load metadata for a dataset from a JSON file.

        The dataset must not already have metadata recorded.
        """
        assert dataset_id not in self._dataset_metadata, dataset_id
        self._dataset_metadata[dataset_id] = DatasetMetadataModel.from_file(filename)

    def add_dimension_metadata(
        self,
        dimension_type: DimensionType,
        dimension_metadata: DimensionMetadataModel,
        dataset_id: str | None = None,
    ) -> None:
        """Append metadata for a dimension type (overall or per-dataset)."""
        self._get_metadata(dataset_id).dimensions.add_metadata(dimension_type, dimension_metadata)
        logger.debug(
            "Added dimension name for %s: %s dataset_id=%s",
            dimension_type,
            dimension_metadata,
            dataset_id,
        )

    def get_dimension_column_names_by_name(
        self,
        dimension_type: DimensionType,
        name: str,
        dataset_id: str | None = None,
    ) -> list[str]:
        """Return the load data column names for the dimension."""
        for metadata in self.get_dimension_metadata(dimension_type, dataset_id=dataset_id):
            if metadata.dimension_name == name:
                return metadata.column_names
        msg = f"No dimension match: {dimension_type=} {name=}"
        raise Exception(msg)

    def get_dimension_metadata(
        self,
        dimension_type: DimensionType,
        dataset_id: str | None = None,
    ) -> list[DimensionMetadataModel]:
        """Return all metadata entries for a dimension type."""
        return self._get_metadata(dataset_id).dimensions.get_metadata(dimension_type)

    def replace_dimension_metadata(
        self,
        dimension_type: DimensionType,
        dimension_metadata: list[DimensionMetadataModel],
        dataset_id: str | None = None,
    ) -> None:
        """Replace all metadata entries for a dimension type."""
        self._get_metadata(dataset_id).dimensions.replace_metadata(
            dimension_type, dimension_metadata
        )
        logger.debug(
            "Replaced dimension for %s: %s dataset_id=%s",
            dimension_type,
            dimension_metadata,
            dataset_id,
        )

    def _get_metadata(self, dataset_id: str | None = None) -> DatasetMetadataModel:
        # Dispatch between the overall result metadata and per-dataset metadata.
        return self._metadata if dataset_id is None else self._dataset_metadata[dataset_id]

    def get_record_ids(self) -> dict[DimensionType, DataFrame]:
        """Return cached record IDs as single-column DataFrames, by dimension type."""
        spark = get_spark_session()
        return {
            k: spark.createDataFrame(v, ["id"])
            for k, v in self._record_ids_by_dimension_type.items()
        }

    def try_get_record_ids_by_dimension_type(self, dim_type: DimensionType) -> DataFrame | None:
        """Return the record IDs for one dimension type, or None if not set."""
        records = self._record_ids_by_dimension_type.get(dim_type)
        if records is None:
            return records

        spark = get_spark_session()
        # NOTE(review): the column is named dim_type.value here but "id" in
        # get_record_ids — confirm both call sites expect these names.
        return spark.createDataFrame(records, [dim_type.value])

    def set_record_ids_by_dimension_type(
        self, dim_type: DimensionType, record_ids: DataFrame
    ) -> None:
        """Cache the record IDs of a dimension type from a single-column DataFrame."""
        # Can't keep the dataframes in memory because of spark restarts.
        self._record_ids_by_dimension_type[dim_type] = [(x.id,) for x in record_ids.collect()]

    @contextmanager
    def dataset_mapping_manager(
        self, dataset_id: str, plan: DatasetMappingPlan
    ) -> Generator[DatasetMappingManager, None, None]:
        """Start a mapping manager for a dataset."""
        # Only pass the checkpoint through when it belongs to this dataset.
        checkpoint = (
            self._checkpoint
            if self._checkpoint is not None and self._checkpoint.dataset_id == dataset_id
            else None
        )
        with DatasetMappingManager(dataset_id, plan, self._scratch_dir_context, checkpoint) as mgr:
            yield mgr