dsgrid-toolkit 0.3.3 (cp313-cp313-win_amd64.whl)

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. build_backend.py +93 -0
  2. dsgrid/__init__.py +22 -0
  3. dsgrid/api/__init__.py +0 -0
  4. dsgrid/api/api_manager.py +179 -0
  5. dsgrid/api/app.py +419 -0
  6. dsgrid/api/models.py +60 -0
  7. dsgrid/api/response_models.py +116 -0
  8. dsgrid/apps/__init__.py +0 -0
  9. dsgrid/apps/project_viewer/app.py +216 -0
  10. dsgrid/apps/registration_gui.py +444 -0
  11. dsgrid/chronify.py +32 -0
  12. dsgrid/cli/__init__.py +0 -0
  13. dsgrid/cli/common.py +120 -0
  14. dsgrid/cli/config.py +176 -0
  15. dsgrid/cli/download.py +13 -0
  16. dsgrid/cli/dsgrid.py +157 -0
  17. dsgrid/cli/dsgrid_admin.py +92 -0
  18. dsgrid/cli/install_notebooks.py +62 -0
  19. dsgrid/cli/query.py +729 -0
  20. dsgrid/cli/registry.py +1862 -0
  21. dsgrid/cloud/__init__.py +0 -0
  22. dsgrid/cloud/cloud_storage_interface.py +140 -0
  23. dsgrid/cloud/factory.py +31 -0
  24. dsgrid/cloud/fake_storage_interface.py +37 -0
  25. dsgrid/cloud/s3_storage_interface.py +156 -0
  26. dsgrid/common.py +36 -0
  27. dsgrid/config/__init__.py +0 -0
  28. dsgrid/config/annual_time_dimension_config.py +194 -0
  29. dsgrid/config/common.py +142 -0
  30. dsgrid/config/config_base.py +148 -0
  31. dsgrid/config/dataset_config.py +907 -0
  32. dsgrid/config/dataset_schema_handler_factory.py +46 -0
  33. dsgrid/config/date_time_dimension_config.py +136 -0
  34. dsgrid/config/dimension_config.py +54 -0
  35. dsgrid/config/dimension_config_factory.py +65 -0
  36. dsgrid/config/dimension_mapping_base.py +350 -0
  37. dsgrid/config/dimension_mappings_config.py +48 -0
  38. dsgrid/config/dimensions.py +1025 -0
  39. dsgrid/config/dimensions_config.py +71 -0
  40. dsgrid/config/file_schema.py +190 -0
  41. dsgrid/config/index_time_dimension_config.py +80 -0
  42. dsgrid/config/input_dataset_requirements.py +31 -0
  43. dsgrid/config/mapping_tables.py +209 -0
  44. dsgrid/config/noop_time_dimension_config.py +42 -0
  45. dsgrid/config/project_config.py +1462 -0
  46. dsgrid/config/registration_models.py +188 -0
  47. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  48. dsgrid/config/simple_models.py +49 -0
  49. dsgrid/config/supplemental_dimension.py +29 -0
  50. dsgrid/config/time_dimension_base_config.py +192 -0
  51. dsgrid/data_models.py +155 -0
  52. dsgrid/dataset/__init__.py +0 -0
  53. dsgrid/dataset/dataset.py +123 -0
  54. dsgrid/dataset/dataset_expression_handler.py +86 -0
  55. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  56. dsgrid/dataset/dataset_schema_handler_base.py +945 -0
  57. dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
  58. dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
  59. dsgrid/dataset/growth_rates.py +162 -0
  60. dsgrid/dataset/models.py +51 -0
  61. dsgrid/dataset/table_format_handler_base.py +257 -0
  62. dsgrid/dataset/table_format_handler_factory.py +17 -0
  63. dsgrid/dataset/unpivoted_table.py +121 -0
  64. dsgrid/dimension/__init__.py +0 -0
  65. dsgrid/dimension/base_models.py +230 -0
  66. dsgrid/dimension/dimension_filters.py +308 -0
  67. dsgrid/dimension/standard.py +252 -0
  68. dsgrid/dimension/time.py +352 -0
  69. dsgrid/dimension/time_utils.py +103 -0
  70. dsgrid/dsgrid_rc.py +88 -0
  71. dsgrid/exceptions.py +105 -0
  72. dsgrid/filesystem/__init__.py +0 -0
  73. dsgrid/filesystem/cloud_filesystem.py +32 -0
  74. dsgrid/filesystem/factory.py +32 -0
  75. dsgrid/filesystem/filesystem_interface.py +136 -0
  76. dsgrid/filesystem/local_filesystem.py +74 -0
  77. dsgrid/filesystem/s3_filesystem.py +118 -0
  78. dsgrid/loggers.py +132 -0
  79. dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
  80. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
  81. dsgrid/notebooks/registration.ipynb +48 -0
  82. dsgrid/notebooks/start_notebook.sh +11 -0
  83. dsgrid/project.py +451 -0
  84. dsgrid/query/__init__.py +0 -0
  85. dsgrid/query/dataset_mapping_plan.py +142 -0
  86. dsgrid/query/derived_dataset.py +388 -0
  87. dsgrid/query/models.py +728 -0
  88. dsgrid/query/query_context.py +287 -0
  89. dsgrid/query/query_submitter.py +994 -0
  90. dsgrid/query/report_factory.py +19 -0
  91. dsgrid/query/report_peak_load.py +70 -0
  92. dsgrid/query/reports_base.py +20 -0
  93. dsgrid/registry/__init__.py +0 -0
  94. dsgrid/registry/bulk_register.py +165 -0
  95. dsgrid/registry/common.py +287 -0
  96. dsgrid/registry/config_update_checker_base.py +63 -0
  97. dsgrid/registry/data_store_factory.py +34 -0
  98. dsgrid/registry/data_store_interface.py +74 -0
  99. dsgrid/registry/dataset_config_generator.py +158 -0
  100. dsgrid/registry/dataset_registry_manager.py +950 -0
  101. dsgrid/registry/dataset_update_checker.py +16 -0
  102. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  103. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  104. dsgrid/registry/dimension_registry_manager.py +413 -0
  105. dsgrid/registry/dimension_update_checker.py +16 -0
  106. dsgrid/registry/duckdb_data_store.py +207 -0
  107. dsgrid/registry/filesystem_data_store.py +150 -0
  108. dsgrid/registry/filter_registry_manager.py +123 -0
  109. dsgrid/registry/project_config_generator.py +57 -0
  110. dsgrid/registry/project_registry_manager.py +1623 -0
  111. dsgrid/registry/project_update_checker.py +48 -0
  112. dsgrid/registry/registration_context.py +223 -0
  113. dsgrid/registry/registry_auto_updater.py +316 -0
  114. dsgrid/registry/registry_database.py +667 -0
  115. dsgrid/registry/registry_interface.py +446 -0
  116. dsgrid/registry/registry_manager.py +558 -0
  117. dsgrid/registry/registry_manager_base.py +367 -0
  118. dsgrid/registry/versioning.py +92 -0
  119. dsgrid/rust_ext/__init__.py +14 -0
  120. dsgrid/rust_ext/find_minimal_patterns.py +129 -0
  121. dsgrid/spark/__init__.py +0 -0
  122. dsgrid/spark/functions.py +589 -0
  123. dsgrid/spark/types.py +110 -0
  124. dsgrid/tests/__init__.py +0 -0
  125. dsgrid/tests/common.py +140 -0
  126. dsgrid/tests/make_us_data_registry.py +265 -0
  127. dsgrid/tests/register_derived_datasets.py +103 -0
  128. dsgrid/tests/utils.py +25 -0
  129. dsgrid/time/__init__.py +0 -0
  130. dsgrid/time/time_conversions.py +80 -0
  131. dsgrid/time/types.py +67 -0
  132. dsgrid/units/__init__.py +0 -0
  133. dsgrid/units/constants.py +113 -0
  134. dsgrid/units/convert.py +71 -0
  135. dsgrid/units/energy.py +145 -0
  136. dsgrid/units/power.py +87 -0
  137. dsgrid/utils/__init__.py +0 -0
  138. dsgrid/utils/dataset.py +830 -0
  139. dsgrid/utils/files.py +179 -0
  140. dsgrid/utils/filters.py +125 -0
  141. dsgrid/utils/id_remappings.py +100 -0
  142. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  143. dsgrid/utils/py_expression_eval/README.md +8 -0
  144. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  145. dsgrid/utils/py_expression_eval/tests.py +283 -0
  146. dsgrid/utils/run_command.py +70 -0
  147. dsgrid/utils/scratch_dir_context.py +65 -0
  148. dsgrid/utils/spark.py +918 -0
  149. dsgrid/utils/spark_partition.py +98 -0
  150. dsgrid/utils/timing.py +239 -0
  151. dsgrid/utils/utilities.py +221 -0
  152. dsgrid/utils/versioning.py +36 -0
  153. dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
  154. dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
  155. dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
  156. dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
  157. dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
dsgrid/tests/common.py ADDED
@@ -0,0 +1,140 @@
+ import getpass
+ import os
+ from pathlib import Path
+
+ import pytest
+ from semver import VersionInfo
+
+ from dsgrid.exceptions import DSGInvalidParameter, DSGInvalidOperation
+ from dsgrid.registry.dimension_registry_manager import DimensionRegistryManager
+ from dsgrid.registry.registry_manager import RegistryManager
+ from dsgrid.registry.registry_manager_base import RegistryManagerBase
+ from dsgrid.registry.common import DataStoreType, DatabaseConnection, VersionUpdateType
+ from dsgrid.utils.files import dump_data, load_data
+
+ TEST_PROJECT_PATH = Path(__file__).absolute().parents[2] / "dsgrid-test-data"
+ TEST_PROJECT_REPO = TEST_PROJECT_PATH / "test_efs"
+ TEST_STANDARD_SCENARIOS_PROJECT_REPO = TEST_PROJECT_PATH / "standard_scenarios_2021"
+ TEST_DATASET_DIRECTORY = TEST_PROJECT_PATH / "datasets"
+ TEST_REGISTRY_DATABASE = "cached-test-dsgrid"
+ TEST_REGISTRY_BASE_PATH = Path("tests/data/registry")
+ TEST_REGISTRY_DATA_PATH = Path("tests/data/registry/registry_data")
+ TEST_EFS_REGISTRATION_FILE = Path("tests/data/test_efs_registration.json5")
+ # AWS_PROFILE_NAME = "nrel-aws-dsgrid"
+ TEST_REMOTE_REGISTRY = "s3://nrel-dsgrid-registry-test"
+ CACHED_TEST_REGISTRY_DB = f"sqlite:///{TEST_REGISTRY_BASE_PATH}/cached_registry.db"
+ STANDARD_SCENARIOS_PROJECT_REPO = Path(__file__).parents[2] / "dsgrid-project-StandardScenarios"
+ IEF_PROJECT_REPO = Path(__file__).parents[2] / "dsgrid-project-IEF"
+ SIMPLE_STANDARD_SCENARIOS = TEST_PROJECT_PATH / "filtered_registries" / "simple_standard_scenarios"
+ SIMPLE_STANDARD_SCENARIOS_REGISTRY_DB = (
+     f"sqlite:///{TEST_PROJECT_PATH}/filtered_registries/simple_standard_scenarios/registry.db"
+ )
+
+
+ def create_local_test_registry(
+     tmpdir: Path, conn=None, data_store_type: DataStoreType = DataStoreType.FILESYSTEM
+ ):
+     if conn is None:
+         conn = DatabaseConnection(url=f"sqlite:///{tmpdir}/dsgrid-test.db")
+     data_path = tmpdir / "registry_data"
+     mgr = RegistryManager.create(conn, data_path, data_store_type=data_store_type, overwrite=True)
+     mgr.dispose()
+     return data_path
+
+
+ def check_configs_update(base_dir, manager):
+     """Runs an update on one config of each type.
+
+     Parameters
+     ----------
+     base_dir : Path
+     manager : RegistryManager
+
+     Returns
+     -------
+     list
+         One tuple per updated config: (updated ID, new version), except for the
+         dimension entry, whose tuple is (updated ID, dimension type, new version).
+         Order is dimension, dimension mapping, dataset, project.
+
+     """
+     update_dir = base_dir / "updates"
+     user = getpass.getuser()
+
+     updated_ids = []
+     for mgr in (
+         manager.dimension_manager,
+         manager.dimension_mapping_manager,
+         manager.dataset_manager,
+         manager.project_manager,
+     ):
+         config_id = mgr.list_ids()[0]
+         version = mgr.get_latest_version(config_id)
+         check_config_update(update_dir, mgr, config_id, user, version)
+         new_version = mgr.get_latest_version(config_id)
+         if isinstance(mgr, DimensionRegistryManager):
+             config = mgr.get_by_id(config_id)
+             updated_ids.append((config_id, config.model.dimension_type, new_version))
+         else:
+             updated_ids.append((config_id, new_version))
+
+     return updated_ids
+
+
+ def check_config_update(base_dir, mgr: RegistryManagerBase, config_id, user, version):
+     """Runs basic positive and negative update tests for the config.
+
+     Parameters
+     ----------
+     base_dir : str
+     mgr : RegistryManagerBase
+     config_id : str
+     user : str
+     version : str
+
+     """
+     config_file = Path(base_dir) / mgr.config_class().config_filename()
+     assert not config_file.exists()
+     try:
+         mgr.dump(config_id, base_dir, force=True)
+         with pytest.raises(DSGInvalidOperation):
+             mgr.dump(config_id, base_dir)
+         mgr.dump(config_id, base_dir, force=True)
+         assert config_file.exists()
+         config_data = load_data(config_file)
+         config_data["description"] += "; updated description"
+         dump_data(config_data, config_file)
+         with pytest.raises(DSGInvalidParameter):
+             mgr.update_from_file(
+                 config_file,
+                 "invalid_config_id",
+                 user,
+                 VersionUpdateType.PATCH,
+                 "update to description",
+                 version,
+             )
+         with pytest.raises(DSGInvalidParameter):
+             mgr.update_from_file(
+                 config_file,
+                 config_id,
+                 user,
+                 VersionUpdateType.PATCH,
+                 "update to description",
+                 str(VersionInfo.parse(version).bump_patch()),
+             )
+
+         mgr.update_from_file(
+             config_file,
+             config_id,
+             user,
+             VersionUpdateType.PATCH,
+             "update to description",
+             version,
+         )
+         assert (
+             VersionInfo.parse(mgr.get_latest_version(config_id))
+             == VersionInfo.parse(version).bump_patch()
+         )
+     finally:
+         if config_file.exists():
+             os.remove(config_file)
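Taken together, these helpers support a short pytest flow: create a throwaway registry, load a manager against it, and run the update checks. A minimal sketch, assuming pytest's tmp_path fixture and a registry that already contains at least one config of each type (an empty registry would fail at list_ids()[0]):

    from pathlib import Path

    from dsgrid.registry.common import DatabaseConnection
    from dsgrid.registry.registry_manager import RegistryManager
    from dsgrid.tests.common import check_configs_update, create_local_test_registry


    def test_config_updates(tmp_path: Path):
        # create_local_test_registry writes the SQLite database to tmp_path/dsgrid-test.db.
        create_local_test_registry(tmp_path)
        conn = DatabaseConnection(url=f"sqlite:///{tmp_path}/dsgrid-test.db")
        manager = RegistryManager.load(conn, offline_mode=True)
        try:
            # One tuple per config type, in the documented order.
            updated = check_configs_update(tmp_path, manager)
            assert len(updated) == 4
        finally:
            manager.dispose()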
dsgrid/tests/make_us_data_registry.py ADDED
@@ -0,0 +1,265 @@
+ import contextlib
+ import getpass
+ import logging
+ import os
+ import shutil
+ import tempfile
+ from pathlib import Path
+
+ import rich_click as click
+
+ from dsgrid.cli.common import path_callback
+ from dsgrid.loggers import setup_logging, check_log_file_size
+ from dsgrid.registry.common import DataStoreType, DatabaseConnection
+ from dsgrid.registry.registry_manager import RegistryManager
+ from dsgrid.tests.common import (
+     create_local_test_registry,
+     TEST_REMOTE_REGISTRY,
+     TEST_PROJECT_REPO,
+     TEST_DATASET_DIRECTORY,
+ )
+ from dsgrid.utils.timing import timer_stats_collector
+ from dsgrid.utils.files import dump_data, load_data
+ from dsgrid.utils.id_remappings import (
+     map_dimension_names_to_ids,
+     replace_dimension_names_with_current_ids,
+ )
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def _find_file_with_stem(directory: Path, stem: str) -> Path | None:
+     """Find a file in directory with the given stem, regardless of extension."""
+     for path in directory.iterdir():
+         if path.stem == stem:
+             return path
+     return None
+
+
+ def update_dataset_config_paths(config_file: Path, dataset_id: str) -> None:
+     """Update the data file paths in a dataset config to be relative to the config file.
+
+     Parameters
+     ----------
+     config_file : Path
+         Path to the dataset configuration file.
+     dataset_id : str
+         The dataset ID, used to locate the data files in TEST_DATASET_DIRECTORY.
+     """
+     data = load_data(config_file)
+     if "data_layout" not in data:
+         return
+
+     data_layout = data["data_layout"]
+     config_dir = config_file.parent.resolve()
+     dataset_data_dir = (TEST_DATASET_DIRECTORY / dataset_id).resolve()
+
+     if "data_file" in data_layout:
+         stem = Path(data_layout["data_file"]["path"]).stem
+         data_file_path = _find_file_with_stem(dataset_data_dir, stem)
+         if data_file_path is None:
+             msg = f"Could not find data file with stem '{stem}' in {dataset_data_dir}"
+             raise FileNotFoundError(msg)
+         relative_path = os.path.relpath(data_file_path, config_dir)
+         data_layout["data_file"]["path"] = relative_path
+
+     if "lookup_data_file" in data_layout and data_layout["lookup_data_file"] is not None:
+         stem = Path(data_layout["lookup_data_file"]["path"]).stem
+         lookup_file_path = _find_file_with_stem(dataset_data_dir, stem)
+         if lookup_file_path is None:
+             msg = f"Could not find lookup file with stem '{stem}' in {dataset_data_dir}"
+             raise FileNotFoundError(msg)
+         relative_path = os.path.relpath(lookup_file_path, config_dir)
+         data_layout["lookup_data_file"]["path"] = relative_path
+
+     if "missing_associations" in data_layout and data_layout["missing_associations"] is not None:
+         items = []
+         for item in data_layout["missing_associations"]:
+             stem = Path(item).stem
+             missing_path = _find_file_with_stem(dataset_data_dir, stem)
+             if missing_path is None:
+                 msg = (
+                     f"Could not find missing associations with stem '{stem}' in {dataset_data_dir}"
+                 )
+                 raise FileNotFoundError(msg)
+             relative_path = os.path.relpath(missing_path, config_dir)
+             items.append(relative_path)
+         data_layout["missing_associations"] = items
+
+     dump_data(data, config_file)
+
+
+ @contextlib.contextmanager
+ def make_test_data_registry(
+     registry_path,
+     src_dir,
+     include_projects=True,
+     include_datasets=True,
+     offline_mode=True,
+     database_url: str | None = None,
+     data_store_type: DataStoreType = DataStoreType.FILESYSTEM,
+ ):
+     """Creates a local registry from a dsgrid project source directory for testing.
+
+     This is a context manager that yields the RegistryManager and disposes it on exit.
+
+     Parameters
+     ----------
+     registry_path : Path
+         Path in which the registry will be created.
+     src_dir : Path
+         Path containing source config files.
+     include_projects : bool
+         If False, do not register any projects.
+     include_datasets : bool
+         If False, do not register any datasets.
+     offline_mode : bool
+         If False, use the test remote registry.
+     data_store_type : DataStoreType
+         Type of store to use for the registry data.
+
+     Yields
+     ------
+     RegistryManager
+     """
+     if not include_projects and include_datasets:
+         msg = "If include_datasets is True then include_projects must also be True."
+         raise Exception(msg)
+     url = f"sqlite:///{registry_path}/registry.db" if database_url is None else database_url
+     conn = DatabaseConnection(url=url)
+     create_local_test_registry(registry_path, conn=conn, data_store_type=data_store_type)
+     dataset_dirs = [
+         Path("datasets/modeled/comstock"),
+         Path("datasets/modeled/comstock_unpivoted"),
+     ]
+
+     user = getpass.getuser()
+     log_message = "Initial registration"
+     if offline_mode:
+         manager = RegistryManager.load(conn, offline_mode=offline_mode)
+     else:
+         manager = RegistryManager.load(
+             conn, remote_path=TEST_REMOTE_REGISTRY, offline_mode=offline_mode
+         )
+
+     try:
+         project_config_file = src_dir / "project.json5"
+         project_id = load_data(project_config_file)["project_id"]
+         dataset_config_files = [src_dir / path / "dataset.json5" for path in dataset_dirs]
+         dataset_mapping_files = [
+             src_dir / path / "dimension_mappings.json5" for path in dataset_dirs
+         ]
+         for i, filename in enumerate(dataset_mapping_files):
+             if not filename.exists():
+                 dataset_mapping_files[i] = None
+         dataset_ids = [
+             load_data(config_file)["dataset_id"] for config_file in dataset_config_files
+         ]
+
+         if include_projects:
+             print("\n 1. register project: \n")
+             manager.project_manager.register(
+                 project_config_file,
+                 user,
+                 log_message,
+             )
+         if include_datasets:
+             for i, dataset_config_file in enumerate(dataset_config_files):
+                 dataset_id = dataset_ids[i]
+                 print(f"\n 2. register dataset {dataset_id}: \n")
+                 dataset_mapping_file = dataset_mapping_files[i]
+                 mappings = map_dimension_names_to_ids(manager.dimension_manager)
+                 replace_dimension_names_with_current_ids(dataset_config_file, mappings)
+                 update_dataset_config_paths(dataset_config_file, dataset_id)
+                 manager.dataset_manager.register(
+                     dataset_config_file,
+                     user,
+                     log_message,
+                 )
+                 print(f"\n 3. submit dataset {dataset_id} to project\n")
+                 manager.project_manager.submit_dataset(
+                     project_id,
+                     dataset_id,
+                     user,
+                     log_message,
+                     dimension_mapping_file=dataset_mapping_file,
+                 )
+         yield manager
+     finally:
+         manager.dispose()
+
+
+ @click.command()
+ @click.argument(
+     "registry-path",
+     type=Path,
+     default=f"{Path.home()}/.dsgrid-test-registry",
+     callback=path_callback,
+ )
+ @click.option(
+     "-f",
+     "--force",
+     default=False,
+     is_flag=True,
+     show_default=True,
+     help="Delete registry-path if it exists.",
+ )
+ @click.option(
+     "-p",
+     "--project-dir",
+     default=TEST_PROJECT_REPO,
+     help="Path to a project repository.",
+     callback=path_callback,
+ )
+ @click.option(
+     "-t",
+     "--data-store-type",
+     type=click.Choice([x.value for x in DataStoreType]),
+     default=DataStoreType.FILESYSTEM.value,
+     show_default=True,
+     help="Type of store to use for the registry data.",
+     callback=lambda *x: DataStoreType(x[2]),
+ )
+ @click.option(
+     "--verbose",
+     is_flag=True,
+     default=False,
+     show_default=True,
+     help="Enable verbose log output.",
+ )
+ def run(
+     registry_path: Path,
+     force: bool,
+     project_dir: Path,
+     data_store_type: DataStoreType,
+     verbose: bool,
+ ):
+     """Creates a local registry from a dsgrid project source directory for testing."""
+     level = logging.DEBUG if verbose else logging.INFO
+     log_file = Path("test_dsgrid_project.log")
+     check_log_file_size(log_file, no_prompts=True)
+     setup_logging("dsgrid", log_file, console_level=level, file_level=level, mode="a")
+     if registry_path.exists():
+         if force:
+             shutil.rmtree(registry_path)
+         else:
+             print(f"{registry_path} already exists. Use --force to overwrite.")
+             # Bail out; os.makedirs would otherwise raise FileExistsError.
+             return
+     os.makedirs(registry_path)
+     tmp_project_dir = Path(tempfile.gettempdir()) / "tmp_test_project_dir"
+     if tmp_project_dir.exists():
+         shutil.rmtree(tmp_project_dir)
+     shutil.copytree(project_dir, tmp_project_dir)
+     try:
+         with make_test_data_registry(
+             registry_path,
+             tmp_project_dir / "dsgrid_project",
+             data_store_type=data_store_type,
+         ):
+             pass  # The manager is created and disposed by the context manager.
+     finally:
+         timer_stats_collector.log_stats()
+
+
+ if __name__ == "__main__":
+     run()
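Because make_test_data_registry is a context manager, tests can also use it directly instead of going through the run CLI entry point. A sketch with hypothetical paths, assuming the dsgrid-test-data checkout that the module-level constants expect:

    from pathlib import Path

    from dsgrid.tests.make_us_data_registry import make_test_data_registry

    registry_path = Path("/tmp/dsgrid-test-registry")  # hypothetical location
    registry_path.mkdir(parents=True, exist_ok=True)
    # Hypothetical copy of a project repo; the CLI itself uses a dsgrid_project subdirectory.
    src_dir = Path("/tmp/tmp_test_project_dir/dsgrid_project")

    # Registers the project and both comstock datasets, then disposes the manager on exit.
    with make_test_data_registry(registry_path, src_dir) as manager:
        print(manager.project_manager.list_ids())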
dsgrid/tests/register_derived_datasets.py ADDED
@@ -0,0 +1,103 @@
+ import logging
+ import shutil
+ from pathlib import Path
+
+ import rich_click as click
+
+ from dsgrid.loggers import setup_logging, check_log_file_size
+ from dsgrid.query.models import ProjectQueryModel
+ from dsgrid.registry.dataset_registry import DatasetRegistry
+ from dsgrid.utils.run_command import check_run_command
+ from dsgrid.utils.timing import timer_stats_collector
+
+
+ logger = logging.getLogger(__name__)
+
+
+ @click.command()
+ @click.argument(
+     "query_files",
+     nargs=-1,
+     type=click.Path(exists=True),
+     callback=lambda *x: [Path(y) for y in x[2]],
+ )
+ @click.option(
+     "-r", "--registry-path", required=True, callback=lambda *x: Path(x[2]), help="Path to registry"
+ )
+ @click.option(
+     "-o",
+     "--output",
+     default="query_output",
+     show_default=True,
+     type=click.Path(),
+     help="Output directory for query results",
+     callback=lambda *x: Path(x[2]),
+ )
+ @click.option(
+     "-p",
+     "--project-id",
+     default="dsgrid_conus_2022",
+     show_default=True,
+     type=str,
+     help="Project ID",
+ )
+ @click.option(
+     "--verbose", is_flag=True, default=False, show_default=True, help="Enable verbose log output."
+ )
+ def run(query_files, project_id, registry_path, output, verbose):
+     """Registers derived datasets in a local registry for testing."""
+     level = logging.DEBUG if verbose else logging.INFO
+     log_file = Path("dsgrid_registration.log")
+     check_log_file_size(log_file, no_prompts=True)
+     logger = setup_logging(__name__, log_file, console_level=level, file_level=level, mode="a")
+     try:
+         _run_registration(query_files, project_id, registry_path, output)
+     finally:
+         # Raise the console level so that timer stats only go to the log file.
+         for handler in logger.handlers:
+             if handler.name == "console":
+                 handler.setLevel(logging.WARNING)
+                 break
+
+         timer_stats_collector.log_stats()
+
+
+ def _run_registration(
+     query_files: list[Path], project_id: str, registry_path: Path, query_output_dir: Path
+ ):
+     log_message = "Submit derived dataset"
+     query_output_dir.mkdir(exist_ok=True)
+     derived_dataset_config_dir = query_output_dir / "derived_dataset_configs"
+     if derived_dataset_config_dir.exists():
+         shutil.rmtree(derived_dataset_config_dir)
+     derived_dataset_config_dir.mkdir()
+     for query_file in query_files:
+         logger.info("Register derived dataset from %s", query_file)
+         query = ProjectQueryModel.from_file(query_file)
+         dataset_id = query.project.dataset.dataset_id
+         dataset_config_dir = derived_dataset_config_dir / dataset_id
+         dataset_config_file = dataset_config_dir / DatasetRegistry.config_filename()
+
+         create_cmd = (
+             f"dsgrid query project run --registry-path={registry_path} "
+             f"-o {query_output_dir} {query_file}"
+         )
+
+         config_cmd = (
+             f"dsgrid query project create-derived-dataset-config "
+             f"--registry-path={registry_path} {query_output_dir / dataset_id} {dataset_config_dir}"
+         )
+
+         submit_cmd = (
+             f"dsgrid registry --path {registry_path} projects "
+             f"register-and-submit-dataset -c {dataset_config_file} -p {project_id} "
+             f"-l '{log_message}' -d {query_output_dir / dataset_id}"
+         )
+
+         for cmd in (create_cmd, config_cmd, submit_cmd):
+             logger.info(cmd)
+             check_run_command(cmd)
+
+
+ if __name__ == "__main__":
+     run()
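The command can also be exercised programmatically with click's test runner instead of a shell (rich_click commands are ordinary click commands). A sketch with hypothetical paths; the query files must exist because of type=click.Path(exists=True):

    from click.testing import CliRunner

    from dsgrid.tests.register_derived_datasets import run

    runner = CliRunner()
    # Equivalent to:
    #   python -m dsgrid.tests.register_derived_datasets \
    #       -r ./registry -o query_output queries/residential.json5
    result = runner.invoke(
        run,
        ["--registry-path", "./registry", "-o", "query_output", "queries/residential.json5"],
    )
    print(result.exit_code, result.output)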
dsgrid/tests/utils.py ADDED
@@ -0,0 +1,25 @@
+ from pathlib import Path
+
+ from dsgrid.spark.types import use_duckdb
+ from dsgrid.utils.spark import get_spark_session
+
+
+ def read_parquet(filename: Path):
+     """Read a Parquet file and load it into cache.
+
+     This helps when debugging with pytest --pdb: without caching, the Parquet file
+     would be deleted on a test failure and the dataframe could not be inspected.
+     """
+     spark = get_spark_session()
+     df = spark.read.parquet(str(filename))
+     if not use_duckdb():
+         df.cache()
+         df.count()
+     return df
+
+
+ def read_parquet_two_table_format(path: Path):
+     spark = get_spark_session()
+     load_data = spark.read.parquet(str(path / "load_data.parquet"))
+     lookup = spark.read.parquet(str(path / "load_data_lookup.parquet"))
+     table = load_data.join(lookup, on="id").drop("id")
+     return table
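read_parquet_two_table_format expects the dsgrid two-table layout: load_data.parquet keyed by an id column plus load_data_lookup.parquet mapping id to dimension records. A usage sketch with a hypothetical dataset directory:

    from pathlib import Path

    from dsgrid.tests.utils import read_parquet_two_table_format

    # Joins load_data.parquet with load_data_lookup.parquet on "id" and drops the key.
    df = read_parquet_two_table_format(Path("dsgrid-test-data/datasets/comstock"))  # hypothetical path
    print(df.columns)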
dsgrid/time/__init__.py
File without changes
dsgrid/time/time_conversions.py ADDED
@@ -0,0 +1,80 @@
+ """Functions to perform time conversions"""
+
+ from datetime import datetime
+
+ from dsgrid.time.types import DayType, Season
+
+
+ def convert_datetime_to_day_type(timestamp):
+     """Returns the day type for the datetime.
+
+     Parameters
+     ----------
+     timestamp : datetime.datetime
+
+     Returns
+     -------
+     str
+         DayType id
+
+     """
+     # Monday is 0.
+     if timestamp.weekday() <= 4:
+         return DayType.WEEKDAY.value
+     return DayType.WEEKEND.value
+
+
+ def convert_datetime_to_season(timestamp):
+     """Returns the season for the datetime.
+
+     Parameters
+     ----------
+     timestamp : datetime.datetime
+
+     Returns
+     -------
+     str
+         Season id
+
+     """
+     # TODO: The season boundaries shift slightly every year and vary by region
+     # (they are weather-driven). Is this close enough?
+     year = timestamp.year
+     if timestamp < datetime(year, 3, 20) or timestamp > datetime(year, 12, 21):
+         season = Season.WINTER.value
+     elif timestamp < datetime(year, 6, 20):
+         season = Season.SPRING.value
+     elif timestamp < datetime(year, 9, 22):
+         season = Season.SUMMER.value
+     else:
+         season = Season.AUTUMN.value
+
+     return season
+
+
+ def interpret_datetime(timestamp):
+     """Return a datetime object from a timestamp string.
+
+     Parameters
+     ----------
+     timestamp : str
+
+     Returns
+     -------
+     datetime.datetime
+
+     """
+     formats = (
+         "%Y-%m-%dT%H:%M:%S",
+         "%Y-%m-%d %H:%M:%S",
+         "%Y-%m-%dT%H:%M:%SZ",
+         "%Y-%m-%dT%H:%M:%S.%f",
+         "%Y-%m-%d %H:%M:%S.%f",
+     )
+
+     # Try each format in turn; re-raise the ValueError from the last one if none match.
+     for i, fmt in enumerate(formats):
+         try:
+             return datetime.strptime(timestamp, fmt)
+         except ValueError:
+             if i == len(formats) - 1:
+                 raise
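For concreteness, the helpers behave as follows (the dates here are chosen purely for illustration):

    from datetime import datetime

    from dsgrid.time.time_conversions import (
        convert_datetime_to_day_type,
        convert_datetime_to_season,
        interpret_datetime,
    )

    assert convert_datetime_to_day_type(datetime(2024, 1, 6)) == "weekend"  # a Saturday
    assert convert_datetime_to_day_type(datetime(2024, 1, 8)) == "weekday"  # a Monday
    assert convert_datetime_to_season(datetime(2024, 7, 1)) == "summer"
    assert convert_datetime_to_season(datetime(2024, 12, 25)) == "winter"
    # Tries each supported format until one parses.
    assert interpret_datetime("2024-07-01T13:00:00") == datetime(2024, 7, 1, 13)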
dsgrid/time/types.py ADDED
@@ -0,0 +1,67 @@
+ """Types related to time"""
+
+ from datetime import datetime
+ from typing import NamedTuple
+
+ from dsgrid.data_models import DSGEnum
+
+
+ class DayType(DSGEnum):
+     """Day types"""
+
+     WEEKEND = "weekend"
+     WEEKDAY = "weekday"
+
+
+ class Season(DSGEnum):
+     """Seasons"""
+
+     WINTER = "winter"
+     SPRING = "spring"
+     SUMMER = "summer"
+     AUTUMN = "autumn"
+     FALL = "autumn"
+
+
+ # The types below represent the timestamps that exist as columns in all datasets.
+
+
+ class DatetimeTimestampType(NamedTuple):
+     """Single column with datetime."""
+
+     timestamp: datetime
+
+
+ class AnnualTimestampType(NamedTuple):
+     """Single column with only year."""
+
+     time_year: int
+
+
+ class OneWeekPerMonthByHourType(NamedTuple):
+     """Columns of representative time with one week per month."""
+
+     month: int
+     # 0 = Monday, 6 = Sunday. Follows pyspark.sql.functions.weekday and Python datetime.weekday.
+     day_of_week: int
+     hour: int
+
+
+ class OneWeekdayDayAndOneWeekendDayPerMonthByHourType(NamedTuple):
+     """Columns of representative time with month, hour, and weekday vs weekend."""
+
+     month: int
+     is_weekday: bool
+     hour: int
+
+
+ class IndexTimestampType(NamedTuple):
+     """Single column with numerical indices."""
+
+     time_index: int
+
+
+ class StringTimestampType(NamedTuple):
+     """Single column with time (must include an offset) as str."""
+
+     timestamp: str
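Because these NamedTuple classes encode the expected time columns, their standard _fields attribute yields the column names a dataset of that time type must carry. A small sketch (the last assertion assumes DSGEnum keeps standard Enum aliasing semantics):

    from dsgrid.time.types import DatetimeTimestampType, OneWeekPerMonthByHourType, Season

    assert DatetimeTimestampType._fields == ("timestamp",)
    assert OneWeekPerMonthByHourType._fields == ("month", "day_of_week", "hour")
    # FALL reuses AUTUMN's value, so it resolves to the same enum member.
    assert Season.FALL is Season.AUTUMN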
dsgrid/units/__init__.py
File without changes