dsgrid-toolkit 0.2.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dsgrid-toolkit might be problematic.
Files changed (152)
  1. dsgrid/__init__.py +22 -0
  2. dsgrid/api/__init__.py +0 -0
  3. dsgrid/api/api_manager.py +179 -0
  4. dsgrid/api/app.py +420 -0
  5. dsgrid/api/models.py +60 -0
  6. dsgrid/api/response_models.py +116 -0
  7. dsgrid/apps/__init__.py +0 -0
  8. dsgrid/apps/project_viewer/app.py +216 -0
  9. dsgrid/apps/registration_gui.py +444 -0
  10. dsgrid/chronify.py +22 -0
  11. dsgrid/cli/__init__.py +0 -0
  12. dsgrid/cli/common.py +120 -0
  13. dsgrid/cli/config.py +177 -0
  14. dsgrid/cli/download.py +13 -0
  15. dsgrid/cli/dsgrid.py +142 -0
  16. dsgrid/cli/dsgrid_admin.py +349 -0
  17. dsgrid/cli/install_notebooks.py +62 -0
  18. dsgrid/cli/query.py +711 -0
  19. dsgrid/cli/registry.py +1773 -0
  20. dsgrid/cloud/__init__.py +0 -0
  21. dsgrid/cloud/cloud_storage_interface.py +140 -0
  22. dsgrid/cloud/factory.py +31 -0
  23. dsgrid/cloud/fake_storage_interface.py +37 -0
  24. dsgrid/cloud/s3_storage_interface.py +156 -0
  25. dsgrid/common.py +35 -0
  26. dsgrid/config/__init__.py +0 -0
  27. dsgrid/config/annual_time_dimension_config.py +187 -0
  28. dsgrid/config/common.py +131 -0
  29. dsgrid/config/config_base.py +148 -0
  30. dsgrid/config/dataset_config.py +684 -0
  31. dsgrid/config/dataset_schema_handler_factory.py +41 -0
  32. dsgrid/config/date_time_dimension_config.py +108 -0
  33. dsgrid/config/dimension_config.py +54 -0
  34. dsgrid/config/dimension_config_factory.py +65 -0
  35. dsgrid/config/dimension_mapping_base.py +349 -0
  36. dsgrid/config/dimension_mappings_config.py +48 -0
  37. dsgrid/config/dimensions.py +775 -0
  38. dsgrid/config/dimensions_config.py +71 -0
  39. dsgrid/config/index_time_dimension_config.py +76 -0
  40. dsgrid/config/input_dataset_requirements.py +31 -0
  41. dsgrid/config/mapping_tables.py +209 -0
  42. dsgrid/config/noop_time_dimension_config.py +42 -0
  43. dsgrid/config/project_config.py +1457 -0
  44. dsgrid/config/registration_models.py +199 -0
  45. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  46. dsgrid/config/simple_models.py +49 -0
  47. dsgrid/config/supplemental_dimension.py +29 -0
  48. dsgrid/config/time_dimension_base_config.py +200 -0
  49. dsgrid/data_models.py +155 -0
  50. dsgrid/dataset/__init__.py +0 -0
  51. dsgrid/dataset/dataset.py +123 -0
  52. dsgrid/dataset/dataset_expression_handler.py +86 -0
  53. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  54. dsgrid/dataset/dataset_schema_handler_base.py +899 -0
  55. dsgrid/dataset/dataset_schema_handler_one_table.py +196 -0
  56. dsgrid/dataset/dataset_schema_handler_standard.py +303 -0
  57. dsgrid/dataset/growth_rates.py +162 -0
  58. dsgrid/dataset/models.py +44 -0
  59. dsgrid/dataset/table_format_handler_base.py +257 -0
  60. dsgrid/dataset/table_format_handler_factory.py +17 -0
  61. dsgrid/dataset/unpivoted_table.py +121 -0
  62. dsgrid/dimension/__init__.py +0 -0
  63. dsgrid/dimension/base_models.py +218 -0
  64. dsgrid/dimension/dimension_filters.py +308 -0
  65. dsgrid/dimension/standard.py +213 -0
  66. dsgrid/dimension/time.py +531 -0
  67. dsgrid/dimension/time_utils.py +88 -0
  68. dsgrid/dsgrid_rc.py +88 -0
  69. dsgrid/exceptions.py +105 -0
  70. dsgrid/filesystem/__init__.py +0 -0
  71. dsgrid/filesystem/cloud_filesystem.py +32 -0
  72. dsgrid/filesystem/factory.py +32 -0
  73. dsgrid/filesystem/filesystem_interface.py +136 -0
  74. dsgrid/filesystem/local_filesystem.py +74 -0
  75. dsgrid/filesystem/s3_filesystem.py +118 -0
  76. dsgrid/loggers.py +132 -0
  77. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +950 -0
  78. dsgrid/notebooks/registration.ipynb +48 -0
  79. dsgrid/notebooks/start_notebook.sh +11 -0
  80. dsgrid/project.py +451 -0
  81. dsgrid/query/__init__.py +0 -0
  82. dsgrid/query/dataset_mapping_plan.py +142 -0
  83. dsgrid/query/derived_dataset.py +384 -0
  84. dsgrid/query/models.py +726 -0
  85. dsgrid/query/query_context.py +287 -0
  86. dsgrid/query/query_submitter.py +847 -0
  87. dsgrid/query/report_factory.py +19 -0
  88. dsgrid/query/report_peak_load.py +70 -0
  89. dsgrid/query/reports_base.py +20 -0
  90. dsgrid/registry/__init__.py +0 -0
  91. dsgrid/registry/bulk_register.py +161 -0
  92. dsgrid/registry/common.py +287 -0
  93. dsgrid/registry/config_update_checker_base.py +63 -0
  94. dsgrid/registry/data_store_factory.py +34 -0
  95. dsgrid/registry/data_store_interface.py +69 -0
  96. dsgrid/registry/dataset_config_generator.py +156 -0
  97. dsgrid/registry/dataset_registry_manager.py +734 -0
  98. dsgrid/registry/dataset_update_checker.py +16 -0
  99. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  100. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  101. dsgrid/registry/dimension_registry_manager.py +413 -0
  102. dsgrid/registry/dimension_update_checker.py +16 -0
  103. dsgrid/registry/duckdb_data_store.py +185 -0
  104. dsgrid/registry/filesystem_data_store.py +141 -0
  105. dsgrid/registry/filter_registry_manager.py +123 -0
  106. dsgrid/registry/project_config_generator.py +57 -0
  107. dsgrid/registry/project_registry_manager.py +1616 -0
  108. dsgrid/registry/project_update_checker.py +48 -0
  109. dsgrid/registry/registration_context.py +223 -0
  110. dsgrid/registry/registry_auto_updater.py +316 -0
  111. dsgrid/registry/registry_database.py +662 -0
  112. dsgrid/registry/registry_interface.py +446 -0
  113. dsgrid/registry/registry_manager.py +544 -0
  114. dsgrid/registry/registry_manager_base.py +367 -0
  115. dsgrid/registry/versioning.py +92 -0
  116. dsgrid/spark/__init__.py +0 -0
  117. dsgrid/spark/functions.py +545 -0
  118. dsgrid/spark/types.py +50 -0
  119. dsgrid/tests/__init__.py +0 -0
  120. dsgrid/tests/common.py +139 -0
  121. dsgrid/tests/make_us_data_registry.py +204 -0
  122. dsgrid/tests/register_derived_datasets.py +103 -0
  123. dsgrid/tests/utils.py +25 -0
  124. dsgrid/time/__init__.py +0 -0
  125. dsgrid/time/time_conversions.py +80 -0
  126. dsgrid/time/types.py +67 -0
  127. dsgrid/units/__init__.py +0 -0
  128. dsgrid/units/constants.py +113 -0
  129. dsgrid/units/convert.py +71 -0
  130. dsgrid/units/energy.py +145 -0
  131. dsgrid/units/power.py +87 -0
  132. dsgrid/utils/__init__.py +0 -0
  133. dsgrid/utils/dataset.py +612 -0
  134. dsgrid/utils/files.py +179 -0
  135. dsgrid/utils/filters.py +125 -0
  136. dsgrid/utils/id_remappings.py +100 -0
  137. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  138. dsgrid/utils/py_expression_eval/README.md +8 -0
  139. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  140. dsgrid/utils/py_expression_eval/tests.py +283 -0
  141. dsgrid/utils/run_command.py +70 -0
  142. dsgrid/utils/scratch_dir_context.py +64 -0
  143. dsgrid/utils/spark.py +918 -0
  144. dsgrid/utils/spark_partition.py +98 -0
  145. dsgrid/utils/timing.py +239 -0
  146. dsgrid/utils/utilities.py +184 -0
  147. dsgrid/utils/versioning.py +36 -0
  148. dsgrid_toolkit-0.2.0.dist-info/METADATA +216 -0
  149. dsgrid_toolkit-0.2.0.dist-info/RECORD +152 -0
  150. dsgrid_toolkit-0.2.0.dist-info/WHEEL +4 -0
  151. dsgrid_toolkit-0.2.0.dist-info/entry_points.txt +4 -0
  152. dsgrid_toolkit-0.2.0.dist-info/licenses/LICENSE +29 -0
dsgrid/tests/common.py ADDED
@@ -0,0 +1,139 @@
+import getpass
+import os
+from pathlib import Path
+
+import pytest
+from semver import VersionInfo
+
+from dsgrid.exceptions import DSGInvalidParameter, DSGInvalidOperation
+from dsgrid.registry.dimension_registry_manager import DimensionRegistryManager
+from dsgrid.registry.project_registry_manager import ProjectRegistryManager
+from dsgrid.registry.registry_manager import RegistryManager
+from dsgrid.registry.common import DataStoreType, DatabaseConnection, VersionUpdateType
+from dsgrid.utils.files import dump_data, load_data
+
+TEST_PROJECT_PATH = Path(__file__).absolute().parents[2] / "dsgrid-test-data"
+TEST_PROJECT_REPO = TEST_PROJECT_PATH / "test_efs"
+TEST_STANDARD_SCENARIOS_PROJECT_REPO = TEST_PROJECT_PATH / "standard_scenarios_2021"
+TEST_DATASET_DIRECTORY = TEST_PROJECT_PATH / "datasets"
+TEST_REGISTRY_DATABASE = "cached-test-dsgrid"
+TEST_REGISTRY_BASE_PATH = Path("tests/data/registry")
+TEST_REGISTRY_DATA_PATH = Path("tests/data/registry/registry_data")
+TEST_EFS_REGISTRATION_FILE = Path("tests/data/test_efs_registration.json5")
+# AWS_PROFILE_NAME = "nrel-aws-dsgrid"
+TEST_REMOTE_REGISTRY = "s3://nrel-dsgrid-registry-test"
+CACHED_TEST_REGISTRY_DB = f"sqlite:///{TEST_REGISTRY_BASE_PATH}/cached_registry.db"
+STANDARD_SCENARIOS_PROJECT_REPO = Path(__file__).parents[2] / "dsgrid-project-StandardScenarios"
+IEF_PROJECT_REPO = Path(__file__).parents[2] / "dsgrid-project-IEF"
+SIMPLE_STANDARD_SCENARIOS = TEST_PROJECT_PATH / "filtered_registries" / "simple_standard_scenarios"
+SIMPLE_STANDARD_SCENARIOS_REGISTRY_DB = (
+    f"sqlite:///{TEST_PROJECT_PATH}/filtered_registries/simple_standard_scenarios/registry.db"
+)
+
+
+def create_local_test_registry(
+    tmpdir: Path, conn=None, data_store_type: DataStoreType = DataStoreType.FILESYSTEM
+):
+    if conn is None:
+        conn = DatabaseConnection(url=f"sqlite:///{tmpdir}/dsgrid-test.db")
+    data_path = tmpdir / "registry_data"
+    RegistryManager.create(conn, data_path, data_store_type=data_store_type, overwrite=True)
+    return data_path
+
+
+def check_configs_update(base_dir, manager):
+    """Runs an update on one of each type of config.
+
+    Parameters
+    ----------
+    base_dir : Path
+    manager : RegistryManager
+
+    Returns
+    -------
+    list
+        Each updated config and new version: [(Updated ID, new version)]
+        For the dimension element the tuple is (Updated ID, dimension type, new version).
+        Order is dimension ID, dimension mapping ID, dataset ID, project ID
+
+    """
+    update_dir = base_dir / "updates"
+    user = getpass.getuser()
+
+    updated_ids = []
+    for mgr in (
+        manager.dimension_manager,
+        manager.dimension_mapping_manager,
+        manager.dataset_manager,
+        manager.project_manager,
+    ):
+        config_id = mgr.list_ids()[0]
+        version = mgr.get_latest_version(config_id)
+        check_config_update(update_dir, mgr, config_id, user, version)
+        new_version = mgr.get_latest_version(config_id)
+        if isinstance(mgr, DimensionRegistryManager):
+            config = mgr.get_by_id(config_id)
+            updated_ids.append((config_id, config.model.dimension_type, new_version))
+        else:
+            updated_ids.append((config_id, new_version))
+
+    return updated_ids
+
+
+def check_config_update(base_dir, mgr: ProjectRegistryManager, config_id, user, version):
+    """Runs basic positive and negative update tests for the config.
+
+    Parameters
+    ----------
+    base_dir : str
+    mgr : RegistryManagerBase
+    config_id : str
+    user : str
+    version : str
+
+    """
+    config_file = Path(base_dir) / mgr.config_class().config_filename()
+    assert not config_file.exists()
+    try:
+        mgr.dump(config_id, base_dir, force=True)
+        with pytest.raises(DSGInvalidOperation):
+            mgr.dump(config_id, base_dir)
+        mgr.dump(config_id, base_dir, force=True)
+        assert config_file.exists()
+        config_data = load_data(config_file)
+        config_data["description"] += "; updated description"
+        dump_data(config_data, config_file)
+        with pytest.raises(DSGInvalidParameter):
+            mgr.update_from_file(
+                config_file,
+                "invalid_config_id",
+                user,
+                VersionUpdateType.PATCH,
+                "update to description",
+                version,
+            )
+        with pytest.raises(DSGInvalidParameter):
+            mgr.update_from_file(
+                config_file,
+                config_id,
+                user,
+                VersionUpdateType.PATCH,
+                "update to description",
+                str(VersionInfo.parse(version).bump_patch()),
+            )
+
+        mgr.update_from_file(
+            config_file,
+            config_id,
+            user,
+            VersionUpdateType.PATCH,
+            "update to description",
+            version,
+        )
+        assert (
+            VersionInfo.parse(mgr.get_latest_version(config_id))
+            == VersionInfo.parse(version).bump_patch()
+        )
+    finally:
+        if config_file.exists():
+            os.remove(config_file)
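For orientation, a minimal usage sketch of the helpers above, assuming pytest's tmp_path fixture (this snippet is illustrative and not part of the package):

from pathlib import Path

from dsgrid.registry.common import DataStoreType
from dsgrid.tests.common import create_local_test_registry


def test_create_registry(tmp_path: Path):
    # Creates an empty local registry backed by sqlite:///<tmp_path>/dsgrid-test.db
    # and returns the registry_data path created alongside it.
    data_path = create_local_test_registry(tmp_path, data_store_type=DataStoreType.FILESYSTEM)
    assert data_path == tmp_path / "registry_data"
    # check_configs_update(tmp_path, manager) can then be run against a loaded
    # RegistryManager once a dimension, mapping, dataset, and project are registered.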
dsgrid/tests/make_us_data_registry.py ADDED
@@ -0,0 +1,204 @@
+import getpass
+import logging
+import os
+import shutil
+import tempfile
+from pathlib import Path
+
+import rich_click as click
+
+from dsgrid.cli.common import path_callback
+from dsgrid.loggers import setup_logging, check_log_file_size
+from dsgrid.registry.common import DataStoreType, DatabaseConnection
+from dsgrid.registry.registry_manager import RegistryManager
+from dsgrid.tests.common import (
+    create_local_test_registry,
+    TEST_DATASET_DIRECTORY,
+    TEST_REMOTE_REGISTRY,
+    TEST_PROJECT_REPO,
+)
+from dsgrid.utils.timing import timer_stats_collector
+from dsgrid.utils.files import load_data
+from dsgrid.utils.id_remappings import (
+    map_dimension_names_to_ids,
+    replace_dimension_names_with_current_ids,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+def make_test_data_registry(
+    registry_path,
+    src_dir,
+    dataset_path=None,
+    include_projects=True,
+    include_datasets=True,
+    offline_mode=True,
+    database_url: str | None = None,
+    data_store_type: DataStoreType = DataStoreType.FILESYSTEM,
+) -> RegistryManager:
+    """Creates a local registry from a dsgrid project source directory for testing.
+
+    Parameters
+    ----------
+    registry_path : Path
+        Path in which the registry will be created.
+    src_dir : Path
+        Path containing source config files
+    dataset_path : Path | None
+        If None, use "DSGRID_LOCAL_DATA_DIRECTORY" env variable.
+    include_projects : bool
+        If False, do not register any projects.
+    include_datasets : bool
+        If False, do not register any datasets.
+    offline_mode : bool
+        If False, use the test remote registry.
+    data_store_type: DataStoreType
+        Type of store to use for the registry data.
+    """
+    if not include_projects and include_datasets:
+        msg = "If include_datasets is True then include_projects must also be True."
+        raise Exception(msg)
+
+    if dataset_path is None:
+        dataset_path = os.environ.get("DSGRID_LOCAL_DATA_DIRECTORY", TEST_DATASET_DIRECTORY)
+    dataset_path = Path(dataset_path)
+    url = f"sqlite:///{registry_path}/registry.db" if database_url is None else database_url
+    conn = DatabaseConnection(url=url)
+    create_local_test_registry(registry_path, conn=conn, data_store_type=data_store_type)
+    dataset_dirs = [
+        Path("datasets/modeled/comstock"),
+        Path("datasets/modeled/comstock_unpivoted"),
+    ]
+
+    user = getpass.getuser()
+    log_message = "Initial registration"
+    if offline_mode:
+        manager = RegistryManager.load(conn, offline_mode=offline_mode)
+    else:
+        manager = RegistryManager.load(
+            conn, remote_path=TEST_REMOTE_REGISTRY, offline_mode=offline_mode
+        )
+
+    project_config_file = src_dir / "project.json5"
+    project_id = load_data(project_config_file)["project_id"]
+    dataset_config_files = [src_dir / path / "dataset.json5" for path in dataset_dirs]
+    dataset_mapping_files = [src_dir / path / "dimension_mappings.json5" for path in dataset_dirs]
+    for i, filename in enumerate(dataset_mapping_files):
+        if not filename.exists():
+            dataset_mapping_files[i] = None
+    dataset_ids = [load_data(config_file)["dataset_id"] for config_file in dataset_config_files]
+
+    if include_projects:
+        print("\n 1. register project: \n")
+        manager.project_manager.register(
+            project_config_file,
+            user,
+            log_message,
+        )
+    if include_datasets:
+        for i, dataset_config_file in enumerate(dataset_config_files):
+            dataset_id = dataset_ids[i]
+            print(f"\n 2. register dataset {dataset_id}: \n")
+            dataset_mapping_file = dataset_mapping_files[i]
+            mappings = map_dimension_names_to_ids(manager.dimension_manager)
+            replace_dimension_names_with_current_ids(dataset_config_file, mappings)
+            manager.dataset_manager.register(
+                dataset_config_file,
+                dataset_path / dataset_id,
+                user,
+                log_message,
+            )
+            print(f"\n 3. submit dataset {dataset_id} to project\n")
+            manager.project_manager.submit_dataset(
+                project_id,
+                dataset_id,
+                user,
+                log_message,
+                dimension_mapping_file=dataset_mapping_file,
+            )
+    return manager
+
+
+@click.command()
+@click.argument(
+    "registry-path",
+    type=Path,
+    default=f"{Path.home()}/.dsgrid-test-registry",
+    callback=path_callback,
+)
+@click.option(
+    "-f",
+    "--force",
+    default=False,
+    is_flag=True,
+    show_default=True,
+    help="Delete registry-path if it exists.",
+)
+@click.option(
+    "-p",
+    "--project-dir",
+    default=TEST_PROJECT_REPO,
+    help="path to a project repository",
+    callback=path_callback,
+)
+@click.option(
+    "-d",
+    "--dataset-dir",
+    default=TEST_DATASET_DIRECTORY,
+    help="path to your local datasets",
+    callback=path_callback,
+)
+@click.option(
+    "-t",
+    "--data-store-type",
+    type=click.Choice([x.value for x in DataStoreType]),
+    default=DataStoreType.FILESYSTEM.value,
+    show_default=True,
+    help="Type of store to use for the registry data.",
+    callback=lambda *x: DataStoreType(x[2]),
+)
+@click.option(
+    "--verbose",
+    is_flag=True,
+    default=False,
+    show_default=True,
+    help="Enable verbose log output.",
+)
+def run(
+    registry_path: Path,
+    force: bool,
+    project_dir: Path,
+    dataset_dir: Path,
+    data_store_type: DataStoreType,
+    verbose: bool,
+):
+    """Creates a local registry from a dsgrid project source directory for testing."""
+    level = logging.DEBUG if verbose else logging.INFO
+    log_file = Path("test_dsgrid_project.log")
+    check_log_file_size(log_file, no_prompts=True)
+    setup_logging("dsgrid", log_file, console_level=level, file_level=level, mode="a")
+    if registry_path.exists():
+        if force:
+            shutil.rmtree(registry_path)
+        else:
+            print(f"{registry_path} already exists. Use --force to overwrite.")
+    os.makedirs(registry_path)
+    tmp_project_dir = Path(tempfile.gettempdir()) / "tmp_test_project_dir"
+    if tmp_project_dir.exists():
+        shutil.rmtree(tmp_project_dir)
+    shutil.copytree(project_dir, tmp_project_dir)
+    try:
+        make_test_data_registry(
+            registry_path,
+            tmp_project_dir / "dsgrid_project",
+            dataset_dir,
+            data_store_type=data_store_type,
+        )
+    finally:
+        timer_stats_collector.log_stats()
+
+
+if __name__ == "__main__":
+    run()
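A hedged sketch of driving the helper above programmatically, assuming this module is dsgrid/tests/make_us_data_registry.py as listed in the file table; the dsgrid-test-data paths are placeholders whose real layout is defined by TEST_PROJECT_REPO and TEST_DATASET_DIRECTORY in dsgrid/tests/common.py:

from pathlib import Path

from dsgrid.tests.make_us_data_registry import make_test_data_registry

registry_path = Path("/tmp/dsgrid-test-registry")  # placeholder location
registry_path.mkdir(parents=True, exist_ok=True)
manager = make_test_data_registry(
    registry_path,
    Path("dsgrid-test-data/test_efs/dsgrid_project"),  # src_dir containing project.json5
    dataset_path=Path("dsgrid-test-data/datasets"),    # parent of the comstock dataset dirs
)
print(manager.project_manager.list_ids())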
dsgrid/tests/register_derived_datasets.py ADDED
@@ -0,0 +1,103 @@
+import logging
+import shutil
+from pathlib import Path
+
+import rich_click as click
+
+from dsgrid.loggers import setup_logging, check_log_file_size
+from dsgrid.query.models import ProjectQueryModel
+from dsgrid.registry.dataset_registry import DatasetRegistry
+from dsgrid.utils.run_command import check_run_command
+from dsgrid.utils.timing import timer_stats_collector
+
+
+logger = logging.getLogger(__name__)
+
+
+@click.command()
+@click.argument(
+    "query_files",
+    nargs=-1,
+    type=click.Path(exists=True),
+    callback=lambda *x: [Path(y) for y in x[2]],
+)
+@click.option(
+    "-r", "--registry-path", required=True, callback=lambda *x: Path(x[2]), help="Path to registry"
+)
+@click.option(
+    "-o",
+    "--output",
+    default="query_output",
+    show_default=True,
+    type=click.Path(),
+    help="Output directory for query results",
+    callback=lambda *x: Path(x[2]),
+)
+@click.option(
+    "-p",
+    "--project-id",
+    default="dsgrid_conus_2022",
+    show_default=True,
+    type=str,
+    help="Project ID",
+)
+@click.option(
+    "--verbose", is_flag=True, default=False, show_default=True, help="Enable verbose log output."
+)
+def run(query_files, project_id, registry_path, output, verbose):
+    """Registers derived datasets in a local registry for testing."""
+    level = logging.DEBUG if verbose else logging.INFO
+    log_file = Path("dsgrid_registration.log")
+    check_log_file_size(log_file, no_prompts=True)
+    logger = setup_logging(__name__, log_file, console_level=level, file_level=level, mode="a")
+    try:
+        _run_registration(query_files, project_id, registry_path, output)
+    finally:
+        # Raise the console level so that timer stats only go to the log file.
+        for _, handler in enumerate(logger.handlers):
+            if handler.name == "console":
+                handler.setLevel(logging.WARNING)
+                break
+
+        timer_stats_collector.log_stats()
+
+
+def _run_registration(
+    query_files: list[Path], project_id: str, registry_path: Path, query_output_dir: Path
+):
+    log_message = "Submit derived dataset"
+    query_output_dir.mkdir(exist_ok=True)
+    derived_dataset_config_dir = query_output_dir / "derived_dataset_configs"
+    if derived_dataset_config_dir.exists():
+        shutil.rmtree(derived_dataset_config_dir)
+    derived_dataset_config_dir.mkdir()
+    for query_file in query_files:
+        logger.info("Register derived dataset from %s", query_file)
+        query = ProjectQueryModel.from_file(query_file)
+        dataset_id = query.project.dataset.dataset_id
+        dataset_config_dir = derived_dataset_config_dir / dataset_id
+        dataset_config_file = dataset_config_dir / DatasetRegistry.config_filename()
+
+        create_cmd = (
+            f"dsgrid query project run --offline --registry-path={registry_path} "
+            f"-o {query_output_dir} {query_file}"
+        )
+
+        config_cmd = (
+            f"dsgrid query project create-derived-dataset-config --offline "
+            f"--registry-path={registry_path} {query_output_dir / dataset_id} {dataset_config_dir}"
+        )
+
+        submit_cmd = (
+            f"dsgrid registry --offline --path {registry_path} projects "
+            f"register-and-submit-dataset -c {dataset_config_file} -p {project_id} "
+            f"-l '{log_message}' -d {query_output_dir / dataset_id}"
+        )
+
+        for cmd in (create_cmd, config_cmd, submit_cmd):
+            logger.info(cmd)
+            check_run_command(cmd)
+
+
+if __name__ == "__main__":
+    run()
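A short sketch of invoking the command above in-process with click's test runner; paths and file names are placeholders, a populated registry containing the project and its input datasets is assumed, and the rich_click command is assumed to behave like a standard click command under click.testing.CliRunner:

from click.testing import CliRunner

from dsgrid.tests.register_derived_datasets import run

result = CliRunner().invoke(
    run,
    [
        "--registry-path", "/path/to/registry",
        "--project-id", "dsgrid_conus_2022",
        "-o", "query_output",
        "derived_dataset_query.json5",  # a ProjectQueryModel file; must exist on disk
    ],
)
print(result.exit_code, result.output)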
dsgrid/tests/utils.py ADDED
@@ -0,0 +1,25 @@
+from pathlib import Path
+
+from dsgrid.spark.types import use_duckdb
+from dsgrid.utils.spark import get_spark_session
+
+
+def read_parquet(filename: Path):
+    """Read a Parquet file and load it into cache. This helps debugging with pytest --pdb.
+    If you don't use this, the parquet file will get deleted on a failure and you won't be able
+    to inspect the dataframe.
+    """
+    spark = get_spark_session()
+    df = spark.read.parquet(str(filename))
+    if not use_duckdb():
+        df.cache()
+        df.count()
+    return df
+
+
+def read_parquet_two_table_format(path: Path):
+    spark = get_spark_session()
+    load_data = spark.read.parquet(str(path / "load_data.parquet"))
+    lookup = spark.read.parquet(str(path / "load_data_lookup.parquet"))
+    table = load_data.join(lookup, on="id").drop("id")
+    return table
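For reference, a minimal sketch of the two-table helper above; the path is a placeholder and must contain load_data.parquet and load_data_lookup.parquet sharing an "id" column, as the join implies:

from pathlib import Path

from dsgrid.tests.utils import read_parquet_two_table_format

table = read_parquet_two_table_format(Path("/path/to/standard_format_dataset"))
print(table.count())  # joined load_data/lookup table with the "id" column dropped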
dsgrid/time/__init__.py ADDED (empty file, no lines to show)
dsgrid/time/time_conversions.py ADDED
@@ -0,0 +1,80 @@
+"""Functions to perform time conversions"""
+
+from datetime import datetime
+
+from dsgrid.time.types import DayType, Season
+
+
+def convert_datetime_to_day_type(timestamp):
+    """Returns the day type for the datetime.
+
+    Parameters
+    ----------
+    timestamp : datetime.datetime
+
+    Returns
+    -------
+    str
+        DayType id
+
+    """
+    # Monday is 0.
+    if timestamp.weekday() <= 4:
+        return DayType.WEEKDAY.value
+    return DayType.WEEKEND.value
+
+
+def convert_datetime_to_season(timestamp):
+    """Returns the season for the datetime.
+
+    Parameters
+    ----------
+    timestamp : datetime.datetime
+
+    Returns
+    -------
+    str
+        Season id
+
+    """
+    # TODO: dates do change slightly every year. Is this close enough?
+    # dates also change by region, it's weather driven.
+    year = timestamp.year
+    if timestamp < datetime(year, 3, 20) or timestamp > datetime(year, 12, 21):
+        season = Season.WINTER.value
+    elif timestamp < datetime(year, 6, 20):
+        season = Season.SPRING.value
+    elif timestamp < datetime(year, 9, 22):
+        season = Season.SUMMER.value
+    else:
+        season = Season.AUTUMN.value
+
+    return season
+
+
+def interpret_datetime(timestamp):
+    """Return a datetime object from a timestamp string.
+
+    Parameters
+    ----------
+    timestamp : str
+
+    Returns
+    -------
+    datetime.datetime
+
+    """
+    formats = (
+        "%Y-%m-%dT%H:%M:%S",
+        "%Y-%m-%d %H:%M:%S",
+        "%Y-%m-%dT%H:%M:%SZ",
+        "%Y-%m-%dT%H:%M:%S.%f",
+        "%Y-%m-%d %H:%M:%S.%f",
+    )
+
+    for i, fmt in enumerate(formats):
+        try:
+            return datetime.strptime(timestamp, fmt)
+        except ValueError:
+            if i == len(formats) - 1:
+                raise
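A quick illustration of the helpers above, assuming this module is dsgrid/time/time_conversions.py as listed in the file table:

from dsgrid.time.time_conversions import (
    convert_datetime_to_day_type,
    convert_datetime_to_season,
    interpret_datetime,
)

ts = interpret_datetime("2018-07-04T15:00:00")  # first matching format wins
print(convert_datetime_to_day_type(ts))  # "weekday" (2018-07-04 is a Wednesday)
print(convert_datetime_to_season(ts))    # "summer"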
dsgrid/time/types.py ADDED
@@ -0,0 +1,67 @@
+"""Types related to time"""
+
+from datetime import datetime
+from typing import NamedTuple
+
+from dsgrid.data_models import DSGEnum
+
+
+class DayType(DSGEnum):
+    """Day types"""
+
+    WEEKEND = "weekend"
+    WEEKDAY = "weekday"
+
+
+class Season(DSGEnum):
+    """Seasons"""
+
+    WINTER = "winter"
+    SPRING = "spring"
+    SUMMER = "summer"
+    AUTUMN = "autumn"
+    FALL = "autumn"
+
+
+# The types below represent the timestamps that exist as columns in all datasets.
+
+
+class DatetimeTimestampType(NamedTuple):
+    """Single column with datetime."""
+
+    timestamp: datetime
+
+
+class AnnualTimestampType(NamedTuple):
+    """Single column with only year."""
+
+    time_year: int
+
+
+class OneWeekPerMonthByHourType(NamedTuple):
+    """Columns of representative time with one week per month."""

+    month: int
+    # 0 = Monday, 6 = Sunday. Follows pyspark.sql.functions.weekday and Python datetime.weekday.
+    day_of_week: int
+    hour: int
+
+
+class OneWeekdayDayAndOneWeekendDayPerMonthByHourType(NamedTuple):
+    """Columns of representative time with month, hour, and weekday vs weekend."""
+
+    month: int
+    is_weekday: bool
+    hour: int
+
+
+class IndexTimestampType(NamedTuple):
+    """Single column with numerical indices."""
+
+    time_index: int
+
+
+class StringTimestampType(NamedTuple):
+    """Single column with time (must include offset) as str."""
+
+    timestamp: str
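A small, purely illustrative sketch of the column-name types above; the field names mirror the time columns expected in dataset tables:

from dsgrid.time.types import OneWeekPerMonthByHourType

row = OneWeekPerMonthByHourType(month=7, day_of_week=2, hour=15)  # Wednesday, 3 pm, in July
print(row._fields)  # ('month', 'day_of_week', 'hour')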
dsgrid/units/__init__.py ADDED (empty file, no lines to show)