dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. build_backend.py +93 -0
  2. dsgrid/__init__.py +22 -0
  3. dsgrid/api/__init__.py +0 -0
  4. dsgrid/api/api_manager.py +179 -0
  5. dsgrid/api/app.py +419 -0
  6. dsgrid/api/models.py +60 -0
  7. dsgrid/api/response_models.py +116 -0
  8. dsgrid/apps/__init__.py +0 -0
  9. dsgrid/apps/project_viewer/app.py +216 -0
  10. dsgrid/apps/registration_gui.py +444 -0
  11. dsgrid/chronify.py +32 -0
  12. dsgrid/cli/__init__.py +0 -0
  13. dsgrid/cli/common.py +120 -0
  14. dsgrid/cli/config.py +176 -0
  15. dsgrid/cli/download.py +13 -0
  16. dsgrid/cli/dsgrid.py +157 -0
  17. dsgrid/cli/dsgrid_admin.py +92 -0
  18. dsgrid/cli/install_notebooks.py +62 -0
  19. dsgrid/cli/query.py +729 -0
  20. dsgrid/cli/registry.py +1862 -0
  21. dsgrid/cloud/__init__.py +0 -0
  22. dsgrid/cloud/cloud_storage_interface.py +140 -0
  23. dsgrid/cloud/factory.py +31 -0
  24. dsgrid/cloud/fake_storage_interface.py +37 -0
  25. dsgrid/cloud/s3_storage_interface.py +156 -0
  26. dsgrid/common.py +36 -0
  27. dsgrid/config/__init__.py +0 -0
  28. dsgrid/config/annual_time_dimension_config.py +194 -0
  29. dsgrid/config/common.py +142 -0
  30. dsgrid/config/config_base.py +148 -0
  31. dsgrid/config/dataset_config.py +907 -0
  32. dsgrid/config/dataset_schema_handler_factory.py +46 -0
  33. dsgrid/config/date_time_dimension_config.py +136 -0
  34. dsgrid/config/dimension_config.py +54 -0
  35. dsgrid/config/dimension_config_factory.py +65 -0
  36. dsgrid/config/dimension_mapping_base.py +350 -0
  37. dsgrid/config/dimension_mappings_config.py +48 -0
  38. dsgrid/config/dimensions.py +1025 -0
  39. dsgrid/config/dimensions_config.py +71 -0
  40. dsgrid/config/file_schema.py +190 -0
  41. dsgrid/config/index_time_dimension_config.py +80 -0
  42. dsgrid/config/input_dataset_requirements.py +31 -0
  43. dsgrid/config/mapping_tables.py +209 -0
  44. dsgrid/config/noop_time_dimension_config.py +42 -0
  45. dsgrid/config/project_config.py +1462 -0
  46. dsgrid/config/registration_models.py +188 -0
  47. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  48. dsgrid/config/simple_models.py +49 -0
  49. dsgrid/config/supplemental_dimension.py +29 -0
  50. dsgrid/config/time_dimension_base_config.py +192 -0
  51. dsgrid/data_models.py +155 -0
  52. dsgrid/dataset/__init__.py +0 -0
  53. dsgrid/dataset/dataset.py +123 -0
  54. dsgrid/dataset/dataset_expression_handler.py +86 -0
  55. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  56. dsgrid/dataset/dataset_schema_handler_base.py +945 -0
  57. dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
  58. dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
  59. dsgrid/dataset/growth_rates.py +162 -0
  60. dsgrid/dataset/models.py +51 -0
  61. dsgrid/dataset/table_format_handler_base.py +257 -0
  62. dsgrid/dataset/table_format_handler_factory.py +17 -0
  63. dsgrid/dataset/unpivoted_table.py +121 -0
  64. dsgrid/dimension/__init__.py +0 -0
  65. dsgrid/dimension/base_models.py +230 -0
  66. dsgrid/dimension/dimension_filters.py +308 -0
  67. dsgrid/dimension/standard.py +252 -0
  68. dsgrid/dimension/time.py +352 -0
  69. dsgrid/dimension/time_utils.py +103 -0
  70. dsgrid/dsgrid_rc.py +88 -0
  71. dsgrid/exceptions.py +105 -0
  72. dsgrid/filesystem/__init__.py +0 -0
  73. dsgrid/filesystem/cloud_filesystem.py +32 -0
  74. dsgrid/filesystem/factory.py +32 -0
  75. dsgrid/filesystem/filesystem_interface.py +136 -0
  76. dsgrid/filesystem/local_filesystem.py +74 -0
  77. dsgrid/filesystem/s3_filesystem.py +118 -0
  78. dsgrid/loggers.py +132 -0
  79. dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
  80. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
  81. dsgrid/notebooks/registration.ipynb +48 -0
  82. dsgrid/notebooks/start_notebook.sh +11 -0
  83. dsgrid/project.py +451 -0
  84. dsgrid/query/__init__.py +0 -0
  85. dsgrid/query/dataset_mapping_plan.py +142 -0
  86. dsgrid/query/derived_dataset.py +388 -0
  87. dsgrid/query/models.py +728 -0
  88. dsgrid/query/query_context.py +287 -0
  89. dsgrid/query/query_submitter.py +994 -0
  90. dsgrid/query/report_factory.py +19 -0
  91. dsgrid/query/report_peak_load.py +70 -0
  92. dsgrid/query/reports_base.py +20 -0
  93. dsgrid/registry/__init__.py +0 -0
  94. dsgrid/registry/bulk_register.py +165 -0
  95. dsgrid/registry/common.py +287 -0
  96. dsgrid/registry/config_update_checker_base.py +63 -0
  97. dsgrid/registry/data_store_factory.py +34 -0
  98. dsgrid/registry/data_store_interface.py +74 -0
  99. dsgrid/registry/dataset_config_generator.py +158 -0
  100. dsgrid/registry/dataset_registry_manager.py +950 -0
  101. dsgrid/registry/dataset_update_checker.py +16 -0
  102. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  103. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  104. dsgrid/registry/dimension_registry_manager.py +413 -0
  105. dsgrid/registry/dimension_update_checker.py +16 -0
  106. dsgrid/registry/duckdb_data_store.py +207 -0
  107. dsgrid/registry/filesystem_data_store.py +150 -0
  108. dsgrid/registry/filter_registry_manager.py +123 -0
  109. dsgrid/registry/project_config_generator.py +57 -0
  110. dsgrid/registry/project_registry_manager.py +1623 -0
  111. dsgrid/registry/project_update_checker.py +48 -0
  112. dsgrid/registry/registration_context.py +223 -0
  113. dsgrid/registry/registry_auto_updater.py +316 -0
  114. dsgrid/registry/registry_database.py +667 -0
  115. dsgrid/registry/registry_interface.py +446 -0
  116. dsgrid/registry/registry_manager.py +558 -0
  117. dsgrid/registry/registry_manager_base.py +367 -0
  118. dsgrid/registry/versioning.py +92 -0
  119. dsgrid/rust_ext/__init__.py +14 -0
  120. dsgrid/rust_ext/find_minimal_patterns.py +129 -0
  121. dsgrid/spark/__init__.py +0 -0
  122. dsgrid/spark/functions.py +589 -0
  123. dsgrid/spark/types.py +110 -0
  124. dsgrid/tests/__init__.py +0 -0
  125. dsgrid/tests/common.py +140 -0
  126. dsgrid/tests/make_us_data_registry.py +265 -0
  127. dsgrid/tests/register_derived_datasets.py +103 -0
  128. dsgrid/tests/utils.py +25 -0
  129. dsgrid/time/__init__.py +0 -0
  130. dsgrid/time/time_conversions.py +80 -0
  131. dsgrid/time/types.py +67 -0
  132. dsgrid/units/__init__.py +0 -0
  133. dsgrid/units/constants.py +113 -0
  134. dsgrid/units/convert.py +71 -0
  135. dsgrid/units/energy.py +145 -0
  136. dsgrid/units/power.py +87 -0
  137. dsgrid/utils/__init__.py +0 -0
  138. dsgrid/utils/dataset.py +830 -0
  139. dsgrid/utils/files.py +179 -0
  140. dsgrid/utils/filters.py +125 -0
  141. dsgrid/utils/id_remappings.py +100 -0
  142. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  143. dsgrid/utils/py_expression_eval/README.md +8 -0
  144. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  145. dsgrid/utils/py_expression_eval/tests.py +283 -0
  146. dsgrid/utils/run_command.py +70 -0
  147. dsgrid/utils/scratch_dir_context.py +65 -0
  148. dsgrid/utils/spark.py +918 -0
  149. dsgrid/utils/spark_partition.py +98 -0
  150. dsgrid/utils/timing.py +239 -0
  151. dsgrid/utils/utilities.py +221 -0
  152. dsgrid/utils/versioning.py +36 -0
  153. dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
  154. dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
  155. dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
  156. dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
  157. dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
dsgrid/notebooks/connect_to_dsgrid_registry.ipynb
@@ -0,0 +1,949 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "93935e00",
+ "metadata": {},
+ "source": [
+ "## 1. initialize"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "59c46803",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-28T22:18:35.922502Z",
+ "start_time": "2022-06-28T22:18:35.770022Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from pathlib import Path\n",
+ "import os\n",
+ "import getpass\n",
+ "import shutil\n",
+ "\n",
+ "from pyspark.sql import SparkSession\n",
+ "from pyspark import SparkConf, SparkContext"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e604690d",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-28T22:18:37.067792Z",
+ "start_time": "2022-06-28T22:18:37.059186Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "os.environ"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5c7e730c",
+ "metadata": {},
+ "source": [
+ "## 2. start spark cluster"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c0fac86c",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-28T22:18:38.952702Z",
+ "start_time": "2022-06-28T22:18:38.947792Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# tweak settings here:\n",
+ "def init_spark(cluster=None, name=\"dsgrid\", tz=\"UTC\"):\n",
+ "    \"\"\"Initialize a SparkSession.\"\"\"\n",
+ "    conf = SparkConf().setAppName(name)\n",
+ "\n",
+ "    if cluster is None:\n",
+ "        spark = SparkSession.builder.master(\"local\").appName(name).getOrCreate()\n",
+ "    elif cluster == \"AWS\":\n",
+ "        # no need to setMaster for an AWS cluster\n",
+ "        pass\n",
+ "    else:\n",
+ "        conf = conf.setMaster(cluster)\n",
+ "    conf = conf.setAll(\n",
+ "        [\n",
+ "            # (\"spark.sql.shuffle.partitions\", \"200\"),\n",
+ "            # (\"spark.executor.instances\", \"7\"),\n",
+ "            # (\"spark.executor.cores\", \"5\"),\n",
+ "            # (\"spark.executor.memory\", \"10g\"),\n",
+ "            # (\"spark.driver.memory\", \"10g\"),\n",
+ "            # (\"spark.dynamicAllocation.enabled\", True),\n",
+ "            # (\"spark.shuffle.service.enabled\", True),\n",
+ "            (\"spark.sql.session.timeZone\", tz),\n",
+ "        ]\n",
+ "    )\n",
+ "    spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
+ "    return spark"
+ ]
+ },
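+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "editor-sketch-init",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Editor's usage sketch (not in the original notebook): a minimal local-mode\n",
+ "# call to init_spark, mirroring the cluster-selection cell below. Kept\n",
+ "# commented out so it does not conflict with the session created there.\n",
+ "# spark = init_spark(cluster=None, name=\"dsgrid-local\", tz=\"UTC\")\n",
+ "# spark.sparkContext"
+ ]
+ },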
+ {
+ "cell_type": "markdown",
+ "id": "45a0e446",
+ "metadata": {},
+ "source": [
+ "To launch a standalone cluster or a cluster on Kestrel, follow the **instructions** here: \\\n",
+ "https://github.com/dsgrid/dsgrid/tree/main/dev#spark-standalone-cluster\n",
+ "\n",
+ "Then uncomment and update the cluster name below accordingly:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fd3c099a",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-28T22:18:59.645718Z",
+ "start_time": "2022-06-28T22:18:41.443768Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "main_tz = \"EST\" # <--- UTC, EST\n",
+ "\n",
+ "### STAND-ALONE CLUSTER\n",
+ "# cluster = \"spark://lliu2-34727s:7077\"\n",
+ "# name = \"stand-alone\"\n",
+ "\n",
+ "### CLUSTER ON HPC - Type in nodename\n",
+ "# NODENAME = \"r103u23\" # <--- change after deploying cluster\n",
+ "# cluster = f\"spark://{NODENAME}.ib0.cm.hpc.nrel.gov:7077\"\n",
+ "# name = \"HPC\"\n",
+ "\n",
+ "### CLUSTER ON HPC - Get cluster from file dropped by prep_spark_cluster_notebook.py\n",
+ "# import toml\n",
+ "# config = toml.load(\"cluster.toml\")\n",
+ "# cluster = config[\"cluster\"]\n",
+ "# name = \"HPC\"\n",
+ "\n",
+ "### LOCAL MODE\n",
+ "# cluster = None\n",
+ "# name = \"local\"\n",
+ "\n",
+ "### AWS MODE\n",
+ "cluster = \"AWS\"\n",
+ "name = \"AWS\"\n",
+ "\n",
+ "# Initialize\n",
+ "spark = init_spark(cluster, \"dsgrid-load\", tz=main_tz)\n",
+ "\n",
+ "# get the Spark context (its repr links to the Spark UI)\n",
+ "sc = spark.sparkContext\n",
+ "sc"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5c91e3b2",
+ "metadata": {},
+ "source": [
+ "#### The *Spark UI* link above works only for local mode. For the Spark UI of an HPC cluster, use:\n",
+ "http://localhost:8080"
+ ]
+ },
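+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "editor-sketch-ui",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Editor's sketch (added): sc.uiWebUrl reports the driver's Spark UI address\n",
+ "# for the current session, useful when the port assumptions above do not hold.\n",
+ "print(sc.uiWebUrl)"
+ ]
+ },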
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1f4214f4",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-28T22:18:59.689409Z",
+ "start_time": "2022-06-28T22:18:59.647919Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "for x in sorted(sc.getConf().getAll()):\n",
+ "    print(x)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b33cfadc",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d8b37296",
+ "metadata": {},
+ "source": [
+ "## 3. dsgrid"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aa24da30",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-28T22:18:59.889562Z",
+ "start_time": "2022-06-28T22:18:59.691274Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from IPython.display import display, HTML\n",
+ "\n",
+ "display(HTML(\"<style>.container { width:100% !important; }</style>\"))\n",
+ "import pandas as pd\n",
+ "\n",
+ "pd.set_option(\"display.max_rows\", 20)\n",
+ "# import plotly\n",
+ "# pd.options.plotting.backend = \"plotly\"\n",
+ "import numpy as np\n",
+ "import itertools\n",
+ "import pytz\n",
+ "from datetime import datetime, timedelta\n",
+ "\n",
+ "from semver import VersionInfo\n",
+ "from pydantic import ValidationError\n",
+ "import pyspark.sql.functions as F\n",
+ "import pyspark.sql.types as sparktypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "239217ef",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-28T22:19:00.152343Z",
+ "start_time": "2022-06-28T22:18:59.891604Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from dsgrid.common import LOCAL_REGISTRY\n",
+ "from dsgrid.registry.registry_manager import RegistryManager\n",
+ "from dsgrid.utils.files import load_data\n",
+ "from dsgrid.utils.spark import create_dataframe, read_dataframe, get_unique_values\n",
+ "from dsgrid.dimension.base_models import DimensionType\n",
+ "from dsgrid.dataset.dataset import Dataset\n",
+ "from dsgrid.project import Project\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "02cec913",
+ "metadata": {},
+ "source": [
+ "## 3.1. Check dsgrid registry"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2ce24d62",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-28T22:19:05.931097Z",
+ "start_time": "2022-06-28T22:19:05.927089Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "## sync the registry and then load it offline\n",
+ "# LOCAL_REGISTRY = \"s3://nrel-dsgrid-registry-archive\"\n",
+ "registry_path = os.getenv(\"DSGRID_REGISTRY_PATH\", default=LOCAL_REGISTRY)\n",
+ "registry_path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f15cdc99",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-28T22:19:40.741402Z",
+ "start_time": "2022-06-28T22:19:11.930311Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "sync_and_pull = True # <--- registry config only\n",
+ "if sync_and_pull:\n",
+ "    print(f\"syncing registry: {registry_path}\")\n",
+ "    RegistryManager.load(registry_path, offline_mode=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bd2c937c",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-28T22:19:41.397174Z",
+ "start_time": "2022-06-28T22:19:40.743809Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "offline_mode = True # <---\n",
+ "\n",
+ "registry_mgr = RegistryManager.load(registry_path, offline_mode=offline_mode)\n",
+ "project_mgr = registry_mgr.project_manager\n",
+ "dataset_mgr = registry_mgr.dataset_manager\n",
+ "dim_map_mgr = registry_mgr.dimension_mapping_manager\n",
+ "dim_mgr = registry_mgr.dimension_manager\n",
+ "print(f\"Loaded dsgrid registry at: {registry_path}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "23c1b83f",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-28T22:19:48.637994Z",
+ "start_time": "2022-06-28T22:19:41.399044Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "project_mgr.show(max_width=30, drop_fields=[\"Date\", \"Submitter\"])"
+ ]
+ },
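+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "editor-sketch-managers",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Editor's sketch (added): the other registry managers expose a similar\n",
+ "# show() listing. Assumption: show() also works with default arguments,\n",
+ "# as the project_mgr call above suggests.\n",
+ "dataset_mgr.show()\n",
+ "dim_mgr.show()"
+ ]
+ },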
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "37b93d4f",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d4815918",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T01:51:06.629410Z",
+ "start_time": "2022-06-18T01:51:06.627155Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# %%timeit\n",
+ "# ## Dan's test\n",
+ "# from dsgrid.config.time_dimension_base_config import TimeDimensionBaseConfig\n",
+ "\n",
+ "# i = 0\n",
+ "# for d_id in registry_mgr.dimension_manager._id_to_type:\n",
+ "#     config = registry_mgr.dimension_manager.get_by_id(d_id)\n",
+ "#     if not isinstance(config, TimeDimensionBaseConfig):\n",
+ "#         config.get_records_dataframe().count()\n",
+ "#         i += 1\n",
+ "\n",
+ "# print(i)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "484d7f4c",
+ "metadata": {},
+ "source": [
+ "## 3.2. Load Project\n",
+ "This section is mostly exploratory (for *Section 4. Queries*, you only need to load the project).\n",
+ "\n",
+ "#### Some user criteria:\n",
+ "At the project level, I want to be able to:\n",
+ "- Examine what's available in the project:\n",
+ "  * Show project dimensions by type and resolution by type, regardless of base/supplemental, mappings, or id\n",
+ "  * Get unique records by dimension/resolution (see the sketch in the cell below)\n",
+ "  * Get unique records by selected dimension sets\n",
+ "  * Show mapped datasets\n",
+ "  * Show units (or select a unit of analysis) and fuel types\n",
+ "- Make queries using:\n",
+ "  * Project dimensions + fuel types + time resolutions\n",
+ "  * Get all types of statistics (max, mean, min, percentiles, count, sum)\n",
+ "\n",
+ "- Dataset level: never mapped (think TEMPO)\n",
+ "- An interface to allow for query optimization\n",
+ "\n",
+ "#### Notes:\n",
+ " * Project_manager has access to all other managers.\n",
+ " * Each manager has the responsibility to retrieve configs.\n",
+ " * Access ConfigModel from configs."
+ ]
+ },
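+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "editor-sketch-records",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Editor's sketch (added, illustrative): one way to pull unique records for a\n",
+ "# registered dimension. The dimension id is a hypothetical placeholder;\n",
+ "# get_unique_values was imported from dsgrid.utils.spark above.\n",
+ "# dim_config = dim_mgr.get_by_id(\"<dimension_id>\")  # hypothetical id\n",
+ "# records = dim_config.get_records_dataframe()\n",
+ "# get_unique_values(records, \"id\")"
+ ]
+ },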
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8083ad86",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T01:51:08.162466Z",
+ "start_time": "2022-06-18T01:51:06.631419Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# load project\n",
+ "project_id = \"dsgrid_conus_2022\" # <---\n",
+ "project = project_mgr.load_project(project_id)\n",
+ "\n",
+ "print(\"project loaded\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cd51fbba",
+ "metadata": {},
+ "source": [
+ "## 3.3. Load Project Datasets"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "89660b28",
+ "metadata": {},
+ "source": [
+ "### 3.3.3. TEMPO\n",
+ "\n",
+ "Load and check the TEMPO dataset here."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "13295901",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T01:51:13.121796Z",
+ "start_time": "2022-06-18T01:51:08.166523Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "dataset_id = \"tempo_conus_2022\" # <----\n",
+ "project.load_dataset(dataset_id)\n",
+ "tempo = project.get_dataset(dataset_id)\n",
+ "print(\"tempo dataset loaded\")"
+ ]
+ },
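+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "editor-sketch-schema",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Editor's check (added): inspect the raw TEMPO tables before remapping; the\n",
+ "# next cells use these same load_data/load_data_lookup attributes.\n",
+ "tempo.load_data.printSchema()\n",
+ "tempo.load_data_lookup.printSchema()"
+ ]
+ },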
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fbce31e1",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T01:51:13.126258Z",
+ "start_time": "2022-06-18T01:51:13.123686Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "### TO BE DELETED ###\n",
+ "tempo_load_data_lookup = tempo.load_data_lookup\n",
+ "tempo_load_data = tempo.load_data\n",
+ "\n",
+ "# file = \"/scratch/dthom/tempo_load_data3.parquet\" # <---\n",
+ "# tempo_load_data = spark.read.parquet(file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9211fdc6",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T01:51:13.697017Z",
+ "start_time": "2022-06-18T01:51:13.128635Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "tempo_mapped_load_data_lookup = tempo._handler._remap_dimension_columns(tempo_load_data_lookup)\n",
+ "tempo_mapped_load_data = tempo._handler._remap_dimension_columns(tempo_load_data)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8037d63a",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T01:51:13.701676Z",
+ "start_time": "2022-06-18T01:51:13.699034Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "del tempo_load_data_lookup\n",
+ "del tempo_load_data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2719ca95",
+ "metadata": {},
+ "source": [
+ "## 4. Queries\n",
+ "### Query util functions"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3a58e461",
+ "metadata": {},
+ "source": [
+ "### 4.1. Hourly electricity consumption by *scenario, model_year, and ReEDS PCA*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "95324872",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T01:51:14.175522Z",
+ "start_time": "2022-06-18T01:51:13.703388Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "### all_enduses-to-electric_enduses map\n",
+ "\n",
+ "dim_map_id = \"conus-2022-detailed-end-uses-kwh__all-electric-end-uses__c4149547-1209-4ce3-bb4c-3ab292067e8a\" # <---\n",
+ "electric_enduses_map = dim_map_mgr.get_by_id(dim_map_id).get_records_dataframe()\n",
+ "\n",
+ "### get all project electric end uses\n",
+ "electric_enduses = (\n",
+ "    electric_enduses_map.filter(\"to_id is not NULL\")\n",
+ "    .select(\"from_id\")\n",
+ "    .toPandas()[\"from_id\"]\n",
+ "    .to_list()\n",
+ ")\n",
+ "electric_enduses"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fc6700c0",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T01:51:14.483047Z",
+ "start_time": "2022-06-18T01:51:14.177069Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "### county-to-PCA map\n",
+ "dim_map_id = \"us_counties_2020_l48__reeds_pca__fcc554e1-87c9-483f-89e3-a0df9563cf89\" # <---\n",
+ "county_to_pca_map = dim_map_mgr.get_by_id(dim_map_id).get_records_dataframe()\n",
+ "county_to_pca_map.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "16a76b85",
+ "metadata": {},
+ "source": [
+ "### 4.1.3. TEMPO\n",
+ "Query TEMPO data here."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c590dc5e",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T01:51:15.726162Z",
+ "start_time": "2022-06-18T01:51:14.485411Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "## Load timezone map (not registered)\n",
+ "timezone_file = \"s3://nrel-dsgrid-int-scratch/scratch-lliu2/county_fip_to_local_prevailing_time.csv\" # \"/scratch/lliu2/project_county_timezone/county_fip_to_local_prevailing_time.csv\"\n",
+ "tz_map = spark.read.csv(timezone_file, header=True)\n",
+ "tz_map = tz_map.withColumn(\"from_fraction\", F.lit(1))\n",
+ "tz_map.show()"
+ ]
+ },
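+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "editor-sketch-tzmap",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Editor's sanity check (added): each county should map to a single timezone\n",
+ "# with fraction 1, so any from_id appearing more than once flags a bad map.\n",
+ "tz_map.groupBy(\"from_id\").count().filter(\"count > 1\").show()"
+ ]
+ },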
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "87a88b0c",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T01:51:15.733246Z",
+ "start_time": "2022-06-18T01:51:15.728420Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "### get electric end uses for transportation\n",
+ "tra_elec_enduses = [col for col in tempo_mapped_load_data.columns if col in electric_enduses]\n",
+ "tra_elec_enduses"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4970e36b",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T01:51:15.737460Z",
+ "start_time": "2022-06-18T01:51:15.735225Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "### TO BE DELETED\n",
+ "# tempo_mapped_load_data_lookup = tempo_mapped_load_data_lookup.filter(\"id in ('1621180393', '770011011', '1058530452')\")\n",
+ "# tempo_mapped_load_data = tempo_mapped_load_data.filter(\"id in ('1621180393', '770011011', '1058530452')\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "54adf133",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T01:51:15.762916Z",
+ "start_time": "2022-06-18T01:51:15.739144Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "## 0. consolidate load_data: get total hourly electricity consumption by id\n",
+ "# make get_time_cols accessible at dataset level\n",
+ "tra_elec_kwh = tempo_mapped_load_data.select(\n",
+ "    \"id\",\n",
+ "    \"day_of_week\",\n",
+ "    \"hour\",\n",
+ "    \"month\",\n",
+ "    sum([F.col(col) for col in tra_elec_enduses]).alias(\"electricity\"),\n",
+ ")\n",
+ "# tra_elec_kwh.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "14e42de4",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T01:51:15.831343Z",
+ "start_time": "2022-06-18T01:51:15.765116Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "## 1. map load_data_lookup to timezone\n",
+ "load_data_lookup = (\n",
+ "    tempo_mapped_load_data_lookup.filter(\"id is not NULL\")\n",
+ "    .select(\"sector\", \"scenario\", \"model_year\", \"geography\", \"id\", \"fraction\")\n",
+ "    .join(\n",
+ "        tz_map,\n",
+ "        on=F.col(\"geography\") == tz_map.from_id,\n",
+ "        how=\"left\",\n",
+ "    )\n",
+ "    .drop(\"from_id\")\n",
+ "    .withColumnRenamed(\"to_id\", \"timezone\")\n",
+ ")\n",
+ "\n",
+ "## combine fractions\n",
+ "nonfraction_cols = [x for x in load_data_lookup.columns if x not in {\"fraction\", \"from_fraction\"}]\n",
+ "load_data_lookup = load_data_lookup.fillna(1, subset=[\"from_fraction\"]).selectExpr(\n",
+ "    *nonfraction_cols, \"fraction*from_fraction AS fraction\"\n",
+ ")\n",
+ "# load_data_lookup.show()"
+ ]
+ },
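+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "editor-sketch-nulltz",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Editor's check (added): the left join above keeps counties that are missing\n",
+ "# from tz_map; any row with a null timezone here would silently drop out of\n",
+ "# the time expansion in step 4.\n",
+ "print(load_data_lookup.filter(\"timezone is NULL\").count())"
+ ]
+ },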
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b1edc12e",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T01:51:15.897261Z",
+ "start_time": "2022-06-18T01:51:15.833472Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "## 2. join load_data and lookup\n",
+ "tra_elec_kwh = load_data_lookup.join(\n",
+ "    tra_elec_kwh,\n",
+ "    on=\"id\",\n",
+ "    how=\"left\",\n",
+ ").drop(\"id\")\n",
+ "\n",
+ "tra_elec_kwh = tra_elec_kwh.groupBy(\n",
+ "    \"sector\",\n",
+ "    \"scenario\",\n",
+ "    \"geography\",\n",
+ "    \"model_year\",\n",
+ "    \"timezone\",\n",
+ "    \"day_of_week\",\n",
+ "    \"month\",\n",
+ "    \"hour\",\n",
+ ").agg(F.sum(F.col(\"fraction\") * F.col(\"electricity\")).alias(\"electricity\"))\n",
+ "\n",
+ "## cache df\n",
+ "# tra_elec_kwh = tra_elec_kwh.cache()\n",
+ "# tra_elec_kwh.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3b9756a8",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T01:51:17.848803Z",
+ "start_time": "2022-06-18T01:51:15.899510Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "from dsgrid.dimension.time import TimeZone  # assumed import; TimeZone was not imported in the original notebook\n",
+ "\n",
+ "year = 2012 # <--- weather year\n",
+ "sys_tz = TimeZone.EST.tz\n",
+ "timezones_local = [TimeZone.EPT, TimeZone.CPT, TimeZone.MPT, TimeZone.PPT]\n",
+ "\n",
+ "## 3. create range of model_year\n",
+ "model_time_pd = []\n",
+ "for tz in timezones_local:\n",
+ "    model_time_df = pd.DataFrame()\n",
+ "    # create time range in local time\n",
+ "    model_time_df[\"timestamp\"] = pd.date_range(\n",
+ "        start=datetime(year=int(year), month=1, day=1, hour=0),\n",
+ "        end=datetime(year=int(year), month=12, day=31, hour=23),\n",
+ "        tz=tz.tz,\n",
+ "        freq=\"H\",\n",
+ "    )\n",
+ "    model_time_df[\"timezone\"] = tz.value\n",
+ "    model_time_df[\"day_of_week\"] = model_time_df[\"timestamp\"].dt.day_of_week.astype(str)\n",
+ "    model_time_df[\"month\"] = model_time_df[\"timestamp\"].dt.month.astype(str)\n",
+ "    model_time_df[\"hour\"] = model_time_df[\"timestamp\"].dt.hour.astype(str)\n",
+ "\n",
+ "    # convert to main timezone\n",
+ "    model_time_df[\"timestamp\"] = model_time_df[\"timestamp\"].dt.tz_convert(sys_tz)\n",
+ "    # wrap time to year\n",
+ "    model_time_df[\"timestamp\"] = model_time_df[\"timestamp\"].apply(lambda x: x.replace(year=year))\n",
+ "\n",
+ "    model_time_pd.append(model_time_df)\n",
+ "\n",
+ "model_time_pd = pd.concat(model_time_pd, axis=0, ignore_index=True)\n",
+ "model_time_pd[\"timestamp\"] = (\n",
+ "    model_time_pd[\"timestamp\"].dt.tz_localize(None).astype(str)\n",
+ ")  # convert timestamp to str, this is important!\n",
+ "print(model_time_pd)\n",
+ "\n",
+ "# convert to spark df\n",
+ "schema = sparktypes.StructType(\n",
+ "    [\n",
+ "        sparktypes.StructField(\"timestamp\", sparktypes.StringType(), False),\n",
+ "        sparktypes.StructField(\"timezone\", sparktypes.StringType(), False),\n",
+ "        sparktypes.StructField(\"day_of_week\", sparktypes.StringType(), False),\n",
+ "        sparktypes.StructField(\"month\", sparktypes.StringType(), False),\n",
+ "        sparktypes.StructField(\"hour\", sparktypes.StringType(), False),\n",
+ "    ]\n",
+ ")\n",
+ "model_time = spark.createDataFrame(model_time_pd, schema=schema)\n",
+ "\n",
+ "## convert timestamp from str to timestamp\n",
+ "model_time = model_time.withColumn(\n",
+ "    \"timestamp\",\n",
+ "    F.from_unixtime(\n",
+ "        F.unix_timestamp(F.col(\"timestamp\"), \"yyyy-MM-dd HH:mm:ss\"), \"yyyy-MM-dd HH:mm:ss\"\n",
+ "    ),\n",
+ ")\n",
+ "model_time = model_time.withColumn(\"timestamp\", F.to_timestamp(\"timestamp\"))\n",
+ "model_time = model_time.cache()\n",
+ "\n",
+ "model_time.printSchema()\n",
+ "print(model_time.count())\n",
+ "model_time.show()"
+ ]
+ },
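+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "editor-sketch-leap",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Editor's check (added): 2012 is a leap year, so each of the four local\n",
+ "# timezones should contribute 8784 hourly timestamps (366 * 24).\n",
+ "model_time.groupBy(\"timezone\").count().show()"
+ ]
+ },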
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "554bc22f",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T01:51:17.875693Z",
+ "start_time": "2022-06-18T01:51:17.851121Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "## 4. expand to model_years\n",
+ "tra_elec_kwh = model_time.join(\n",
+ "    tra_elec_kwh, on=[\"timezone\", \"day_of_week\", \"month\", \"hour\"], how=\"right\"\n",
+ ").drop(\"day_of_week\", \"month\", \"hour\")\n",
+ "\n",
+ "## cache df\n",
+ "# tra_elec_kwh = tra_elec_kwh.cache()\n",
+ "# tra_elec_kwh.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "734e3970",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T01:51:17.915745Z",
+ "start_time": "2022-06-18T01:51:17.878004Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "# 5. map load_data_lookup to PCA\n",
+ "tra_elec_kwh = (\n",
+ "    tra_elec_kwh.join(\n",
+ "        county_to_pca_map, on=F.col(\"geography\") == county_to_pca_map.from_id, how=\"left\"\n",
+ "    )\n",
+ "    .drop(\"from_id\")\n",
+ "    .drop(\"geography\")\n",
+ "    .withColumnRenamed(\"to_id\", \"geography\")\n",
+ "    .groupBy(\"sector\", \"scenario\", \"geography\", \"model_year\", \"timestamp\")\n",
+ "    .agg(F.sum(\"electricity\").alias(\"electricity\"))\n",
+ ")\n",
+ "\n",
+ "# tra_elec_kwh.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "13233566",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T03:28:13.295454Z",
+ "start_time": "2022-06-18T01:51:17.917568Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "### 6. save as partitions\n",
+ "tra_output_file = \"s3://nrel-dsgrid-int-scratch/scratch-lliu2/tempo_projections.parquet\" # Path(f\"/scratch/{getpass.getuser()}/tempo_projections.parquet\")\n",
+ "\n",
+ "# refresh the file dir\n",
+ "if Path(tra_output_file).exists():\n",
+ "    shutil.rmtree(tra_output_file)\n",
+ "\n",
+ "if Path(tra_output_file).exists():\n",
+ "    raise ValueError(\n",
+ "        f\"file: {tra_output_file} already exists. `shutil.rmtree(tra_output_file)` to override.\"\n",
+ "    )\n",
+ "\n",
+ "tra_elec_kwh.sort(\"scenario\", \"model_year\", \"geography\", \"timestamp\").repartition(\n",
+ "    \"scenario\", \"model_year\"\n",
+ ").write.partitionBy(\"scenario\", \"model_year\").option(\"path\", tra_output_file).saveAsTable(\n",
+ "    \"tra_elec_kwh\", format=\"parquet\"\n",
+ ")\n",
+ "\n",
+ "print(\"tra_elec_kwh saved\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "853ffc6d",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T03:28:13.301224Z",
+ "start_time": "2022-06-18T03:28:13.297395Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# %%time\n",
+ "# ########## load transportation projection data ###########\n",
+ "# tra_output_file = \"s3://nrel-dsgrid-int-scratch/scratch-lliu2/tempo_projections.parquet\" # Path(f\"/scratch/{getpass.getuser()}/tempo_projections.parquet\")\n",
+ "\n",
+ "# if Path(tra_output_file).exists():\n",
+ "#     tra_elec_kwh = read_dataframe(tra_output_file)\n",
+ "#     print(\"tra_elec_kwh loaded\")\n",
+ "# else:\n",
+ "#     print(f\"tra_output_file={tra_output_file} does not exist\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "28f6277a",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-06-18T05:24:12.421483Z",
+ "start_time": "2022-06-18T04:03:24.836808Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "ts = tra_elec_kwh.groupBy(\"timestamp\").count().orderBy(\"timestamp\").toPandas()\n",
+ "ts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "87d8b42a",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.8.10 ('dsgrid')",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.10"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "2458d4f391e03ccae12714782d51aa387d09e7b7a16d6832b1f2bffaf5a9bcc2"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }