buildstock-fetch 1.3.1__py3-none-any.whl → 1.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of buildstock-fetch might be problematic.
- buildstock_fetch/data/buildstock_releases.json +133 -83
- buildstock_fetch/data/buildstock_upgrades_lookup.json +23 -0
- buildstock_fetch/main.py +580 -118
- buildstock_fetch/main_cli.py +108 -19
- {buildstock_fetch-1.3.1.dist-info → buildstock_fetch-1.4.3.dist-info}/METADATA +32 -13
- {buildstock_fetch-1.3.1.dist-info → buildstock_fetch-1.4.3.dist-info}/RECORD +10 -9
- {buildstock_fetch-1.3.1.dist-info → buildstock_fetch-1.4.3.dist-info}/WHEEL +0 -0
- {buildstock_fetch-1.3.1.dist-info → buildstock_fetch-1.4.3.dist-info}/entry_points.txt +0 -0
- {buildstock_fetch-1.3.1.dist-info → buildstock_fetch-1.4.3.dist-info}/licenses/LICENSE +0 -0
- {buildstock_fetch-1.3.1.dist-info → buildstock_fetch-1.4.3.dist-info}/top_level.txt +0 -0
buildstock_fetch/main.py
CHANGED
@@ -1,15 +1,20 @@
 import concurrent.futures
+import gc
 import json
+import os
 import tempfile
 import zipfile
 from dataclasses import asdict, dataclass
 from datetime import timedelta
 from importlib.resources import files
 from pathlib import Path
-from typing import Optional, Union
+from typing import Any, Optional, Union

+import boto3
 import polars as pl
 import requests
+from botocore import UNSIGNED
+from botocore.config import Config
 from rich.console import Console
 from rich.progress import (
     BarColumn,
@@ -23,6 +28,8 @@ from rich.progress import (
     TransferSpeedColumn,
 )

+# from buildstock_fetch.main_cli import _get_all_available_releases
+

 class InvalidProductError(ValueError):
     """Raised when an invalid product is provided."""
@@ -96,6 +103,7 @@ class RequestedFileTypes:
     load_curve_daily: bool = False
     load_curve_monthly: bool = False
     load_curve_annual: bool = False
+    trip_schedules: bool = False
     weather: bool = False


@@ -193,10 +201,14 @@ class BuildingID:
             return f"{self.base_url}metadata/upgrade{str(int(self.upgrade_id)).zfill(2)}.parquet"
         elif self.release_year == "2024":
             if self.res_com == "comstock" and self.weather == "amy2018" and self.release_number == "2":
-
-
-
-
+                if self.upgrade_id == "0":
+                    upgrade_filename = "baseline"
+                else:
+                    upgrade_filename = f"upgrade{str(int(self.upgrade_id)).zfill(2)}"
+                return (
+                    f"{self.base_url}metadata_and_annual_results/by_state_and_county/full/parquet/"
+                    f"state={self.state}/county={self._get_county_name()}/{self.state}_{self._get_county_name()}_{upgrade_filename}.parquet"
+                )
             else:
                 if self.upgrade_id == "0":
                     return f"{self.base_url}metadata/baseline.parquet"
@@ -206,12 +218,12 @@ class BuildingID:
             self.release_year == "2025"
             and self.res_com == "comstock"
             and self.weather == "amy2018"
-            and self.release_number == "1"
+            and (self.release_number == "1" or self.release_number == "2")
         ):
-            return
-
-
-
+            return (
+                f"{self.base_url}metadata_and_annual_results/by_state_and_county/full/parquet/"
+                f"state={self.state}/county={self._get_county_name()}/{self.state}_{self._get_county_name()}_upgrade{self.upgrade_id}.parquet"
+            )
         else:
             return ""

@@ -630,6 +642,21 @@ def _download_with_progress(url: str, output_file: Path, progress: Progress, tas
     return downloaded_size


+def _extract_metadata_columns_to_keep(metadata_file: Path) -> list[str]:
+    """Extract metadata columns from a schema."""
+    schema = pl.scan_parquet(metadata_file).collect_schema()
+
+    columns_to_keep = []
+    for col in schema:
+        if (
+            any(keyword in col for keyword in ["upgrade", "bldg_id", "metadata_index"])
+            or col.startswith("in.")
+            or col.startswith("in.")
+        ):
+            columns_to_keep.append(col)
+    return columns_to_keep
+
+
 def _download_with_progress_metadata(url: str, output_file: Path, progress: Progress, task_id: TaskID) -> int:
     """Download a metadata file with progress tracking and append to existing file if it exists."""
     # Get file size first
@@ -646,36 +673,26 @@ def _download_with_progress_metadata(url: str, output_file: Path, progress: Prog

     # Check if output file already exists
     if output_file.exists():
-        # Read existing parquet file
-        existing_df = pl.read_parquet(output_file)
-
-        # Download new data to temporary file
         with tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") as temp_file:
             temp_path = Path(temp_file.name)
+            with open(temp_path, "wb") as file:
+                for chunk in response.iter_content(chunk_size=8192):
+                    if chunk:
+                        file.write(chunk)
+                        downloaded_size += len(chunk)
+                        if total_size > 0:
+                            progress.update(task_id, completed=downloaded_size)
+        _process_single_metadata_file(temp_path)

-
-
-
-
-
-
-
-
-                progress.update(task_id, completed=downloaded_size)
-
-        # Read new data
-        new_df = pl.read_parquet(temp_path)
-
-        # Concatenate existing and new data, removing duplicates
-        combined_df = pl.concat([existing_df, new_df]).unique()
-
-        # Write combined data back to original file
-        combined_df.write_parquet(output_file)
+        existing_file = pl.scan_parquet(output_file)
+        new_file = pl.scan_parquet(temp_path)
+        combined_file = pl.concat([existing_file, new_file])
+        # Remove duplicate rows based on bldg_id column
+        deduplicated_file = combined_file.collect().unique(subset=["bldg_id"], keep="first")
+        deduplicated_file.write_parquet(output_file)
+        gc.collect()
+        os.remove(temp_path)

-        finally:
-            # Clean up temp file
-            if temp_path.exists():
-                temp_path.unlink()
     else:
         # File doesn't exist, download normally
         with open(str(output_file), "wb") as file:
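For reference, the append path above boils down to a concat-then-dedupe on the bldg_id key. A minimal standalone sketch of that pattern (the function name and file paths are illustrative, not part of the package):

import polars as pl

def append_parquet_dedup(existing_path: str, new_path: str, key: str = "bldg_id") -> None:
    # Lazily scan both files so nothing is materialized until the final collect.
    existing = pl.scan_parquet(existing_path)
    new = pl.scan_parquet(new_path)
    # Concatenate and keep the first row seen for each key value.
    combined = pl.concat([existing, new]).collect().unique(subset=[key], keep="first")
    combined.write_parquet(existing_path)

# Hypothetical usage:
# append_parquet_dedup("metadata.parquet", "metadata_new.parquet")

The collect() before unique() does load both frames into memory, which is presumably why the new code follows the write with gc.collect() and removes the temporary file immediately.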
@@ -686,9 +703,47 @@ def _download_with_progress_metadata(url: str, output_file: Path, progress: Prog
                 if total_size > 0:
                     progress.update(task_id, completed=downloaded_size)

+    _process_single_metadata_file(output_file)
+
     return downloaded_size


+def _process_single_metadata_file(metadata_file: Path) -> None:
+    """Process a single metadata file to keep only columns containing specified keywords."""
+    # First, get column names without loading data into memory
+    schema = pl.scan_parquet(metadata_file).collect_schema()
+
+    # Filter columns to only keep those containing "bldg_id", "upgrade", "metadata_index", or "out."
+    # and remove columns that start with "in."
+    columns_to_keep = []
+    for col in schema:
+        if any(keyword in col for keyword in ["bldg_id", "upgrade", "metadata_index"]) or col.startswith("in."):
+            columns_to_keep.append(col)
+
+    # Use streaming operations to avoid loading entire file into memory
+    # Create a temporary file to write the filtered data
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") as temp_file:
+        temp_file_path = temp_file.name
+
+    try:
+        # Stream the data: select columns and write in one operation
+        filtered_metadata_file = pl.scan_parquet(metadata_file).select(columns_to_keep).collect()
+        filtered_metadata_file.write_parquet(temp_file_path)
+
+        # Replace the original file with the filtered one
+        os.replace(temp_file_path, metadata_file)
+
+        # Force garbage collection to free memory immediately
+        gc.collect()
+
+    except Exception:
+        # Clean up temp file if something goes wrong
+        if os.path.exists(temp_file_path):
+            os.remove(temp_file_path)
+        raise
+    return
+
+
 def _get_time_step_grouping_key(aggregate_time_step: str) -> tuple[str, str]:
     """Get the grouping key and format string for a given time step.

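The column pruning in _process_single_metadata_file works by reading only the parquet schema before deciding which columns to load. A standalone sketch of that technique, with hypothetical keyword and prefix arguments rather than the package's hard-coded lists:

import polars as pl

def keep_columns(parquet_path: str, prefixes: tuple[str, ...], keywords: tuple[str, ...]) -> pl.DataFrame:
    # collect_schema() inspects only the parquet metadata, not the row data.
    schema = pl.scan_parquet(parquet_path).collect_schema()
    selected = [
        col
        for col in schema
        if any(k in col for k in keywords) or col.startswith(prefixes)
    ]
    # Only the selected columns are read when the lazy query is collected.
    return pl.scan_parquet(parquet_path).select(selected).collect()

# Hypothetical usage mirroring the metadata filter above:
# df = keep_columns("metadata.parquet", ("in.",), ("bldg_id", "upgrade", "metadata_index"))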
@@ -749,10 +804,7 @@ def _create_aggregation_expressions(load_curve: pl.DataFrame, column_aggregation
 def _aggregate_load_curve_aggregate(
     load_curve: pl.DataFrame, aggregate_time_step: str, release_year: str
 ) -> pl.DataFrame:
-    """Aggregate the 15-minute load curve to specified time step based on aggregation rules.
-
-    Removes the last row to ensure complete aggregation periods.
-    """
+    """Aggregate the 15-minute load curve to specified time step based on aggregation rules."""
     # Read the aggregation rules from CSV
     if release_year == "2024":
         load_curve_map = LOAD_CURVE_COLUMN_AGGREGATION.joinpath("2024_resstock_load_curve_columns.csv")
@@ -833,6 +885,7 @@ def _download_and_process_aggregate(
     # Process with Polars
     load_curve_15min = pl.read_parquet(temp_path)
     load_curve_aggregate = _aggregate_load_curve_aggregate(load_curve_15min, aggregate_time_step, release_year)
+    _add_time_aggregation_columns(load_curve_aggregate, aggregate_time_step)

     # Save processed file to final destination
     load_curve_aggregate.write_parquet(output_file)
@@ -1013,6 +1066,39 @@ def download_15min_load_curve_with_progress(
     return output_file


+def _add_time_aggregation_columns(load_curve_aggregate: pl.DataFrame, aggregate_time_step: str) -> None:
+    """Add time-based columns to the dataframe based on aggregation type.
+
+    Args:
+        df: Polars DataFrame with a 'timestamp' column
+        aggregate_time_step: Type of aggregation ('hourly', 'daily', 'monthly')
+    """
+    if aggregate_time_step == "hourly":
+        # Add year, month, day, and hour columns
+        new_df = load_curve_aggregate.with_columns([
+            pl.col("timestamp").dt.year().alias("year"),
+            pl.col("timestamp").dt.month().alias("month"),
+            pl.col("timestamp").dt.day().alias("day"),
+            pl.col("timestamp").dt.hour().alias("hour"),
+        ])
+        load_curve_aggregate.__dict__.update(new_df.__dict__)
+    elif aggregate_time_step == "daily":
+        # Add year, month, and day columns
+        new_df = load_curve_aggregate.with_columns([
+            pl.col("timestamp").dt.year().alias("year"),
+            pl.col("timestamp").dt.month().alias("month"),
+            pl.col("timestamp").dt.day().alias("day"),
+        ])
+        load_curve_aggregate.__dict__.update(new_df.__dict__)
+    elif aggregate_time_step == "monthly":
+        # Add year and month columns
+        new_df = load_curve_aggregate.with_columns([
+            pl.col("timestamp").dt.year().alias("year"),
+            pl.col("timestamp").dt.month().alias("month"),
+        ])
+        load_curve_aggregate.__dict__.update(new_df.__dict__)
+
+
 def download_aggregate_time_step_load_curve_with_progress(
     bldg_id: BuildingID,
     output_dir: Path,
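_add_time_aggregation_columns derives calendar columns from the timestamp column via polars' dt namespace and then copies the result back into the caller's frame through __dict__.update. A minimal sketch of the same derivation that simply returns the new frame instead of mutating in place (the tiny input frame is made up for illustration):

import polars as pl
from datetime import datetime

def with_time_parts(df: pl.DataFrame, step: str) -> pl.DataFrame:
    # Columns to derive for each aggregation step.
    parts = {"hourly": ["year", "month", "day", "hour"],
             "daily": ["year", "month", "day"],
             "monthly": ["year", "month"]}[step]
    exprs = [getattr(pl.col("timestamp").dt, p)().alias(p) for p in parts]
    return df.with_columns(exprs)

df = pl.DataFrame({"timestamp": [datetime(2018, 1, 1, 0, 15), datetime(2018, 1, 1, 1, 0)],
                   "kwh": [1.2, 0.8]})
print(with_time_parts(df, "hourly"))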
@@ -1067,6 +1153,7 @@ def download_aggregate_time_step_load_curve_with_progress(
     load_curve_aggregate = _aggregate_load_curve_aggregate(
         load_curve_15min, aggregate_time_step, bldg_id.release_year
     )
+    _add_time_aggregation_columns(load_curve_aggregate, aggregate_time_step)

     # Save processed file to final destination
     load_curve_aggregate.write_parquet(output_file)
@@ -1080,28 +1167,32 @@ def download_aggregate_time_step_load_curve_with_progress(
 def _parse_requested_file_type(file_type: tuple[str, ...]) -> RequestedFileTypes:
     """Parse the file type string into a RequestedFileTypes object."""
     file_type_obj = RequestedFileTypes()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    # Map file type strings to their corresponding attributes
+    type_mapping = {
+        "hpxml": "hpxml",
+        "schedule": "schedule",
+        "metadata": "metadata",
+        "load_curve_15min": "load_curve_15min",
+        "load_curve_hourly": "load_curve_hourly",
+        "load_curve_daily": "load_curve_daily",
+        "load_curve_monthly": "load_curve_monthly",
+        "load_curve_annual": "load_curve_annual",
+        "trip_schedules": "trip_schedules",
+        "weather": "weather",
+    }
+
+    # Set attributes based on what's in the file_type tuple
+    for type_str, attr_name in type_mapping.items():
+        if type_str in file_type:
+            setattr(file_type_obj, attr_name, True)
+
     return file_type_obj


-def
+def _filter_metadata_requested_bldg_ids(
+    bldg_ids: list[BuildingID], output_dir: Path, downloaded_paths: list[Path]
+) -> None:
     """Process the results of a completed metadata download."""
     metadata_to_bldg_id_mapping: dict[Path, list[int]] = {}
     for bldg_id in bldg_ids:
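As a usage illustration, the rewritten _parse_requested_file_type simply switches on one boolean flag per requested name. The same effect can be sketched generically by driving the mapping off the dataclass fields themselves; Flags below is a stand-in for RequestedFileTypes and is not part of the package:

from dataclasses import dataclass, fields

@dataclass
class Flags:
    metadata: bool = False
    weather: bool = False
    load_curve_daily: bool = False

def parse_flags(requested: tuple[str, ...]) -> Flags:
    flags = Flags()
    # Any requested name that matches a dataclass field is switched on.
    for f in fields(Flags):
        if f.name in requested:
            setattr(flags, f.name, True)
    return flags

print(parse_flags(("metadata", "load_curve_daily")))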
@@ -1120,14 +1211,69 @@ def _process_metadata_results(bldg_ids: list[BuildingID], output_dir: Path, down
             metadata_to_bldg_id_mapping[output_file] = [bldg_id.bldg_id]

     for metadata_file, bldg_id_list in metadata_to_bldg_id_mapping.items():
-        # Use
-
-
+        # Use streaming operations to avoid loading entire file into memory
+        # Stream the data: filter rows, select columns, and write in one operation
+        filtered_metadata_file = pl.scan_parquet(metadata_file).filter(pl.col("bldg_id").is_in(bldg_id_list)).collect()
+
+        # Replace the original file with the filtered one
+        filtered_metadata_file.write_parquet(metadata_file)
+
+        # Force garbage collection to free memory immediately
+        gc.collect()

     return


+def _process_annual_load_curve_file(file_path: Path) -> None:
+    """Process an annual load curve file to keep only columns containing specified keywords.
+
+    Args:
+        file_path: Path to the annual load curve parquet file to process.
+    """
+    # First, get column names without loading data into memory
+    schema = pl.scan_parquet(file_path).collect_schema()
+
+    # Filter columns to only keep those containing "bldg_id", "upgrade", "metadata_index", or "out."
+    # and remove columns that start with "in."
+    columns_to_keep = []
+    for col in schema:
+        if (
+            any(keyword in col for keyword in ["bldg_id", "upgrade", "metadata_index"]) or col.startswith("out.")
+        ) and not col.startswith("in."):
+            columns_to_keep.append(col)
+
+    # Use streaming operations to avoid loading entire file into memory
+    # Create a temporary file to write the filtered data
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") as temp_file:
+        temp_file_path = temp_file.name
+
+    # Stream the data: select columns and write in one operation
+    filtered_file = pl.scan_parquet(file_path).select(columns_to_keep).collect()
+    filtered_file.write_parquet(temp_file_path)
+
+    # Replace the original file with the filtered one
+    os.replace(temp_file_path, file_path)
+
+    # Force garbage collection to free memory immediately
+    gc.collect()
+
+
+def _process_annual_load_curve_results(downloaded_paths: list[Path]) -> None:
+    """Process all downloaded annual load curve files to filter columns.
+
+    Args:
+        downloaded_paths: List of all downloaded file paths.
+    """
+    # Filter for annual load curve files
+    annual_load_curve_files = [
+        path for path in downloaded_paths if "load_curve_annual" in str(path) and path.suffix == ".parquet"
+    ]
+
+    # Process each annual load curve file
+    for file_path in annual_load_curve_files:
+        _process_annual_load_curve_file(file_path)
+
+
 def _process_download_results(
     future: concurrent.futures.Future,
     bldg_id: BuildingID,
@@ -1184,14 +1330,9 @@ def _download_metadata_with_progress(
         if download_url in metadata_urls:
             metadata_urls.remove(download_url)
             metadata_task = progress.add_task(
-                f"[yellow]Downloading metadata: {
+                f"[yellow]Downloading metadata: {bldg_id.get_release_name()} - (upgrade {bldg_id.upgrade_id}) - {bldg_id.state}",
                 total=0,  # Will be updated when we get the file size
             )
-            # Get file size first
-            response = requests.head(download_url, timeout=30)
-            response.raise_for_status()
-            total_size = int(response.headers.get("content-length", 0))
-            progress.update(metadata_task, total=total_size)

             output_file.parent.mkdir(parents=True, exist_ok=True)
             try:
@@ -1272,62 +1413,64 @@ def _download_15min_load_curves_parallel(
     console: Console,
 ) -> None:
     """Download 15-minute load curves in parallel with progress tracking."""
-    # Create progress tasks for 15-minute load curve downloads
-    load_curve_tasks = {}
-    for i, bldg_id in enumerate(bldg_ids):
-        task_id = progress.add_task(
-            f"[magenta]Load curve {bldg_id.bldg_id} (upgrade {bldg_id.upgrade_id})",
-            total=0,  # Will be updated when we get the file size
-        )
-        load_curve_tasks[i] = task_id

-    # Create
+    # Create progress tasks based on dataset size
+    if len(bldg_ids) > 500:
+        load_curve_tasks = _create_batch_progress_tasks_15min(bldg_ids, progress, console)
+    else:
+        load_curve_tasks = _create_individual_progress_tasks_15min(bldg_ids, progress)
+
+    # Create download functions
     def download_15min_with_task_id(bldg_id: BuildingID, output_dir: Path, task_id: TaskID) -> Path:
         return download_15min_load_curve_with_progress(bldg_id, output_dir, progress, task_id)

     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-
-
-
-
+        if len(bldg_ids) > 500:
+            # Process in batches for large datasets
+            num_batches = 20
+            batch_size = ((len(bldg_ids) + num_batches - 1) // num_batches + 99) // 100 * 100
+            future_to_bldg = {}

+            for batch_idx in range(0, len(bldg_ids), batch_size):
+                batch = bldg_ids[batch_idx : batch_idx + batch_size]
+                # Skip empty batches
+                if not batch:
+                    break
+
+                task_id = load_curve_tasks[batch_idx // batch_size]
+
+                for bldg_id in batch:
+                    future = executor.submit(
+                        _download_15min_with_batch_progress,
+                        bldg_id,
+                        output_dir,
+                        task_id,
+                        progress,
+                    )
+                    future_to_bldg[future] = bldg_id
+        else:
+            # Original behavior for smaller datasets
+            future_to_bldg = {
+                executor.submit(download_15min_with_task_id, bldg_id, output_dir, load_curve_tasks[i]): bldg_id
+                for i, bldg_id in enumerate(bldg_ids)
+            }
+
+        # Process completed futures
         for future in concurrent.futures.as_completed(future_to_bldg):
             bldg_id = future_to_bldg[future]
-
-            output_file = future.result()
-            downloaded_paths.append(output_file)
-        except No15minLoadCurveError:
-            output_file = (
-                output_dir
-                / bldg_id.get_release_name()
-                / "load_curve_15min"
-                / f"state={bldg_id.state}"
-                / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
-                / f"bldg{str(bldg_id.bldg_id).zfill(7)}_load_curve_15min.parquet"
-            )
-            failed_downloads.append(str(output_file))
-            console.print(f"[red]15 min load curve not available for {bldg_id.get_release_name()}[/red]")
-            raise
-        except Exception as e:
-            output_file = (
-                output_dir
-                / bldg_id.get_release_name()
-                / "load_curve_15min"
-                / f"state={bldg_id.state}"
-                / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
-                / f"bldg{str(bldg_id.bldg_id).zfill(7)}_load_curve_15min.parquet"
-            )
-            failed_downloads.append(str(output_file))
-            console.print(f"[red]Download failed for 15 min load curve {bldg_id.bldg_id}: {e}[/red]")
+            _process_download_future_15min(future, bldg_id, output_dir, downloaded_paths, failed_downloads, console)


 def _create_batch_progress_tasks(
     bldg_ids: list[BuildingID], aggregate_time_step: str, progress: Progress, console: Console
 ) -> dict[int, TaskID]:
     """Create progress tasks for batch processing."""
-
-
-
+    num_batches = 20
+    # Calculate batch size rounded up to nearest 100
+    batch_size = ((len(bldg_ids) + num_batches - 1) // num_batches + 99) // 100 * 100
+    console.print(
+        f"[blue]Using batch processing: {len(bldg_ids)} buildings split into {num_batches} batches of up to {batch_size} buildings each[/blue]"
+    )

     load_curve_tasks = {}
     for i in range(num_batches):
@@ -1336,6 +1479,10 @@ def _create_batch_progress_tasks(
         end_idx = min(start_idx + batch_size, len(bldg_ids))
         batch_count = end_idx - start_idx

+        # Skip empty or negative batches
+        if batch_count <= 0:
+            break
+
         console.print(f"[blue]Batch {i + 1}/{num_batches}: {batch_count} buildings[/blue]")

         task_id = progress.add_task(
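The batch sizing used here and in the 15-minute path ceiling-divides the building list into 20 batches and then rounds the batch size up to the nearest 100, so with fewer than about 2,000 buildings some of the 20 slots come out empty, which is what the batch_count <= 0 guard handles. A worked check of the arithmetic:

def batch_size_for(n_bldgs: int, num_batches: int = 20) -> int:
    # Ceiling-divide into num_batches, then round up to the nearest 100,
    # matching the expression used in the diff above.
    return ((n_bldgs + num_batches - 1) // num_batches + 99) // 100 * 100

print(batch_size_for(1234))   # 100 -> only 13 non-empty batches of up to 100 buildings
print(batch_size_for(12000))  # 600 -> all 20 batches used, 600 buildings each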
@@ -1347,6 +1494,39 @@ def _create_batch_progress_tasks(
     return load_curve_tasks


+def _create_batch_progress_tasks_15min(
+    bldg_ids: list[BuildingID], progress: Progress, console: Console
+) -> dict[int, TaskID]:
+    """Create progress tasks for 15-minute load curve batch processing."""
+    num_batches = 20
+    # Calculate batch size rounded up to nearest 100
+    batch_size = ((len(bldg_ids) + num_batches - 1) // num_batches + 99) // 100 * 100
+    console.print(
+        f"[blue]Using batch processing: {len(bldg_ids)} buildings split into {num_batches} batches of up to {batch_size} buildings each[/blue]"
+    )
+
+    load_curve_tasks = {}
+    for i in range(num_batches):
+        # Calculate how many buildings are in this batch
+        start_idx = i * batch_size
+        end_idx = min(start_idx + batch_size, len(bldg_ids))
+        batch_count = end_idx - start_idx
+
+        # Skip empty or negative batches
+        if batch_count <= 0:
+            break
+
+        console.print(f"[blue]Batch {i + 1}/{num_batches}: {batch_count} buildings[/blue]")
+
+        task_id = progress.add_task(
+            f"[magenta]Batch {i + 1}/{num_batches} (15min)",
+            total=batch_count,  # Set total to the number of buildings in this batch
+        )
+        load_curve_tasks[i] = task_id
+
+    return load_curve_tasks
+
+
 def _create_individual_progress_tasks(bldg_ids: list[BuildingID], progress: Progress) -> dict[int, TaskID]:
     """Create progress tasks for individual building processing."""
     load_curve_tasks = {}
@@ -1359,6 +1539,18 @@ def _create_individual_progress_tasks(bldg_ids: list[BuildingID], progress: Prog
     return load_curve_tasks


+def _create_individual_progress_tasks_15min(bldg_ids: list[BuildingID], progress: Progress) -> dict[int, TaskID]:
+    """Create progress tasks for individual 15-minute load curve processing."""
+    load_curve_tasks = {}
+    for i, bldg_id in enumerate(bldg_ids):
+        task_id = progress.add_task(
+            f"[magenta]Load curve {bldg_id.bldg_id} (upgrade {bldg_id.upgrade_id})",
+            total=0,  # Will be updated when we get the file size
+        )
+        load_curve_tasks[i] = task_id
+    return load_curve_tasks
+
+
 def _download_aggregate_with_batch_progress(
     bldg_id: BuildingID, output_dir: Path, task_id: TaskID, aggregate_time_step: str, progress: Progress
 ) -> Path:
@@ -1370,6 +1562,17 @@ def _download_aggregate_with_batch_progress(
     return result


+def _download_15min_with_batch_progress(
+    bldg_id: BuildingID, output_dir: Path, task_id: TaskID, progress: Progress
+) -> Path:
+    """Download 15-minute load curve with batch progress tracking."""
+    # Download the file without individual progress tracking
+    result = download_15min_load_curve_with_progress(bldg_id, output_dir, None, None)
+    # Update batch progress by 1
+    progress.update(task_id, advance=1)
+    return result
+
+
 def _process_download_future(
     future: concurrent.futures.Future,
     bldg_id: BuildingID,
@@ -1406,6 +1609,43 @@ def _process_download_future(
         console.print(f"[red]Download failed for monthly load curve {bldg_id.bldg_id}: {e}[/red]")


+def _process_download_future_15min(
+    future: concurrent.futures.Future,
+    bldg_id: BuildingID,
+    output_dir: Path,
+    downloaded_paths: list[Path],
+    failed_downloads: list[str],
+    console: Console,
+) -> None:
+    """Process a completed 15-minute download future."""
+    try:
+        output_file = future.result()
+        downloaded_paths.append(output_file)
+    except No15minLoadCurveError:
+        output_file = (
+            output_dir
+            / bldg_id.get_release_name()
+            / "load_curve_15min"
+            / f"state={bldg_id.state}"
+            / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
+            / f"bldg{str(bldg_id.bldg_id).zfill(7)}_load_curve_15min.parquet"
+        )
+        failed_downloads.append(str(output_file))
+        console.print(f"[red]15 min load curve not available for {bldg_id.get_release_name()}[/red]")
+        raise
+    except Exception as e:
+        output_file = (
+            output_dir
+            / bldg_id.get_release_name()
+            / "load_curve_15min"
+            / f"state={bldg_id.state}"
+            / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
+            / f"bldg{str(bldg_id.bldg_id).zfill(7)}_load_curve_15min.parquet"
+        )
+        failed_downloads.append(str(output_file))
+        console.print(f"[red]Download failed for 15 min load curve {bldg_id.bldg_id}: {e}[/red]")
+
+
 def _download_aggregate_load_curves_parallel(
     bldg_ids: list[BuildingID],
     output_dir: Path,
@@ -1416,7 +1656,7 @@ def _download_aggregate_load_curves_parallel(
     failed_downloads: list[str],
     console: Console,
 ) -> None:
-    """Download
+    """Download aggregate load curves in parallel with progress tracking."""

     # Create progress tasks based on dataset size
     if len(bldg_ids) > 500:
@@ -1435,11 +1675,16 @@ def _download_aggregate_load_curves_parallel(
     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
         if len(bldg_ids) > 500:
             # Process in batches for large datasets
-
+            num_batches = 20
+            batch_size = ((len(bldg_ids) + num_batches - 1) // num_batches + 99) // 100 * 100
             future_to_bldg = {}

             for batch_idx in range(0, len(bldg_ids), batch_size):
                 batch = bldg_ids[batch_idx : batch_idx + batch_size]
+                # Skip empty batches
+                if not batch:
+                    break
+
                 task_id = load_curve_tasks[batch_idx // batch_size]

                 for bldg_id in batch:
@@ -1481,7 +1726,8 @@ def _download_metadata(
     if not bldg_ids:
         return
     _download_metadata_with_progress(bldg_ids, output_dir, progress, downloaded_paths, failed_downloads, console)
-
+    # Only keep the requested bldg_ids in the metadata file
+    _filter_metadata_requested_bldg_ids(bldg_ids, output_dir, downloaded_paths)


 def download_annual_load_curve_with_progress(
@@ -1594,6 +1840,132 @@ def _download_annual_load_curves_parallel(
             console.print(f"[red]Download failed for annual load curve {bldg_id.bldg_id}: {e}[/red]")


+def _get_parquet_files_for_state(s3_client: Any, bucket: str, s3_prefix: str) -> list[str]:
+    """Get list of parquet files for a given S3 prefix."""
+    paginator = s3_client.get_paginator("list_objects_v2")
+    parquet_files = []
+    for page in paginator.paginate(Bucket=bucket, Prefix=s3_prefix):
+        for obj in page.get("Contents", []):
+            if obj["Key"].endswith(".parquet"):
+                parquet_files.append(obj["Key"])
+    return parquet_files
+
+
+def _download_and_read_parquet_files(
+    s3_client: Any, bucket: str, parquet_files: list[str], output_dir: Path
+) -> list[Any]:
+    """Download and read parquet files, returning a list of dataframes."""
+    # Ensure output directory exists
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    state_dataframes = []
+    for s3_key in parquet_files:
+        temp_file = output_dir / f"temp_{s3_key.split('/')[-1]}"
+        s3_client.download_file(bucket, s3_key, str(temp_file))
+        df = pl.read_parquet(str(temp_file))
+        state_dataframes.append(df)
+        temp_file.unlink()
+    return state_dataframes
+
+
+def _process_state_data(
+    s3_client: Any, bucket: str, prefix: str, release: str, state: str, output_dir: Path
+) -> tuple[list[Any], bool]:
+    """Process data for a single state, returning (dataframes, has_data)."""
+    s3_prefix = f"{prefix}release={release}/state={state}/"
+    parquet_files = _get_parquet_files_for_state(s3_client, bucket, s3_prefix)
+
+    if not parquet_files:
+        return [], False
+
+    state_dataframes = _download_and_read_parquet_files(s3_client, bucket, parquet_files, output_dir)
+    if state_dataframes:
+        state_combined_df = pl.concat(state_dataframes)
+        return [state_combined_df], True
+    return [], False
+
+
+def _save_filtered_state_data(
+    state_df: Any, state: str, bldg_ids: list[BuildingID], release: str, output_dir: Path, downloaded_paths: list[Path]
+) -> None:
+    """Save filtered data for a specific state."""
+    bldg_id_list = [str(bldg.bldg_id) for bldg in bldg_ids if bldg.state == state]
+    if not bldg_id_list:
+        return
+
+    filtered_df = state_df.filter(pl.col("bldg_id").is_in(bldg_id_list))
+    if filtered_df.height == 0:
+        return
+
+    output_file = output_dir / release / "trip_schedules" / f"state={state}" / "trip_schedules.parquet"
+    output_file.parent.mkdir(parents=True, exist_ok=True)
+    filtered_df.write_parquet(str(output_file))
+    downloaded_paths.append(output_file)
+
+
+def _download_trip_schedules_data(
+    bldg_ids: list[BuildingID],
+    output_dir: Path,
+    downloaded_paths: list[Path],
+    bucket: str = "buildstock-fetch",
+    prefix: str = "ev_demand/trip_schedules/",
+) -> None:
+    """
+    Download and filter trip schedules data for specific building IDs.
+
+    Args:
+        bldg_ids: List of BuildingID objects to filter for.
+        output_dir: Directory to save the downloaded files.
+        downloaded_paths: List to append successful download paths to.
+        bucket: Name of the S3 bucket.
+        prefix: S3 prefix for the trip schedules data.
+
+    Raises:
+        NoBuildingDataError: If no buildings from bldg_ids are found in any available state data.
+    """
+    import warnings
+
+    release = bldg_ids[0].get_release_name()
+    states_list = list({bldg.state for bldg in bldg_ids})
+
+    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
+
+    all_dataframes = []
+    available_states = []
+    unavailable_states = []
+
+    # Process each state
+    for state in states_list:
+        state_dataframes, has_data = _process_state_data(s3, bucket, prefix, release, state, output_dir)
+
+        if has_data:
+            available_states.append(state)
+            all_dataframes.extend(state_dataframes)
+        else:
+            unavailable_states.append(state)
+
+    # Issue warnings for unavailable states
+    if unavailable_states:
+        warnings.warn(
+            f"No trip schedules data found for {release} in states: {', '.join(unavailable_states)}. "
+            f"Continuing with available states: {', '.join(available_states)}.",
+            stacklevel=2,
+        )
+
+    if not all_dataframes:
+        msg = f"No trip schedules data found for {release} in any of the requested states: {', '.join(states_list)}"
+        raise NoBuildingDataError(msg)
+
+    # Save filtered data for each available state separately
+    for i, state_df in enumerate(all_dataframes):
+        state = available_states[i]
+        _save_filtered_state_data(state_df, state, bldg_ids, release, output_dir, downloaded_paths)
+
+    if not any(bldg.state in available_states for bldg in bldg_ids):
+        msg = f"No trip schedules data found for buildings {[bldg.bldg_id for bldg in bldg_ids]} in {release} for any available state"
+        raise NoBuildingDataError(msg)
+
+
 def _download_weather_files_parallel(
     bldg_ids: list[BuildingID],
     output_dir: Path,
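The trip-schedules helpers talk to a public S3 bucket with an unsigned botocore client and paginate over list_objects_v2. A minimal standalone sketch of that listing step, using the default bucket and prefix shown in the signature above:

import boto3
from botocore import UNSIGNED
from botocore.config import Config

def list_parquet_keys(bucket: str, prefix: str) -> list[str]:
    # UNSIGNED skips credential lookup, so the public bucket can be read anonymously.
    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
    keys = []
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            if obj["Key"].endswith(".parquet"):
                keys.append(obj["Key"])
    return keys

# Example with the defaults from _download_trip_schedules_data:
# print(list_parquet_keys("buildstock-fetch", "ev_demand/trip_schedules/"))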
@@ -1731,6 +2103,8 @@ def fetch_bldg_data(
         total_files += len(bldg_ids)  # Add 15-minute load curve files
     if file_type_obj.load_curve_hourly:
         total_files += len(bldg_ids)  # Add hourly load curve files
+    if file_type_obj.load_curve_daily:
+        total_files += len(bldg_ids)  # Add daily load curve files
     if file_type_obj.load_curve_monthly:
         total_files += len(bldg_ids)  # Add monthly load curve files
     if file_type_obj.load_curve_annual:
@@ -1767,6 +2141,13 @@ def fetch_bldg_data(
             weather_states,
         )

+    # TODO: add EV related files
+    # TODO: Write a function for downloading EV related files from SB's s3 bucket.
+    # It should dynamically build the download url based on the release_name + state combo.
+    # Make sure to follow the directory structure for downloading the files.
+    if file_type_obj.trip_schedules:
+        _download_trip_schedules_data(bldg_ids, output_dir, downloaded_paths)
+
     _print_download_summary(downloaded_paths, failed_downloads, console)

     return downloaded_paths, failed_downloads
@@ -1817,6 +2198,19 @@ def _execute_downloads(
             console,
         )

+    if file_type_obj.load_curve_daily:
+        aggregate_time_step = "daily"
+        _download_aggregate_load_curves_parallel(
+            bldg_ids,
+            output_dir,
+            aggregate_time_step,
+            max_workers,
+            progress,
+            downloaded_paths,
+            failed_downloads,
+            console,
+        )
+
     if file_type_obj.load_curve_monthly:
         aggregate_time_step = "monthly"
         _download_aggregate_load_curves_parallel(
@@ -1835,6 +2229,8 @@ def _execute_downloads(
         _download_annual_load_curves_parallel(
             bldg_ids, output_dir, max_workers, progress, downloaded_paths, failed_downloads, console
         )
+        # Process annual load curve files to filter columns
+        _process_annual_load_curve_results(downloaded_paths)

     # Get weather files if requested.
     if file_type_obj.weather:
@@ -1846,12 +2242,78 @@ def _execute_downloads(
 if __name__ == "__main__":  # pragma: no cover
     bldg_ids = [
         BuildingID(
-            bldg_id=
+            bldg_id=19713,
+            release_year="2024",
+            res_com="comstock",
+            weather="amy2018",
+            upgrade_id="0",
+            release_number="2",
+            state="NY",
+        ),
+        BuildingID(
+            bldg_id=658,
+            release_year="2024",
+            res_com="comstock",
+            weather="amy2018",
+            upgrade_id="0",
+            release_number="2",
+            state="NY",
+        ),
+        BuildingID(
+            bldg_id=659,
+            release_year="2024",
+            res_com="comstock",
+            weather="amy2018",
+            upgrade_id="0",
+            release_number="2",
+            state="NY",
         ),
     ]
-    file_type = ("
+    file_type = ("metadata",)
     output_dir = Path("data")
-
-    downloaded_paths, failed_downloads = fetch_bldg_data(bldg_ids, file_type, output_dir
+
+    downloaded_paths, failed_downloads = fetch_bldg_data(bldg_ids, file_type, output_dir)
     print(downloaded_paths)
     print(failed_downloads)
+    bldg_ids = [
+        BuildingID(
+            bldg_id=21023,
+            release_year="2024",
+            res_com="comstock",
+            weather="amy2018",
+            upgrade_id="0",
+            release_number="2",
+            state="NY",
+        ),
+        BuildingID(
+            bldg_id=18403,
+            release_year="2024",
+            res_com="comstock",
+            weather="amy2018",
+            upgrade_id="0",
+            release_number="2",
+            state="NY",
+        ),
+        BuildingID(
+            bldg_id=70769,
+            release_year="2024",
+            res_com="comstock",
+            weather="amy2018",
+            upgrade_id="0",
+            release_number="2",
+            state="NV",
+        ),
+        BuildingID(
+            bldg_id=68227,
+            release_year="2024",
+            res_com="comstock",
+            weather="amy2018",
+            upgrade_id="0",
+            release_number="2",
+            state="NV",
+        ),
+    ]
+    file_type = ("metadata",)
+    output_dir = Path("data")
+
+    downloaded_paths, failed_downloads = fetch_bldg_data(bldg_ids, file_type, output_dir)