buildstock-fetch 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff reflects the changes between publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.

This version of buildstock-fetch has been flagged as potentially problematic.

buildstock_fetch/main.py CHANGED
@@ -1,15 +1,20 @@
1
1
  import concurrent.futures
2
+ import gc
2
3
  import json
4
+ import os
3
5
  import tempfile
4
6
  import zipfile
5
7
  from dataclasses import asdict, dataclass
6
8
  from datetime import timedelta
7
9
  from importlib.resources import files
8
10
  from pathlib import Path
9
- from typing import Optional, Union
11
+ from typing import Any, Optional, Union
10
12
 
13
+ import boto3
11
14
  import polars as pl
12
15
  import requests
16
+ from botocore import UNSIGNED
17
+ from botocore.config import Config
13
18
  from rich.console import Console
14
19
  from rich.progress import (
15
20
  BarColumn,
@@ -23,6 +28,8 @@ from rich.progress import (
23
28
  TransferSpeedColumn,
24
29
  )
25
30
 
31
+ # from buildstock_fetch.main_cli import _get_all_available_releases
32
+
26
33
 
27
34
  class InvalidProductError(ValueError):
28
35
  """Raised when an invalid product is provided."""
@@ -96,6 +103,7 @@ class RequestedFileTypes:
96
103
  load_curve_daily: bool = False
97
104
  load_curve_monthly: bool = False
98
105
  load_curve_annual: bool = False
106
+ trip_schedules: bool = False
99
107
  weather: bool = False
100
108
 
101
109
 
@@ -193,10 +201,14 @@ class BuildingID:
193
201
  return f"{self.base_url}metadata/upgrade{str(int(self.upgrade_id)).zfill(2)}.parquet"
194
202
  elif self.release_year == "2024":
195
203
  if self.res_com == "comstock" and self.weather == "amy2018" and self.release_number == "2":
196
- return ""
197
- # This release does not have a single national metadata file.
198
- # Instead, it has a metadata file for each county.
199
- # We need a way to download them all and combine based on the state
204
+ if self.upgrade_id == "0":
205
+ upgrade_filename = "baseline"
206
+ else:
207
+ upgrade_filename = f"upgrade{str(int(self.upgrade_id)).zfill(2)}"
208
+ return (
209
+ f"{self.base_url}metadata_and_annual_results/by_state_and_county/full/parquet/"
210
+ f"state={self.state}/county={self._get_county_name()}/{self.state}_{self._get_county_name()}_{upgrade_filename}.parquet"
211
+ )
200
212
  else:
201
213
  if self.upgrade_id == "0":
202
214
  return f"{self.base_url}metadata/baseline.parquet"
@@ -206,12 +218,12 @@ class BuildingID:
206
218
  self.release_year == "2025"
207
219
  and self.res_com == "comstock"
208
220
  and self.weather == "amy2018"
209
- and self.release_number == "1"
221
+ and (self.release_number == "1" or self.release_number == "2")
210
222
  ):
211
- return ""
212
- # This release does not have a single national metadata file.
213
- # Instead, it has a metadata file for each county.
214
- # We need a way to download them all and combine based on the state
223
+ return (
224
+ f"{self.base_url}metadata_and_annual_results/by_state_and_county/full/parquet/"
225
+ f"state={self.state}/county={self._get_county_name()}/{self.state}_{self._get_county_name()}_upgrade{self.upgrade_id}.parquet"
226
+ )
215
227
  else:
216
228
  return ""
217
229
 
@@ -630,6 +642,21 @@ def _download_with_progress(url: str, output_file: Path, progress: Progress, tas
630
642
  return downloaded_size
631
643
 
632
644
 
645
+ def _extract_metadata_columns_to_keep(metadata_file: Path) -> list[str]:
646
+ """Extract metadata columns from a schema."""
647
+ schema = pl.scan_parquet(metadata_file).collect_schema()
648
+
649
+ columns_to_keep = []
650
+ for col in schema:
651
+ if (
652
+ any(keyword in col for keyword in ["upgrade", "bldg_id", "metadata_index"])
653
+ or col.startswith("in.")
655
+ ):
656
+ columns_to_keep.append(col)
657
+ return columns_to_keep
658
+
659
+
633
660
  def _download_with_progress_metadata(url: str, output_file: Path, progress: Progress, task_id: TaskID) -> int:
634
661
  """Download a metadata file with progress tracking and append to existing file if it exists."""
635
662
  # Get file size first
@@ -646,36 +673,26 @@ def _download_with_progress_metadata(url: str, output_file: Path, progress: Prog
646
673
 
647
674
  # Check if output file already exists
648
675
  if output_file.exists():
649
- # Read existing parquet file
650
- existing_df = pl.read_parquet(output_file)
651
-
652
- # Download new data to temporary file
653
676
  with tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") as temp_file:
654
677
  temp_path = Path(temp_file.name)
678
+ with open(temp_path, "wb") as file:
679
+ for chunk in response.iter_content(chunk_size=8192):
680
+ if chunk:
681
+ file.write(chunk)
682
+ downloaded_size += len(chunk)
683
+ if total_size > 0:
684
+ progress.update(task_id, completed=downloaded_size)
685
+ _process_single_metadata_file(temp_path)
655
686
 
656
- try:
657
- # Download to temp file
658
- with open(temp_path, "wb") as file:
659
- for chunk in response.iter_content(chunk_size=8192):
660
- if chunk:
661
- file.write(chunk)
662
- downloaded_size += len(chunk)
663
- if total_size > 0:
664
- progress.update(task_id, completed=downloaded_size)
665
-
666
- # Read new data
667
- new_df = pl.read_parquet(temp_path)
668
-
669
- # Concatenate existing and new data, removing duplicates
670
- combined_df = pl.concat([existing_df, new_df]).unique()
671
-
672
- # Write combined data back to original file
673
- combined_df.write_parquet(output_file)
687
+ existing_file = pl.scan_parquet(output_file)
688
+ new_file = pl.scan_parquet(temp_path)
689
+ combined_file = pl.concat([existing_file, new_file])
690
+ # Remove duplicate rows based on bldg_id column
691
+ deduplicated_file = combined_file.collect().unique(subset=["bldg_id"], keep="first")
692
+ deduplicated_file.write_parquet(output_file)
693
+ gc.collect()
694
+ os.remove(temp_path)
674
695
 
675
- finally:
676
- # Clean up temp file
677
- if temp_path.exists():
678
- temp_path.unlink()
679
696
  else:
680
697
  # File doesn't exist, download normally
681
698
  with open(str(output_file), "wb") as file:
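
The rewritten metadata append path above no longer reads both parquet files eagerly; it scans them lazily, concatenates, and drops duplicate bldg_id rows before overwriting the output file. A standalone sketch of that concat-and-dedup step, with placeholder paths:

```python
# Concat-and-dedup sketch mirroring the append path above; paths are placeholders.
from pathlib import Path

import polars as pl


def append_parquet_dedup(existing: Path, incoming: Path) -> None:
    """Append rows from `incoming` to `existing`, keeping one row per bldg_id."""
    combined = pl.concat([pl.scan_parquet(existing), pl.scan_parquet(incoming)])
    # Materialize once, keep the first occurrence of each bldg_id, then overwrite in place.
    combined.collect().unique(subset=["bldg_id"], keep="first").write_parquet(existing)
```
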
@@ -686,9 +703,47 @@ def _download_with_progress_metadata(url: str, output_file: Path, progress: Prog
686
703
  if total_size > 0:
687
704
  progress.update(task_id, completed=downloaded_size)
688
705
 
706
+ _process_single_metadata_file(output_file)
707
+
689
708
  return downloaded_size
690
709
 
691
710
 
711
+ def _process_single_metadata_file(metadata_file: Path) -> None:
712
+ """Process a single metadata file to keep only columns containing specified keywords."""
713
+ # First, get column names without loading data into memory
714
+ schema = pl.scan_parquet(metadata_file).collect_schema()
715
+
716
+ # Keep columns containing "bldg_id", "upgrade", or "metadata_index",
717
+ # plus building-characteristic columns that start with "in."
718
+ columns_to_keep = []
719
+ for col in schema:
720
+ if any(keyword in col for keyword in ["bldg_id", "upgrade", "metadata_index"]) or col.startswith("in."):
721
+ columns_to_keep.append(col)
722
+
723
+ # Use streaming operations to avoid loading entire file into memory
724
+ # Create a temporary file to write the filtered data
725
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") as temp_file:
726
+ temp_file_path = temp_file.name
727
+
728
+ try:
729
+ # Stream the data: select columns and write in one operation
730
+ filtered_metadata_file = pl.scan_parquet(metadata_file).select(columns_to_keep).collect()
731
+ filtered_metadata_file.write_parquet(temp_file_path)
732
+
733
+ # Replace the original file with the filtered one
734
+ os.replace(temp_file_path, metadata_file)
735
+
736
+ # Force garbage collection to free memory immediately
737
+ gc.collect()
738
+
739
+ except Exception:
740
+ # Clean up temp file if something goes wrong
741
+ if os.path.exists(temp_file_path):
742
+ os.remove(temp_file_path)
743
+ raise
744
+ return
745
+
746
+
692
747
  def _get_time_step_grouping_key(aggregate_time_step: str) -> tuple[str, str]:
693
748
  """Get the grouping key and format string for a given time step.
694
749
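
_process_single_metadata_file above rewrites a parquet file in place by streaming the selected columns into a temporary file and then swapping it over the original with os.replace. A generic sketch of that write-then-replace pattern; the column filter here is illustrative, not the package's exact rule.

```python
# Write-then-replace sketch of the in-place rewrite pattern above; the column filter is illustrative.
import os
import tempfile
from pathlib import Path

import polars as pl


def rewrite_parquet_with_columns(path: Path, columns: list[str]) -> None:
    """Rewrite `path` keeping only `columns`, swapping the finished file over the original."""
    # Create the temp file next to the target so os.replace stays on one filesystem.
    fd, tmp_path = tempfile.mkstemp(suffix=".parquet", dir=path.parent)
    os.close(fd)  # polars writes to the path itself; we only needed a unique name
    try:
        pl.scan_parquet(path).select(columns).collect().write_parquet(tmp_path)
        os.replace(tmp_path, path)
    except Exception:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
```
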
 
@@ -749,10 +804,7 @@ def _create_aggregation_expressions(load_curve: pl.DataFrame, column_aggregation
749
804
  def _aggregate_load_curve_aggregate(
750
805
  load_curve: pl.DataFrame, aggregate_time_step: str, release_year: str
751
806
  ) -> pl.DataFrame:
752
- """Aggregate the 15-minute load curve to specified time step based on aggregation rules.
753
-
754
- Removes the last row to ensure complete aggregation periods.
755
- """
807
+ """Aggregate the 15-minute load curve to specified time step based on aggregation rules."""
756
808
  # Read the aggregation rules from CSV
757
809
  if release_year == "2024":
758
810
  load_curve_map = LOAD_CURVE_COLUMN_AGGREGATION.joinpath("2024_resstock_load_curve_columns.csv")
@@ -833,6 +885,7 @@ def _download_and_process_aggregate(
833
885
  # Process with Polars
834
886
  load_curve_15min = pl.read_parquet(temp_path)
835
887
  load_curve_aggregate = _aggregate_load_curve_aggregate(load_curve_15min, aggregate_time_step, release_year)
888
+ _add_time_aggregation_columns(load_curve_aggregate, aggregate_time_step)
836
889
 
837
890
  # Save processed file to final destination
838
891
  load_curve_aggregate.write_parquet(output_file)
@@ -1013,6 +1066,39 @@ def download_15min_load_curve_with_progress(
1013
1066
  return output_file
1014
1067
 
1015
1068
 
1069
+ def _add_time_aggregation_columns(load_curve_aggregate: pl.DataFrame, aggregate_time_step: str) -> None:
1070
+ """Add time-based columns to the dataframe based on aggregation type.
1071
+
1072
+ Args:
1073
+ load_curve_aggregate: Polars DataFrame with a 'timestamp' column (modified in place)
1074
+ aggregate_time_step: Type of aggregation ('hourly', 'daily', 'monthly')
1075
+ """
1076
+ if aggregate_time_step == "hourly":
1077
+ # Add year, month, day, and hour columns
1078
+ new_df = load_curve_aggregate.with_columns([
1079
+ pl.col("timestamp").dt.year().alias("year"),
1080
+ pl.col("timestamp").dt.month().alias("month"),
1081
+ pl.col("timestamp").dt.day().alias("day"),
1082
+ pl.col("timestamp").dt.hour().alias("hour"),
1083
+ ])
1084
+ load_curve_aggregate.__dict__.update(new_df.__dict__)
1085
+ elif aggregate_time_step == "daily":
1086
+ # Add year, month, and day columns
1087
+ new_df = load_curve_aggregate.with_columns([
1088
+ pl.col("timestamp").dt.year().alias("year"),
1089
+ pl.col("timestamp").dt.month().alias("month"),
1090
+ pl.col("timestamp").dt.day().alias("day"),
1091
+ ])
1092
+ load_curve_aggregate.__dict__.update(new_df.__dict__)
1093
+ elif aggregate_time_step == "monthly":
1094
+ # Add year and month columns
1095
+ new_df = load_curve_aggregate.with_columns([
1096
+ pl.col("timestamp").dt.year().alias("year"),
1097
+ pl.col("timestamp").dt.month().alias("month"),
1098
+ ])
1099
+ load_curve_aggregate.__dict__.update(new_df.__dict__)
1100
+
1101
+
1016
1102
  def download_aggregate_time_step_load_curve_with_progress(
1017
1103
  bldg_id: BuildingID,
1018
1104
  output_dir: Path,
@@ -1067,6 +1153,7 @@ def download_aggregate_time_step_load_curve_with_progress(
1067
1153
  load_curve_aggregate = _aggregate_load_curve_aggregate(
1068
1154
  load_curve_15min, aggregate_time_step, bldg_id.release_year
1069
1155
  )
1156
+ _add_time_aggregation_columns(load_curve_aggregate, aggregate_time_step)
1070
1157
 
1071
1158
  # Save processed file to final destination
1072
1159
  load_curve_aggregate.write_parquet(output_file)
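
_add_time_aggregation_columns above derives year/month/day/hour columns from the timestamp and then copies the new frame's state back into the caller's object via __dict__.update, because polars with_columns returns a new DataFrame rather than mutating in place. A sketch of the same derivation in the more usual return-a-new-frame style (an alternative, not the package's code):

```python
# Same column derivation, returning the new frame instead of mutating via __dict__.update.
import polars as pl

_TIME_PARTS = {
    "hourly": ["year", "month", "day", "hour"],
    "daily": ["year", "month", "day"],
    "monthly": ["year", "month"],
}


def with_time_parts(df: pl.DataFrame, aggregate_time_step: str) -> pl.DataFrame:
    parts = _TIME_PARTS.get(aggregate_time_step, [])
    # getattr(... .dt, "year")() etc. selects the matching datetime accessor for each part.
    return df.with_columns([getattr(pl.col("timestamp").dt, p)().alias(p) for p in parts])
```

A caller would then write load_curve = with_time_parts(load_curve, "daily") instead of relying on in-place mutation.
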
@@ -1080,28 +1167,32 @@ def download_aggregate_time_step_load_curve_with_progress(
1080
1167
  def _parse_requested_file_type(file_type: tuple[str, ...]) -> RequestedFileTypes:
1081
1168
  """Parse the file type string into a RequestedFileTypes object."""
1082
1169
  file_type_obj = RequestedFileTypes()
1083
- if "hpxml" in file_type:
1084
- file_type_obj.hpxml = True
1085
- if "schedule" in file_type:
1086
- file_type_obj.schedule = True
1087
- if "metadata" in file_type:
1088
- file_type_obj.metadata = True
1089
- if "load_curve_15min" in file_type:
1090
- file_type_obj.load_curve_15min = True
1091
- if "load_curve_hourly" in file_type:
1092
- file_type_obj.load_curve_hourly = True
1093
- if "load_curve_daily" in file_type:
1094
- file_type_obj.load_curve_daily = True
1095
- if "load_curve_monthly" in file_type:
1096
- file_type_obj.load_curve_monthly = True
1097
- if "load_curve_annual" in file_type:
1098
- file_type_obj.load_curve_annual = True
1099
- if "weather" in file_type:
1100
- file_type_obj.weather = True
1170
+
1171
+ # Map file type strings to their corresponding attributes
1172
+ type_mapping = {
1173
+ "hpxml": "hpxml",
1174
+ "schedule": "schedule",
1175
+ "metadata": "metadata",
1176
+ "load_curve_15min": "load_curve_15min",
1177
+ "load_curve_hourly": "load_curve_hourly",
1178
+ "load_curve_daily": "load_curve_daily",
1179
+ "load_curve_monthly": "load_curve_monthly",
1180
+ "load_curve_annual": "load_curve_annual",
1181
+ "trip_schedules": "trip_schedules",
1182
+ "weather": "weather",
1183
+ }
1184
+
1185
+ # Set attributes based on what's in the file_type tuple
1186
+ for type_str, attr_name in type_mapping.items():
1187
+ if type_str in file_type:
1188
+ setattr(file_type_obj, attr_name, True)
1189
+
1101
1190
  return file_type_obj
1102
1191
 
1103
1192
 
1104
- def _process_metadata_results(bldg_ids: list[BuildingID], output_dir: Path, downloaded_paths: list[Path]) -> None:
1193
+ def _filter_metadata_requested_bldg_ids(
1194
+ bldg_ids: list[BuildingID], output_dir: Path, downloaded_paths: list[Path]
1195
+ ) -> None:
1105
1196
  """Process the results of a completed metadata download."""
1106
1197
  metadata_to_bldg_id_mapping: dict[Path, list[int]] = {}
1107
1198
  for bldg_id in bldg_ids:
@@ -1120,14 +1211,69 @@ def _process_metadata_results(bldg_ids: list[BuildingID], output_dir: Path, down
1120
1211
  metadata_to_bldg_id_mapping[output_file] = [bldg_id.bldg_id]
1121
1212
 
1122
1213
  for metadata_file, bldg_id_list in metadata_to_bldg_id_mapping.items():
1123
- # Use scan_parquet for lazy evaluation and better memory efficiency
1124
- metadata_df_filtered = pl.scan_parquet(metadata_file).filter(pl.col("bldg_id").is_in(bldg_id_list)).collect()
1125
- # Write the filtered dataframe back to the same file
1126
- metadata_df_filtered.write_parquet(metadata_file)
1214
+ # Use streaming operations to avoid loading entire file into memory
1215
+ # Stream the data: filter rows, select columns, and write in one operation
1216
+ filtered_metadata_file = pl.scan_parquet(metadata_file).filter(pl.col("bldg_id").is_in(bldg_id_list)).collect()
1217
+
1218
+ # Replace the original file with the filtered one
1219
+ filtered_metadata_file.write_parquet(metadata_file)
1220
+
1221
+ # Force garbage collection to free memory immediately
1222
+ gc.collect()
1127
1223
 
1128
1224
  return
1129
1225
 
1130
1226
 
1227
+ def _process_annual_load_curve_file(file_path: Path) -> None:
1228
+ """Process an annual load curve file to keep only columns containing specified keywords.
1229
+
1230
+ Args:
1231
+ file_path: Path to the annual load curve parquet file to process.
1232
+ """
1233
+ # First, get column names without loading data into memory
1234
+ schema = pl.scan_parquet(file_path).collect_schema()
1235
+
1236
+ # Filter columns to only keep those containing "bldg_id", "upgrade", "metadata_index", or "out."
1237
+ # and remove columns that start with "in."
1238
+ columns_to_keep = []
1239
+ for col in schema:
1240
+ if (
1241
+ any(keyword in col for keyword in ["bldg_id", "upgrade", "metadata_index"]) or col.startswith("out.")
1242
+ ) and not col.startswith("in."):
1243
+ columns_to_keep.append(col)
1244
+
1245
+ # Use streaming operations to avoid loading entire file into memory
1246
+ # Create a temporary file to write the filtered data
1247
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") as temp_file:
1248
+ temp_file_path = temp_file.name
1249
+
1250
+ # Stream the data: select columns and write in one operation
1251
+ filtered_file = pl.scan_parquet(file_path).select(columns_to_keep).collect()
1252
+ filtered_file.write_parquet(temp_file_path)
1253
+
1254
+ # Replace the original file with the filtered one
1255
+ os.replace(temp_file_path, file_path)
1256
+
1257
+ # Force garbage collection to free memory immediately
1258
+ gc.collect()
1259
+
1260
+
1261
+ def _process_annual_load_curve_results(downloaded_paths: list[Path]) -> None:
1262
+ """Process all downloaded annual load curve files to filter columns.
1263
+
1264
+ Args:
1265
+ downloaded_paths: List of all downloaded file paths.
1266
+ """
1267
+ # Filter for annual load curve files
1268
+ annual_load_curve_files = [
1269
+ path for path in downloaded_paths if "load_curve_annual" in str(path) and path.suffix == ".parquet"
1270
+ ]
1271
+
1272
+ # Process each annual load curve file
1273
+ for file_path in annual_load_curve_files:
1274
+ _process_annual_load_curve_file(file_path)
1275
+
1276
+
1131
1277
  def _process_download_results(
1132
1278
  future: concurrent.futures.Future,
1133
1279
  bldg_id: BuildingID,
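
_filter_metadata_requested_bldg_ids above trims each downloaded metadata file down to the building IDs that were actually requested. A small sketch of that per-file row filter, with placeholder path and IDs:

```python
# Row-filter sketch matching the per-file step above; the path and IDs are placeholders.
from pathlib import Path

import polars as pl


def keep_requested_buildings(metadata_file: Path, bldg_ids: list[int]) -> None:
    """Keep only rows whose bldg_id is in the requested list, rewriting the file in place."""
    filtered = pl.scan_parquet(metadata_file).filter(pl.col("bldg_id").is_in(bldg_ids)).collect()
    filtered.write_parquet(metadata_file)
```
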
@@ -1184,14 +1330,9 @@ def _download_metadata_with_progress(
1184
1330
  if download_url in metadata_urls:
1185
1331
  metadata_urls.remove(download_url)
1186
1332
  metadata_task = progress.add_task(
1187
- f"[yellow]Downloading metadata: {download_url}",
1333
+ f"[yellow]Downloading metadata: {bldg_id.get_release_name()} - (upgrade {bldg_id.upgrade_id}) - {bldg_id.state}",
1188
1334
  total=0, # Will be updated when we get the file size
1189
1335
  )
1190
- # Get file size first
1191
- response = requests.head(download_url, timeout=30)
1192
- response.raise_for_status()
1193
- total_size = int(response.headers.get("content-length", 0))
1194
- progress.update(metadata_task, total=total_size)
1195
1336
 
1196
1337
  output_file.parent.mkdir(parents=True, exist_ok=True)
1197
1338
  try:
@@ -1272,62 +1413,64 @@ def _download_15min_load_curves_parallel(
1272
1413
  console: Console,
1273
1414
  ) -> None:
1274
1415
  """Download 15-minute load curves in parallel with progress tracking."""
1275
- # Create progress tasks for 15-minute load curve downloads
1276
- load_curve_tasks = {}
1277
- for i, bldg_id in enumerate(bldg_ids):
1278
- task_id = progress.add_task(
1279
- f"[magenta]Load curve {bldg_id.bldg_id} (upgrade {bldg_id.upgrade_id})",
1280
- total=0, # Will be updated when we get the file size
1281
- )
1282
- load_curve_tasks[i] = task_id
1283
1416
 
1284
- # Create a modified version of the download function that uses the specific task IDs
1417
+ # Create progress tasks based on dataset size
1418
+ if len(bldg_ids) > 500:
1419
+ load_curve_tasks = _create_batch_progress_tasks_15min(bldg_ids, progress, console)
1420
+ else:
1421
+ load_curve_tasks = _create_individual_progress_tasks_15min(bldg_ids, progress)
1422
+
1423
+ # Create download functions
1285
1424
  def download_15min_with_task_id(bldg_id: BuildingID, output_dir: Path, task_id: TaskID) -> Path:
1286
1425
  return download_15min_load_curve_with_progress(bldg_id, output_dir, progress, task_id)
1287
1426
 
1288
1427
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
1289
- future_to_bldg = {
1290
- executor.submit(download_15min_with_task_id, bldg_id, output_dir, load_curve_tasks[i]): bldg_id
1291
- for i, bldg_id in enumerate(bldg_ids)
1292
- }
1428
+ if len(bldg_ids) > 500:
1429
+ # Process in batches for large datasets
1430
+ num_batches = 20
1431
+ batch_size = ((len(bldg_ids) + num_batches - 1) // num_batches + 99) // 100 * 100
1432
+ future_to_bldg = {}
1293
1433
 
1434
+ for batch_idx in range(0, len(bldg_ids), batch_size):
1435
+ batch = bldg_ids[batch_idx : batch_idx + batch_size]
1436
+ # Skip empty batches
1437
+ if not batch:
1438
+ break
1439
+
1440
+ task_id = load_curve_tasks[batch_idx // batch_size]
1441
+
1442
+ for bldg_id in batch:
1443
+ future = executor.submit(
1444
+ _download_15min_with_batch_progress,
1445
+ bldg_id,
1446
+ output_dir,
1447
+ task_id,
1448
+ progress,
1449
+ )
1450
+ future_to_bldg[future] = bldg_id
1451
+ else:
1452
+ # Original behavior for smaller datasets
1453
+ future_to_bldg = {
1454
+ executor.submit(download_15min_with_task_id, bldg_id, output_dir, load_curve_tasks[i]): bldg_id
1455
+ for i, bldg_id in enumerate(bldg_ids)
1456
+ }
1457
+
1458
+ # Process completed futures
1294
1459
  for future in concurrent.futures.as_completed(future_to_bldg):
1295
1460
  bldg_id = future_to_bldg[future]
1296
- try:
1297
- output_file = future.result()
1298
- downloaded_paths.append(output_file)
1299
- except No15minLoadCurveError:
1300
- output_file = (
1301
- output_dir
1302
- / bldg_id.get_release_name()
1303
- / "load_curve_15min"
1304
- / f"state={bldg_id.state}"
1305
- / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
1306
- / f"bldg{str(bldg_id.bldg_id).zfill(7)}_load_curve_15min.parquet"
1307
- )
1308
- failed_downloads.append(str(output_file))
1309
- console.print(f"[red]15 min load curve not available for {bldg_id.get_release_name()}[/red]")
1310
- raise
1311
- except Exception as e:
1312
- output_file = (
1313
- output_dir
1314
- / bldg_id.get_release_name()
1315
- / "load_curve_15min"
1316
- / f"state={bldg_id.state}"
1317
- / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
1318
- / f"bldg{str(bldg_id.bldg_id).zfill(7)}_load_curve_15min.parquet"
1319
- )
1320
- failed_downloads.append(str(output_file))
1321
- console.print(f"[red]Download failed for 15 min load curve {bldg_id.bldg_id}: {e}[/red]")
1461
+ _process_download_future_15min(future, bldg_id, output_dir, downloaded_paths, failed_downloads, console)
1322
1462
 
1323
1463
 
1324
1464
  def _create_batch_progress_tasks(
1325
1465
  bldg_ids: list[BuildingID], aggregate_time_step: str, progress: Progress, console: Console
1326
1466
  ) -> dict[int, TaskID]:
1327
1467
  """Create progress tasks for batch processing."""
1328
- batch_size = 100
1329
- num_batches = (len(bldg_ids) + batch_size - 1) // batch_size
1330
- console.print(f"[blue]Using batch processing: {len(bldg_ids)} buildings split into {num_batches} batches[/blue]")
1468
+ num_batches = 20
1469
+ # Calculate batch size rounded up to nearest 100
1470
+ batch_size = ((len(bldg_ids) + num_batches - 1) // num_batches + 99) // 100 * 100
1471
+ console.print(
1472
+ f"[blue]Using batch processing: {len(bldg_ids)} buildings split into {num_batches} batches of up to {batch_size} buildings each[/blue]"
1473
+ )
1331
1474
 
1332
1475
  load_curve_tasks = {}
1333
1476
  for i in range(num_batches):
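
The batching logic above now fixes the number of progress batches at 20 and rounds the per-batch size up to the nearest 100, which is why the later loops guard against empty batches. A quick worked example of that arithmetic:

```python
# Worked example of the batch-size arithmetic used above.
def batch_size_for(n_buildings: int, num_batches: int = 20) -> int:
    # Ceiling-divide across num_batches, then round up to the nearest 100.
    return ((n_buildings + num_batches - 1) // num_batches + 99) // 100 * 100


# 1,250 buildings -> ceil(1250 / 20) = 63 -> rounded up to 100 per batch,
# so only 13 of the 20 batches receive work and the rest stay empty.
assert batch_size_for(1250) == 100
assert batch_size_for(30000) == 1500
```
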
@@ -1336,6 +1479,10 @@ def _create_batch_progress_tasks(
1336
1479
  end_idx = min(start_idx + batch_size, len(bldg_ids))
1337
1480
  batch_count = end_idx - start_idx
1338
1481
 
1482
+ # Skip empty or negative batches
1483
+ if batch_count <= 0:
1484
+ break
1485
+
1339
1486
  console.print(f"[blue]Batch {i + 1}/{num_batches}: {batch_count} buildings[/blue]")
1340
1487
 
1341
1488
  task_id = progress.add_task(
@@ -1347,6 +1494,39 @@ def _create_batch_progress_tasks(
1347
1494
  return load_curve_tasks
1348
1495
 
1349
1496
 
1497
+ def _create_batch_progress_tasks_15min(
1498
+ bldg_ids: list[BuildingID], progress: Progress, console: Console
1499
+ ) -> dict[int, TaskID]:
1500
+ """Create progress tasks for 15-minute load curve batch processing."""
1501
+ num_batches = 20
1502
+ # Calculate batch size rounded up to nearest 100
1503
+ batch_size = ((len(bldg_ids) + num_batches - 1) // num_batches + 99) // 100 * 100
1504
+ console.print(
1505
+ f"[blue]Using batch processing: {len(bldg_ids)} buildings split into {num_batches} batches of up to {batch_size} buildings each[/blue]"
1506
+ )
1507
+
1508
+ load_curve_tasks = {}
1509
+ for i in range(num_batches):
1510
+ # Calculate how many buildings are in this batch
1511
+ start_idx = i * batch_size
1512
+ end_idx = min(start_idx + batch_size, len(bldg_ids))
1513
+ batch_count = end_idx - start_idx
1514
+
1515
+ # Skip empty or negative batches
1516
+ if batch_count <= 0:
1517
+ break
1518
+
1519
+ console.print(f"[blue]Batch {i + 1}/{num_batches}: {batch_count} buildings[/blue]")
1520
+
1521
+ task_id = progress.add_task(
1522
+ f"[magenta]Batch {i + 1}/{num_batches} (15min)",
1523
+ total=batch_count, # Set total to the number of buildings in this batch
1524
+ )
1525
+ load_curve_tasks[i] = task_id
1526
+
1527
+ return load_curve_tasks
1528
+
1529
+
1350
1530
  def _create_individual_progress_tasks(bldg_ids: list[BuildingID], progress: Progress) -> dict[int, TaskID]:
1351
1531
  """Create progress tasks for individual building processing."""
1352
1532
  load_curve_tasks = {}
@@ -1359,6 +1539,18 @@ def _create_individual_progress_tasks(bldg_ids: list[BuildingID], progress: Prog
1359
1539
  return load_curve_tasks
1360
1540
 
1361
1541
 
1542
+ def _create_individual_progress_tasks_15min(bldg_ids: list[BuildingID], progress: Progress) -> dict[int, TaskID]:
1543
+ """Create progress tasks for individual 15-minute load curve processing."""
1544
+ load_curve_tasks = {}
1545
+ for i, bldg_id in enumerate(bldg_ids):
1546
+ task_id = progress.add_task(
1547
+ f"[magenta]Load curve {bldg_id.bldg_id} (upgrade {bldg_id.upgrade_id})",
1548
+ total=0, # Will be updated when we get the file size
1549
+ )
1550
+ load_curve_tasks[i] = task_id
1551
+ return load_curve_tasks
1552
+
1553
+
1362
1554
  def _download_aggregate_with_batch_progress(
1363
1555
  bldg_id: BuildingID, output_dir: Path, task_id: TaskID, aggregate_time_step: str, progress: Progress
1364
1556
  ) -> Path:
@@ -1370,6 +1562,17 @@ def _download_aggregate_with_batch_progress(
1370
1562
  return result
1371
1563
 
1372
1564
 
1565
+ def _download_15min_with_batch_progress(
1566
+ bldg_id: BuildingID, output_dir: Path, task_id: TaskID, progress: Progress
1567
+ ) -> Path:
1568
+ """Download 15-minute load curve with batch progress tracking."""
1569
+ # Download the file without individual progress tracking
1570
+ result = download_15min_load_curve_with_progress(bldg_id, output_dir, None, None)
1571
+ # Update batch progress by 1
1572
+ progress.update(task_id, advance=1)
1573
+ return result
1574
+
1575
+
1373
1576
  def _process_download_future(
1374
1577
  future: concurrent.futures.Future,
1375
1578
  bldg_id: BuildingID,
@@ -1406,6 +1609,43 @@ def _process_download_future(
1406
1609
  console.print(f"[red]Download failed for monthly load curve {bldg_id.bldg_id}: {e}[/red]")
1407
1610
 
1408
1611
 
1612
+ def _process_download_future_15min(
1613
+ future: concurrent.futures.Future,
1614
+ bldg_id: BuildingID,
1615
+ output_dir: Path,
1616
+ downloaded_paths: list[Path],
1617
+ failed_downloads: list[str],
1618
+ console: Console,
1619
+ ) -> None:
1620
+ """Process a completed 15-minute download future."""
1621
+ try:
1622
+ output_file = future.result()
1623
+ downloaded_paths.append(output_file)
1624
+ except No15minLoadCurveError:
1625
+ output_file = (
1626
+ output_dir
1627
+ / bldg_id.get_release_name()
1628
+ / "load_curve_15min"
1629
+ / f"state={bldg_id.state}"
1630
+ / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
1631
+ / f"bldg{str(bldg_id.bldg_id).zfill(7)}_load_curve_15min.parquet"
1632
+ )
1633
+ failed_downloads.append(str(output_file))
1634
+ console.print(f"[red]15 min load curve not available for {bldg_id.get_release_name()}[/red]")
1635
+ raise
1636
+ except Exception as e:
1637
+ output_file = (
1638
+ output_dir
1639
+ / bldg_id.get_release_name()
1640
+ / "load_curve_15min"
1641
+ / f"state={bldg_id.state}"
1642
+ / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
1643
+ / f"bldg{str(bldg_id.bldg_id).zfill(7)}_load_curve_15min.parquet"
1644
+ )
1645
+ failed_downloads.append(str(output_file))
1646
+ console.print(f"[red]Download failed for 15 min load curve {bldg_id.bldg_id}: {e}[/red]")
1647
+
1648
+
1409
1649
  def _download_aggregate_load_curves_parallel(
1410
1650
  bldg_ids: list[BuildingID],
1411
1651
  output_dir: Path,
@@ -1416,7 +1656,7 @@ def _download_aggregate_load_curves_parallel(
1416
1656
  failed_downloads: list[str],
1417
1657
  console: Console,
1418
1658
  ) -> None:
1419
- """Download monthly load curves in parallel with progress tracking."""
1659
+ """Download aggregate load curves in parallel with progress tracking."""
1420
1660
 
1421
1661
  # Create progress tasks based on dataset size
1422
1662
  if len(bldg_ids) > 500:
@@ -1435,11 +1675,16 @@ def _download_aggregate_load_curves_parallel(
1435
1675
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
1436
1676
  if len(bldg_ids) > 500:
1437
1677
  # Process in batches for large datasets
1438
- batch_size = 100
1678
+ num_batches = 20
1679
+ batch_size = ((len(bldg_ids) + num_batches - 1) // num_batches + 99) // 100 * 100
1439
1680
  future_to_bldg = {}
1440
1681
 
1441
1682
  for batch_idx in range(0, len(bldg_ids), batch_size):
1442
1683
  batch = bldg_ids[batch_idx : batch_idx + batch_size]
1684
+ # Skip empty batches
1685
+ if not batch:
1686
+ break
1687
+
1443
1688
  task_id = load_curve_tasks[batch_idx // batch_size]
1444
1689
 
1445
1690
  for bldg_id in batch:
@@ -1481,7 +1726,8 @@ def _download_metadata(
1481
1726
  if not bldg_ids:
1482
1727
  return
1483
1728
  _download_metadata_with_progress(bldg_ids, output_dir, progress, downloaded_paths, failed_downloads, console)
1484
- _process_metadata_results(bldg_ids, output_dir, downloaded_paths)
1729
+ # Only keep the requested bldg_ids in the metadata file
1730
+ _filter_metadata_requested_bldg_ids(bldg_ids, output_dir, downloaded_paths)
1485
1731
 
1486
1732
 
1487
1733
  def download_annual_load_curve_with_progress(
@@ -1594,6 +1840,132 @@ def _download_annual_load_curves_parallel(
1594
1840
  console.print(f"[red]Download failed for annual load curve {bldg_id.bldg_id}: {e}[/red]")
1595
1841
 
1596
1842
 
1843
+ def _get_parquet_files_for_state(s3_client: Any, bucket: str, s3_prefix: str) -> list[str]:
1844
+ """Get list of parquet files for a given S3 prefix."""
1845
+ paginator = s3_client.get_paginator("list_objects_v2")
1846
+ parquet_files = []
1847
+ for page in paginator.paginate(Bucket=bucket, Prefix=s3_prefix):
1848
+ for obj in page.get("Contents", []):
1849
+ if obj["Key"].endswith(".parquet"):
1850
+ parquet_files.append(obj["Key"])
1851
+ return parquet_files
1852
+
1853
+
1854
+ def _download_and_read_parquet_files(
1855
+ s3_client: Any, bucket: str, parquet_files: list[str], output_dir: Path
1856
+ ) -> list[Any]:
1857
+ """Download and read parquet files, returning a list of dataframes."""
1858
+ # Ensure output directory exists
1859
+ output_dir.mkdir(parents=True, exist_ok=True)
1860
+
1861
+ state_dataframes = []
1862
+ for s3_key in parquet_files:
1863
+ temp_file = output_dir / f"temp_{s3_key.split('/')[-1]}"
1864
+ s3_client.download_file(bucket, s3_key, str(temp_file))
1865
+ df = pl.read_parquet(str(temp_file))
1866
+ state_dataframes.append(df)
1867
+ temp_file.unlink()
1868
+ return state_dataframes
1869
+
1870
+
1871
+ def _process_state_data(
1872
+ s3_client: Any, bucket: str, prefix: str, release: str, state: str, output_dir: Path
1873
+ ) -> tuple[list[Any], bool]:
1874
+ """Process data for a single state, returning (dataframes, has_data)."""
1875
+ s3_prefix = f"{prefix}release={release}/state={state}/"
1876
+ parquet_files = _get_parquet_files_for_state(s3_client, bucket, s3_prefix)
1877
+
1878
+ if not parquet_files:
1879
+ return [], False
1880
+
1881
+ state_dataframes = _download_and_read_parquet_files(s3_client, bucket, parquet_files, output_dir)
1882
+ if state_dataframes:
1883
+ state_combined_df = pl.concat(state_dataframes)
1884
+ return [state_combined_df], True
1885
+ return [], False
1886
+
1887
+
1888
+ def _save_filtered_state_data(
1889
+ state_df: Any, state: str, bldg_ids: list[BuildingID], release: str, output_dir: Path, downloaded_paths: list[Path]
1890
+ ) -> None:
1891
+ """Save filtered data for a specific state."""
1892
+ bldg_id_list = [str(bldg.bldg_id) for bldg in bldg_ids if bldg.state == state]
1893
+ if not bldg_id_list:
1894
+ return
1895
+
1896
+ filtered_df = state_df.filter(pl.col("bldg_id").is_in(bldg_id_list))
1897
+ if filtered_df.height == 0:
1898
+ return
1899
+
1900
+ output_file = output_dir / release / "trip_schedules" / f"state={state}" / "trip_schedules.parquet"
1901
+ output_file.parent.mkdir(parents=True, exist_ok=True)
1902
+ filtered_df.write_parquet(str(output_file))
1903
+ downloaded_paths.append(output_file)
1904
+
1905
+
1906
+ def _download_trip_schedules_data(
1907
+ bldg_ids: list[BuildingID],
1908
+ output_dir: Path,
1909
+ downloaded_paths: list[Path],
1910
+ bucket: str = "buildstock-fetch",
1911
+ prefix: str = "ev_demand/trip_schedules/",
1912
+ ) -> None:
1913
+ """
1914
+ Download and filter trip schedules data for specific building IDs.
1915
+
1916
+ Args:
1917
+ bldg_ids: List of BuildingID objects to filter for.
1918
+ output_dir: Directory to save the downloaded files.
1919
+ downloaded_paths: List to append successful download paths to.
1920
+ bucket: Name of the S3 bucket.
1921
+ prefix: S3 prefix for the trip schedules data.
1922
+
1923
+ Raises:
1924
+ NoBuildingDataError: If no buildings from bldg_ids are found in any available state data.
1925
+ """
1926
+ import warnings
1927
+
1928
+ release = bldg_ids[0].get_release_name()
1929
+ states_list = list({bldg.state for bldg in bldg_ids})
1930
+
1931
+ s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
1932
+
1933
+ all_dataframes = []
1934
+ available_states = []
1935
+ unavailable_states = []
1936
+
1937
+ # Process each state
1938
+ for state in states_list:
1939
+ state_dataframes, has_data = _process_state_data(s3, bucket, prefix, release, state, output_dir)
1940
+
1941
+ if has_data:
1942
+ available_states.append(state)
1943
+ all_dataframes.extend(state_dataframes)
1944
+ else:
1945
+ unavailable_states.append(state)
1946
+
1947
+ # Issue warnings for unavailable states
1948
+ if unavailable_states:
1949
+ warnings.warn(
1950
+ f"No trip schedules data found for {release} in states: {', '.join(unavailable_states)}. "
1951
+ f"Continuing with available states: {', '.join(available_states)}.",
1952
+ stacklevel=2,
1953
+ )
1954
+
1955
+ if not all_dataframes:
1956
+ msg = f"No trip schedules data found for {release} in any of the requested states: {', '.join(states_list)}"
1957
+ raise NoBuildingDataError(msg)
1958
+
1959
+ # Save filtered data for each available state separately
1960
+ for i, state_df in enumerate(all_dataframes):
1961
+ state = available_states[i]
1962
+ _save_filtered_state_data(state_df, state, bldg_ids, release, output_dir, downloaded_paths)
1963
+
1964
+ if not any(bldg.state in available_states for bldg in bldg_ids):
1965
+ msg = f"No trip schedules data found for buildings {[bldg.bldg_id for bldg in bldg_ids]} in {release} for any available state"
1966
+ raise NoBuildingDataError(msg)
1967
+
1968
+
1597
1969
  def _download_weather_files_parallel(
1598
1970
  bldg_ids: list[BuildingID],
1599
1971
  output_dir: Path,
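
The trip-schedules download above lists and fetches public parquet objects with an unsigned (anonymous) boto3 S3 client. A trimmed sketch of that pattern follows; the bucket and prefix mirror the defaults in the diff, and everything else is illustrative.

```python
# Anonymous S3 listing/downloading sketch, as used by the trip-schedules path above.
from pathlib import Path

import boto3
import polars as pl
from botocore import UNSIGNED
from botocore.config import Config


def read_public_parquet_prefix(bucket: str, prefix: str, work_dir: Path) -> list[pl.DataFrame]:
    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))  # no AWS credentials needed
    work_dir.mkdir(parents=True, exist_ok=True)
    frames = []
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            if not obj["Key"].endswith(".parquet"):
                continue
            tmp = work_dir / obj["Key"].split("/")[-1]
            s3.download_file(bucket, obj["Key"], str(tmp))
            frames.append(pl.read_parquet(tmp))
            tmp.unlink()  # keep only the in-memory frame
    return frames


# Hypothetical call:
# frames = read_public_parquet_prefix(
#     "buildstock-fetch", "ev_demand/trip_schedules/release=.../state=NY/", Path("data/tmp"))
```
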
@@ -1731,6 +2103,8 @@ def fetch_bldg_data(
1731
2103
  total_files += len(bldg_ids) # Add 15-minute load curve files
1732
2104
  if file_type_obj.load_curve_hourly:
1733
2105
  total_files += len(bldg_ids) # Add hourly load curve files
2106
+ if file_type_obj.load_curve_daily:
2107
+ total_files += len(bldg_ids) # Add daily load curve files
1734
2108
  if file_type_obj.load_curve_monthly:
1735
2109
  total_files += len(bldg_ids) # Add monthly load curve files
1736
2110
  if file_type_obj.load_curve_annual:
@@ -1767,6 +2141,13 @@ def fetch_bldg_data(
1767
2141
  weather_states,
1768
2142
  )
1769
2143
 
2144
+ # TODO: add EV related files
2145
+ # TODO: Write a function for downloading EV related files from SB's s3 bucket.
2146
+ # It should dynamically build the download url based on the release_name + state combo.
2147
+ # Make sure to follow the directory structure for downloading the files.
2148
+ if file_type_obj.trip_schedules:
2149
+ _download_trip_schedules_data(bldg_ids, output_dir, downloaded_paths)
2150
+
1770
2151
  _print_download_summary(downloaded_paths, failed_downloads, console)
1771
2152
 
1772
2153
  return downloaded_paths, failed_downloads
@@ -1817,6 +2198,19 @@ def _execute_downloads(
1817
2198
  console,
1818
2199
  )
1819
2200
 
2201
+ if file_type_obj.load_curve_daily:
2202
+ aggregate_time_step = "daily"
2203
+ _download_aggregate_load_curves_parallel(
2204
+ bldg_ids,
2205
+ output_dir,
2206
+ aggregate_time_step,
2207
+ max_workers,
2208
+ progress,
2209
+ downloaded_paths,
2210
+ failed_downloads,
2211
+ console,
2212
+ )
2213
+
1820
2214
  if file_type_obj.load_curve_monthly:
1821
2215
  aggregate_time_step = "monthly"
1822
2216
  _download_aggregate_load_curves_parallel(
@@ -1835,6 +2229,8 @@ def _execute_downloads(
1835
2229
  _download_annual_load_curves_parallel(
1836
2230
  bldg_ids, output_dir, max_workers, progress, downloaded_paths, failed_downloads, console
1837
2231
  )
2232
+ # Process annual load curve files to filter columns
2233
+ _process_annual_load_curve_results(downloaded_paths)
1838
2234
 
1839
2235
  # Get weather files if requested.
1840
2236
  if file_type_obj.weather:
@@ -1846,12 +2242,78 @@ def _execute_downloads(
1846
2242
  if __name__ == "__main__": # pragma: no cover
1847
2243
  bldg_ids = [
1848
2244
  BuildingID(
1849
- bldg_id=67, release_year="2024", res_com="comstock", weather="tmy3", upgrade_id="0", release_number="2"
2245
+ bldg_id=19713,
2246
+ release_year="2024",
2247
+ res_com="comstock",
2248
+ weather="amy2018",
2249
+ upgrade_id="0",
2250
+ release_number="2",
2251
+ state="NY",
2252
+ ),
2253
+ BuildingID(
2254
+ bldg_id=658,
2255
+ release_year="2024",
2256
+ res_com="comstock",
2257
+ weather="amy2018",
2258
+ upgrade_id="0",
2259
+ release_number="2",
2260
+ state="NY",
2261
+ ),
2262
+ BuildingID(
2263
+ bldg_id=659,
2264
+ release_year="2024",
2265
+ res_com="comstock",
2266
+ weather="amy2018",
2267
+ upgrade_id="0",
2268
+ release_number="2",
2269
+ state="NY",
1850
2270
  ),
1851
2271
  ]
1852
- file_type = ("weather",)
2272
+ file_type = ("metadata",)
1853
2273
  output_dir = Path("data")
1854
- weather_states: list[str] = []
1855
- downloaded_paths, failed_downloads = fetch_bldg_data(bldg_ids, file_type, output_dir, weather_states=weather_states)
2274
+
2275
+ downloaded_paths, failed_downloads = fetch_bldg_data(bldg_ids, file_type, output_dir)
1856
2276
  print(downloaded_paths)
1857
2277
  print(failed_downloads)
2278
+ bldg_ids = [
2279
+ BuildingID(
2280
+ bldg_id=21023,
2281
+ release_year="2024",
2282
+ res_com="comstock",
2283
+ weather="amy2018",
2284
+ upgrade_id="0",
2285
+ release_number="2",
2286
+ state="NY",
2287
+ ),
2288
+ BuildingID(
2289
+ bldg_id=18403,
2290
+ release_year="2024",
2291
+ res_com="comstock",
2292
+ weather="amy2018",
2293
+ upgrade_id="0",
2294
+ release_number="2",
2295
+ state="NY",
2296
+ ),
2297
+ BuildingID(
2298
+ bldg_id=70769,
2299
+ release_year="2024",
2300
+ res_com="comstock",
2301
+ weather="amy2018",
2302
+ upgrade_id="0",
2303
+ release_number="2",
2304
+ state="NV",
2305
+ ),
2306
+ BuildingID(
2307
+ bldg_id=68227,
2308
+ release_year="2024",
2309
+ res_com="comstock",
2310
+ weather="amy2018",
2311
+ upgrade_id="0",
2312
+ release_number="2",
2313
+ state="NV",
2314
+ ),
2315
+ ]
2316
+ file_type = ("metadata",)
2317
+ output_dir = Path("data")
2318
+
2319
+ downloaded_paths, failed_downloads = fetch_bldg_data(bldg_ids, file_type, output_dir)