buildstock-fetch 1.1.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


buildstock_fetch/main.py CHANGED
@@ -3,6 +3,7 @@ import json
 import tempfile
 import zipfile
 from dataclasses import asdict, dataclass
+from datetime import timedelta
 from importlib.resources import files
 from pathlib import Path
 from typing import Optional, Union
@@ -71,18 +72,18 @@ class UnknownAggregationFunctionError(ValueError):
     pass


+class NoWeatherFileError(ValueError):
+    """Raised when weather file is not available for a release."""
+
+    pass
+
+
 METADATA_DIR = Path(
     str(files("buildstock_fetch").joinpath("data").joinpath("building_data").joinpath("combined_metadata.parquet"))
 )
 RELEASE_JSON_FILE = Path(str(files("buildstock_fetch").joinpath("data").joinpath("buildstock_releases.json")))
-LOAD_CURVE_COLUMN_AGGREGATION = Path(
-    str(
-        files("buildstock_fetch")
-        .joinpath("data")
-        .joinpath("load_curve_column_map")
-        .joinpath("2024_resstock_load_curve_columns.csv")
-    )
-)
+LOAD_CURVE_COLUMN_AGGREGATION = Path(str(files("buildstock_fetch").joinpath("data").joinpath("load_curve_column_map")))
+WEATHER_FILE_DIR = Path(str(files("buildstock_fetch").joinpath("data").joinpath("weather_station_map")))


 @dataclass
@@ -95,6 +96,7 @@ class RequestedFileTypes:
     load_curve_daily: bool = False
     load_curve_monthly: bool = False
     load_curve_annual: bool = False
+    weather: bool = False


 @dataclass
@@ -274,6 +276,85 @@ class BuildingID:
         else:
             return ""

+    def get_weather_file_url(self) -> str:
+        """Generate the S3 download URL for this building."""
+        if self.get_weather_station_name() == "":
+            return ""
+        return self._build_weather_url()
+
+    def _build_weather_url(self) -> str:
+        """Build the weather file URL based on release year and weather type."""
+        if self.release_year == "2021":
+            return self._build_2021_weather_url()
+        elif self.release_year == "2022":
+            return self._build_2022_weather_url()
+        elif self.release_year == "2023":
+            return self._build_2023_weather_url()
+        elif self.release_year == "2024":
+            return self._build_2024_weather_url()
+        elif self.release_year == "2025":
+            return self._build_2025_weather_url()
+        else:
+            return ""
+
+    def _build_2021_weather_url(self) -> str:
+        """Build weather URL for 2021 release."""
+        if self.weather == "tmy3":
+            return f"{self.base_url}weather/{self.weather}/{self.get_weather_station_name()}_tmy3.csv"
+        elif self.weather == "amy2018":
+            return f"{self.base_url}weather/{self.weather}/{self.get_weather_station_name()}_2018.csv"
+        elif self.weather == "amy2012":
+            return f"{self.base_url}weather/{self.weather}/{self.get_weather_station_name()}_2012.csv"
+        else:
+            return ""
+
+    def _build_2022_weather_url(self) -> str:
+        """Build weather URL for 2022 release."""
+        if self.weather == "tmy3":
+            return f"{self.base_url}weather/state={self.state}/{self.get_weather_station_name()}_TMY3.csv"
+        elif self.weather == "amy2018":
+            return f"{self.base_url}weather/state={self.state}/{self.get_weather_station_name()}_2018.csv"
+        elif self.weather == "amy2012":
+            return f"{self.base_url}weather/state={self.state}/{self.get_weather_station_name()}_2012.csv"
+        else:
+            return ""
+
+    def _build_2023_weather_url(self) -> str:
+        """Build weather URL for 2023 release."""
+        if self.weather == "tmy3":
+            return f"{self.base_url}weather/{self.weather}/{self.get_weather_station_name()}_TMY3.csv"
+        elif self.weather == "amy2018":
+            return f"{self.base_url}weather/{self.weather}/{self.get_weather_station_name()}_2018.csv"
+        elif self.weather == "amy2012":
+            return f"{self.base_url}weather/{self.weather}/{self.get_weather_station_name()}_2012.csv"
+        else:
+            return ""
+
+    def _build_2024_weather_url(self) -> str:
+        """Build weather URL for 2024 release."""
+        if self.res_com == "comstock" and self.weather == "amy2018":
+            return f"{self.base_url}weather/{self.weather}/{self.get_weather_station_name()}_2018.csv"
+        else:
+            if self.weather == "tmy3":
+                return f"{self.base_url}weather/state={self.state}/{self.get_weather_station_name()}_TMY3.csv"
+            elif self.weather == "amy2018":
+                return f"{self.base_url}weather/state={self.state}/{self.get_weather_station_name()}_2018.csv"
+            elif self.weather == "amy2012":
+                return f"{self.base_url}weather/state={self.state}/{self.get_weather_station_name()}_2012.csv"
+            else:
+                return ""
+
+    def _build_2025_weather_url(self) -> str:
+        """Build weather URL for 2025 release."""
+        if self.weather == "tmy3":
+            return f"{self.base_url}weather/{self.weather}/{self.get_weather_station_name()}_TMY3.csv"
+        elif self.weather == "amy2018":
+            return f"{self.base_url}weather/{self.weather}/{self.get_weather_station_name()}_2018.csv"
+        elif self.weather == "amy2012":
+            return f"{self.base_url}weather/{self.weather}/{self.get_weather_station_name()}_2012.csv"
+        else:
+            return ""
+
     def get_annual_load_curve_filename(self) -> str:
         """Generate the filename for the annual load curve."""
         if self.release_year == "2021":
@@ -303,6 +384,28 @@ class BuildingID:
         else:
             return ""

+    def get_weather_station_name(self) -> str:
+        """Get the weather station name for this building."""
+        weather_map_df = pl.read_parquet(WEATHER_FILE_DIR)
+
+        # Filter by multiple fields for a more specific match
+        weather_station_map = weather_map_df.filter(
+            (pl.col("product") == self.res_com)
+            & (pl.col("release_year") == self.release_year)
+            & (pl.col("weather_file") == self.weather)
+            & (pl.col("release_version") == self.release_number)
+            & (pl.col("bldg_id") == self.bldg_id)
+        )
+
+        # Check if we found a match
+        if weather_station_map.height > 0:
+            # Return the weather station name from the first (and should be only) match
+            weather_station_name = weather_station_map.select("weather_station_name").item()
+            return str(weather_station_name) if weather_station_name is not None else ""
+        else:
+            # No match found, return empty string
+            return ""
+
     def _build_annual_load_state_url(self) -> str:
         """Build the state-level URL for annual load curve data.

@@ -430,6 +533,11 @@ def _validate_release_name(release_name: str) -> bool:
     return release_name in valid_release_names


+def _resolve_unique_metadata_urls(bldg_ids: list[BuildingID]) -> list[str]:
+    """Resolve the unique metadata URLs for a list of building IDs."""
+    return list({bldg_id.get_metadata_url() for bldg_id in bldg_ids})
+
+
 def fetch_bldg_ids(
     product: str, release_year: str, weather_file: str, release_version: str, state: str, upgrade_id: str
 ) -> list[BuildingID]:
@@ -499,13 +607,13 @@ def fetch_bldg_ids(
 def _download_with_progress(url: str, output_file: Path, progress: Progress, task_id: TaskID) -> int:
     """Download a file with progress tracking."""
     # Get file size first
-    response = requests.head(url, timeout=30)
+    response = requests.head(url, timeout=30, verify=True)
     response.raise_for_status()
     total_size = int(response.headers.get("content-length", 0))
     progress.update(task_id, total=total_size)

     # Download with streaming
-    response = requests.get(url, stream=True, timeout=30)
+    response = requests.get(url, stream=True, timeout=30, verify=True)
     response.raise_for_status()

     downloaded_size = 0
@@ -522,6 +630,65 @@ def _download_with_progress(url: str, output_file: Path, progress: Progress, tas
     return downloaded_size


+def _download_with_progress_metadata(url: str, output_file: Path, progress: Progress, task_id: TaskID) -> int:
+    """Download a metadata file with progress tracking and append to existing file if it exists."""
+    # Get file size first
+    response = requests.head(url, timeout=30, verify=True)
+    response.raise_for_status()
+    total_size = int(response.headers.get("content-length", 0))
+    progress.update(task_id, total=total_size)
+
+    # Download with streaming
+    response = requests.get(url, stream=True, timeout=30, verify=True)
+    response.raise_for_status()
+
+    downloaded_size = 0
+
+    # Check if output file already exists
+    if output_file.exists():
+        # Read existing parquet file
+        existing_df = pl.read_parquet(output_file)
+
+        # Download new data to temporary file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") as temp_file:
+            temp_path = Path(temp_file.name)
+
+        try:
+            # Download to temp file
+            with open(temp_path, "wb") as file:
+                for chunk in response.iter_content(chunk_size=8192):
+                    if chunk:
+                        file.write(chunk)
+                        downloaded_size += len(chunk)
+                        if total_size > 0:
+                            progress.update(task_id, completed=downloaded_size)
+
+            # Read new data
+            new_df = pl.read_parquet(temp_path)
+
+            # Concatenate existing and new data, removing duplicates
+            combined_df = pl.concat([existing_df, new_df]).unique()
+
+            # Write combined data back to original file
+            combined_df.write_parquet(output_file)
+
+        finally:
+            # Clean up temp file
+            if temp_path.exists():
+                temp_path.unlink()
+    else:
+        # File doesn't exist, download normally
+        with open(str(output_file), "wb") as file:
+            for chunk in response.iter_content(chunk_size=8192):
+                if chunk:
+                    file.write(chunk)
+                    downloaded_size += len(chunk)
+                    if total_size > 0:
+                        progress.update(task_id, completed=downloaded_size)
+
+    return downloaded_size
+
+
 def _get_time_step_grouping_key(aggregate_time_step: str) -> tuple[str, str]:
     """Get the grouping key and format string for a given time step.
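Note: the append path in `_download_with_progress_metadata` above rests on Polars' `concat` followed by `unique` to merge a freshly downloaded metadata file into one already on disk. A minimal sketch of just that merge step, using toy frames rather than the real metadata schema:

    import polars as pl

    # Rows present in both the existing file and the new download collapse to one copy,
    # so repeating a request for an overlapping set of buildings does not duplicate rows.
    existing_df = pl.DataFrame({"bldg_id": [1, 2], "state": ["MA", "MA"]})
    new_df = pl.DataFrame({"bldg_id": [2, 3], "state": ["MA", "MA"]})
    combined_df = pl.concat([existing_df, new_df]).unique()
    print(combined_df.sort("bldg_id"))  # bldg_id 1, 2, 3, each exactly once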
 
@@ -579,10 +746,19 @@ def _create_aggregation_expressions(load_curve: pl.DataFrame, column_aggregation
     return agg_exprs


-def _aggregate_load_curve_aggregate(load_curve: pl.DataFrame, aggregate_time_step: str) -> pl.DataFrame:
-    """Aggregate the 15-minute load curve to specified time step based on aggregation rules."""
+def _aggregate_load_curve_aggregate(
+    load_curve: pl.DataFrame, aggregate_time_step: str, release_year: str
+) -> pl.DataFrame:
+    """Aggregate the 15-minute load curve to specified time step based on aggregation rules.
+
+    Removes the last row to ensure complete aggregation periods.
+    """
     # Read the aggregation rules from CSV
-    aggregation_rules = pl.read_csv(LOAD_CURVE_COLUMN_AGGREGATION)
+    if release_year == "2024":
+        load_curve_map = LOAD_CURVE_COLUMN_AGGREGATION.joinpath("2024_resstock_load_curve_columns.csv")
+    elif release_year == "2022":
+        load_curve_map = LOAD_CURVE_COLUMN_AGGREGATION.joinpath("2022_resstock_load_curve_columns.csv")
+    aggregation_rules = pl.read_csv(load_curve_map)

     # Create a dictionary mapping column names to their aggregation functions
     column_aggregations = dict(zip(aggregation_rules["name"], aggregation_rules["Aggregate_function"]))
@@ -595,6 +771,13 @@ def _aggregate_load_curve_aggregate(load_curve: pl.DataFrame, aggregate_time_ste
     # Convert timestamp to datetime if it's not already
     load_curve = load_curve.with_columns(pl.col("timestamp").cast(pl.Datetime))

+    # We want to subtract 15 minutes because the original load curve provides information
+    # for the previous 15 minutes for each timestamp. For example, the first timestamp is 00:00:15,
+    # and the columns correspond to consumption from 00:00:00 to 00:00:15. When aggregating,
+    # we want the 00:00:00 timestamp to correspond to the consumption from 00:00:00 to whenever the
+    # next timestamp is.
+    load_curve = load_curve.with_columns((pl.col("timestamp") - timedelta(minutes=15)).alias("timestamp"))
+
     # Get the grouping key configuration
     grouping_key, format_string = _get_time_step_grouping_key(aggregate_time_step)
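The comment above explains why timestamps are shifted back by 15 minutes before grouping: each stamp labels the interval that ended at it, while aggregation wants interval start times. A hedged illustration of the idea, using `dt.truncate` for hourly bucketing (the package's own grouping goes through `_get_time_step_grouping_key`, not shown here, and the column name is invented):

    from datetime import datetime, timedelta

    import polars as pl

    # Toy 15-minute load curve with end-of-interval timestamps.
    load_curve = pl.DataFrame({
        "timestamp": [
            datetime(2018, 1, 1, 0, 15),
            datetime(2018, 1, 1, 0, 30),
            datetime(2018, 1, 1, 0, 45),
            datetime(2018, 1, 1, 1, 0),
        ],
        "electricity_kwh": [1.0, 1.0, 1.0, 1.0],
    })

    # Shift each stamp to the start of its interval, then sum within each hour:
    # all four readings land in the 00:00 bucket instead of being split 3/1.
    hourly = (
        load_curve.with_columns((pl.col("timestamp") - timedelta(minutes=15)).alias("timestamp"))
        .group_by(pl.col("timestamp").dt.truncate("1h"))
        .agg(pl.col("electricity_kwh").sum())
    )
    print(hourly)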
 
@@ -614,11 +797,11 @@ def _aggregate_load_curve_aggregate(load_curve: pl.DataFrame, aggregate_time_ste


 def _download_and_process_aggregate(
-    url: str, output_file: Path, progress: Progress, task_id: TaskID, aggregate_time_step: str
+    url: str, output_file: Path, progress: Progress, task_id: TaskID, aggregate_time_step: str, release_year: str
 ) -> int:
     """Download aggregate time step load curve to temporary file, process with Polars, and save result."""
     # Get file size first for progress tracking
-    response = requests.head(url, timeout=30)
+    response = requests.head(url, timeout=30, verify=True)
     response.raise_for_status()
     total_size = int(response.headers.get("content-length", 0))
     progress.update(task_id, total=total_size)
@@ -635,7 +818,7 @@ def _download_and_process_aggregate(
     session.mount("https://", retry_strategy)

     # Download with streaming to temp file
-    response = session.get(url, stream=True, timeout=60)
+    response = session.get(url, stream=True, timeout=60, verify=True)
     response.raise_for_status()

     downloaded_size = 0
@@ -649,7 +832,7 @@ def _download_and_process_aggregate(

         # Process with Polars
         load_curve_15min = pl.read_parquet(temp_path)
-        load_curve_aggregate = _aggregate_load_curve_aggregate(load_curve_15min, aggregate_time_step)
+        load_curve_aggregate = _aggregate_load_curve_aggregate(load_curve_15min, aggregate_time_step, release_year)

         # Save processed file to final destination
         load_curve_aggregate.write_parquet(output_file)
@@ -706,7 +889,7 @@ def download_bldg_data(
     if progress and task_id is not None:
         _download_with_progress(download_url, output_file, progress, task_id)
     else:
-        response = requests.get(download_url, timeout=30)
+        response = requests.get(download_url, timeout=30, verify=True)
         response.raise_for_status()
         output_file.write_bytes(response.content)

@@ -763,33 +946,6 @@ def download_bldg_data(
     return downloaded_paths


-def download_metadata(bldg_id: BuildingID, output_dir: Path) -> Path:
-    """Download the metadata for a given building.
-
-    Args:
-        bldg_id: A BuildingID object to download metadata for.
-        output_dir: Directory to save the downloaded metadata.
-    """
-
-    download_url = bldg_id.get_metadata_url()
-    if download_url == "":
-        message = f"Metadata is not available for {bldg_id.get_release_name()}"
-        raise NoMetadataError(message)
-    response = requests.get(download_url, timeout=30)
-    response.raise_for_status()
-    output_file = (
-        output_dir
-        / bldg_id.get_release_name()
-        / "metadata"
-        / f"state={bldg_id.state}"
-        / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
-        / "metadata.parquet"
-    )
-    output_file.parent.mkdir(parents=True, exist_ok=True)
-    output_file.write_bytes(response.content)
-    return output_file
-
-
 def download_15min_load_curve(bldg_id: BuildingID, output_dir: Path) -> Path:
     """Download the 15 min load profile timeseries for a given building.

@@ -802,7 +958,7 @@ def download_15min_load_curve(bldg_id: BuildingID, output_dir: Path) -> Path:
     if download_url == "":
         message = f"15 min load profile timeseries is not available for {bldg_id.get_release_name()}"
         raise No15minLoadCurveError(message)
-    response = requests.get(download_url, timeout=30)
+    response = requests.get(download_url, timeout=30, verify=True)
     response.raise_for_status()
     output_file = (
         output_dir
@@ -850,7 +1006,7 @@ def download_15min_load_curve_with_progress(
     if progress and task_id is not None:
         _download_with_progress(download_url, output_file, progress, task_id)
     else:
-        response = requests.get(download_url, timeout=30)
+        response = requests.get(download_url, timeout=30, verify=True)
         response.raise_for_status()
         output_file.write_bytes(response.content)

@@ -894,19 +1050,23 @@ def download_aggregate_time_step_load_curve_with_progress(

     # Download with progress tracking if progress object is provided
     if progress and task_id is not None:
-        _download_and_process_aggregate(download_url, output_file, progress, task_id, aggregate_time_step)
+        _download_and_process_aggregate(
+            download_url, output_file, progress, task_id, aggregate_time_step, bldg_id.release_year
+        )
     else:
         # For non-progress downloads, still use temp file approach for consistency
         with tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") as temp_file:
             temp_path = Path(temp_file.name)
             try:
-                response = requests.get(download_url, timeout=30)
+                response = requests.get(download_url, timeout=30, verify=True)
                 response.raise_for_status()
                 temp_path.write_bytes(response.content)

                 # Process with Polars
                 load_curve_15min = pl.read_parquet(temp_path)
-                load_curve_aggregate = _aggregate_load_curve_aggregate(load_curve_15min, aggregate_time_step)
+                load_curve_aggregate = _aggregate_load_curve_aggregate(
+                    load_curve_15min, aggregate_time_step, bldg_id.release_year
+                )

                 # Save processed file to final destination
                 load_curve_aggregate.write_parquet(output_file)
@@ -936,9 +1096,38 @@ def _parse_requested_file_type(file_type: tuple[str, ...]) -> RequestedFileTypes
         file_type_obj.load_curve_monthly = True
     if "load_curve_annual" in file_type:
         file_type_obj.load_curve_annual = True
+    if "weather" in file_type:
+        file_type_obj.weather = True
     return file_type_obj


+def _process_metadata_results(bldg_ids: list[BuildingID], output_dir: Path, downloaded_paths: list[Path]) -> None:
+    """Process the results of a completed metadata download."""
+    metadata_to_bldg_id_mapping: dict[Path, list[int]] = {}
+    for bldg_id in bldg_ids:
+        output_file = (
+            output_dir
+            / bldg_id.get_release_name()
+            / "metadata"
+            / f"state={bldg_id.state}"
+            / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
+            / "metadata.parquet"
+        )
+        if output_file in downloaded_paths:
+            if output_file in metadata_to_bldg_id_mapping:
+                metadata_to_bldg_id_mapping[output_file].append(bldg_id.bldg_id)
+            else:
+                metadata_to_bldg_id_mapping[output_file] = [bldg_id.bldg_id]
+
+    for metadata_file, bldg_id_list in metadata_to_bldg_id_mapping.items():
+        # Use scan_parquet for lazy evaluation and better memory efficiency
+        metadata_df_filtered = pl.scan_parquet(metadata_file).filter(pl.col("bldg_id").is_in(bldg_id_list)).collect()
+        # Write the filtered dataframe back to the same file
+        metadata_df_filtered.write_parquet(metadata_file)
+
+    return
+
+
 def _process_download_results(
     future: concurrent.futures.Future,
     bldg_id: BuildingID,
@@ -965,37 +1154,73 @@ def _process_download_results(
         console.print(f"[red]Download failed for bldg_id {bldg_id}: {e}[/red]")


-def _download_metadata_with_progress(bldg: BuildingID, output_dir: Path, progress: Progress) -> Path:
+def _download_metadata_with_progress(
+    bldg_ids: list[BuildingID],
+    output_dir: Path,
+    progress: Progress,
+    downloaded_paths: list[Path],
+    failed_downloads: list[str],
+    console: Console,
+) -> tuple[list[Path], list[str]]:
     """Download metadata file with progress tracking."""
-    download_url = bldg.get_metadata_url()
-    if download_url == "":
-        message = f"Metadata is not available for {bldg.get_release_name()}"
-        raise NoMetadataError(message)
+    metadata_urls = _resolve_unique_metadata_urls(bldg_ids)
+    downloaded_urls: list[str] = []
+    for bldg_id in bldg_ids:
+        output_file = (
+            output_dir
+            / bldg_id.get_release_name()
+            / "metadata"
+            / f"state={bldg_id.state}"
+            / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
+            / "metadata.parquet"
+        )
+        download_url = bldg_id.get_metadata_url()
+        if download_url == "":
+            failed_downloads.append(str(output_file))
+            continue
+        if download_url in downloaded_urls:
+            continue
+        downloaded_urls.append(download_url)
+        if download_url in metadata_urls:
+            metadata_urls.remove(download_url)
+        metadata_task = progress.add_task(
+            f"[yellow]Downloading metadata: {download_url}",
+            total=0,  # Will be updated when we get the file size
+        )
+        # Get file size first
+        response = requests.head(download_url, timeout=30)
+        response.raise_for_status()
+        total_size = int(response.headers.get("content-length", 0))
+        progress.update(metadata_task, total=total_size)

-    # Create metadata task with progress tracking
-    metadata_task = progress.add_task(
-        "[yellow]Downloading metadata",
-        total=0,  # Will be updated when we get the file size
-    )
+        output_file.parent.mkdir(parents=True, exist_ok=True)
+        try:
+            _download_with_progress_metadata(download_url, output_file, progress, metadata_task)
+            downloaded_paths.append(output_file)
+        except Exception as e:
+            failed_downloads.append(str(output_file))
+            console.print(f"[red]Download failed for metadata {bldg_id.bldg_id}: {e}[/red]")

-    # Get file size first
-    response = requests.head(download_url, timeout=30)
-    response.raise_for_status()
-    total_size = int(response.headers.get("content-length", 0))
-    progress.update(metadata_task, total=total_size)
+    return downloaded_paths, failed_downloads

-    # Download with progress
+
+def download_weather_file_with_progress(
+    bldg_id: BuildingID, output_dir: Path, progress: Progress, task_id: TaskID
+) -> Path:
+    """Download weather file with progress tracking."""
+    download_url = bldg_id.get_weather_file_url()
+    if download_url == "":
+        raise NoWeatherFileError()
     output_file = (
         output_dir
-        / bldg.get_release_name()
-        / "metadata"
-        / f"state={bldg.state}"
-        / f"upgrade={str(int(bldg.upgrade_id)).zfill(2)}"
-        / "metadata.parquet"
+        / bldg_id.get_release_name()
+        / "weather"
+        / f"state={bldg_id.state}"
+        / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
+        / f"{bldg_id.get_weather_station_name()}.csv"
     )
     output_file.parent.mkdir(parents=True, exist_ok=True)
-    _download_with_progress(download_url, output_file, progress, metadata_task)
-
+    _download_with_progress(download_url, output_file, progress, task_id)
     return output_file


@@ -1244,19 +1469,19 @@ def _download_aggregate_load_curves_parallel(
     )


-def _download_metadata_single(
+def _download_metadata(
     bldg_ids: list[BuildingID],
     output_dir: Path,
     progress: Progress,
     downloaded_paths: list[Path],
+    failed_downloads: list[str],
+    console: Console,
 ) -> None:
     """Download metadata file (only one needed per release)."""
     if not bldg_ids:
         return
-
-    bldg = bldg_ids[0]
-    metadata_file = _download_metadata_with_progress(bldg, output_dir, progress)
-    downloaded_paths.append(metadata_file)
+    _download_metadata_with_progress(bldg_ids, output_dir, progress, downloaded_paths, failed_downloads, console)
+    _process_metadata_results(bldg_ids, output_dir, downloaded_paths)


 def download_annual_load_curve_with_progress(
@@ -1302,7 +1527,7 @@ def download_annual_load_curve_with_progress(
     if progress and task_id is not None:
         _download_with_progress(download_url, output_file, progress, task_id)
     else:
-        response = requests.get(download_url, timeout=30)
+        response = requests.get(download_url, timeout=30, verify=True)
         response.raise_for_status()
         with open(output_file, "wb") as file:
             file.write(response.content)
@@ -1369,6 +1594,97 @@ def _download_annual_load_curves_parallel(
             console.print(f"[red]Download failed for annual load curve {bldg_id.bldg_id}: {e}[/red]")


+def _download_weather_files_parallel(
+    bldg_ids: list[BuildingID],
+    output_dir: Path,
+    max_workers: int,
+    progress: Progress,
+    downloaded_paths: list[Path],
+    failed_downloads: list[str],
+    console: Console,
+    weather_states: Union[list[str], None] = None,
+) -> None:
+    """Download weather files in parallel with progress tracking."""
+    # Initialize weather_states to empty list if None
+    if weather_states is None:
+        weather_states = []
+
+    # Break if weather_states is empty
+    if len(weather_states) == 0:
+        for bldg_id in bldg_ids:
+            output_file = (
+                output_dir
+                / bldg_id.get_release_name()
+                / "weather"
+                / f"state={bldg_id.state}"
+                / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
+                / f"{bldg_id.get_weather_station_name()}.csv"
+            )
+            failed_downloads.append(str(output_file))
+            console.print(f"[red]Weather file not available for {bldg_id.get_release_name()}[/red]")
+        return
+    # Create progress tasks for weather file downloads
+    weather_file_tasks = {}
+    for i, bldg_id in enumerate(bldg_ids):
+        if bldg_id.state in weather_states:
+            task_id = progress.add_task(
+                f"[magenta]Weather file {bldg_id.bldg_id} (upgrade {bldg_id.upgrade_id})",
+                total=0,  # Will be updated when we get the file size
+            )
+            weather_file_tasks[i] = task_id
+        else:
+            output_file = (
+                output_dir
+                / bldg_id.get_release_name()
+                / "weather"
+                / f"state={bldg_id.state}"
+                / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
+                / f"{bldg_id.get_weather_station_name()}.csv"
+            )
+            failed_downloads.append(str(output_file))
+            console.print(f"[red]Weather file not available for {bldg_id.get_release_name()}[/red]")
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        # Create a modified version of the download function that uses the specific task IDs
+        def download_weather_file_with_task_id(bldg_id: BuildingID, output_dir: Path, task_id: TaskID) -> Path:
+            return download_weather_file_with_progress(bldg_id, output_dir, progress, task_id)
+
+        future_to_bldg = {
+            executor.submit(download_weather_file_with_task_id, bldg_id, output_dir, weather_file_tasks[i]): bldg_id
+            for i, bldg_id in enumerate(bldg_ids)
+        }
+
+        for future in concurrent.futures.as_completed(future_to_bldg):
+            bldg_id = future_to_bldg[future]
+            try:
+                output_file = future.result()
+                downloaded_paths.append(output_file)
+            except NoWeatherFileError:
+                output_file = (
+                    output_dir
+                    / bldg_id.get_release_name()
+                    / "weather"
+                    / f"state={bldg_id.state}"
+                    / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
+                    / f"{bldg_id.get_weather_station_name()}.csv"
+                )
+                failed_downloads.append(str(output_file))
+                console.print(f"[red]Weather file not available for {bldg_id.get_release_name()}[/red]")
+                raise
+            except Exception as e:
+                output_file = (
+                    output_dir
+                    / bldg_id.get_release_name()
+                    / "weather"
+                    / f"state={bldg_id.state}"
+                    / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
+                    / f"{bldg_id.get_weather_station_name()}.csv"
+                )
+                failed_downloads.append(str(output_file))
+                console.print(f"[red]Download failed for weather file {bldg_id.bldg_id}: {e}[/red]")
+                raise
+
 def _print_download_summary(downloaded_paths: list[Path], failed_downloads: list[str], console: Console) -> None:
     """Print a summary of the download results."""
     console.print("\n[bold green]Download complete![/bold green]")
@@ -1380,7 +1696,11 @@ def _print_download_summary(downloaded_paths: list[Path], failed_downloads: list


 def fetch_bldg_data(
-    bldg_ids: list[BuildingID], file_type: tuple[str, ...], output_dir: Path, max_workers: int = 5
+    bldg_ids: list[BuildingID],
+    file_type: tuple[str, ...],
+    output_dir: Path,
+    max_workers: int = 5,
+    weather_states: Union[list[str], None] = None,
 ) -> tuple[list[Path], list[str]]:
     """Download building data for a given list of building ids

@@ -1395,19 +1715,29 @@ def fetch_bldg_data(
     file_type_obj = _parse_requested_file_type(file_type)
     console = Console()

+    # Initialize weather_states to empty list if None
+    if weather_states is None:
+        weather_states = []
+
     downloaded_paths: list[Path] = []
     failed_downloads: list[str] = []

     # Calculate total files to download
     total_files = 0
     if file_type_obj.metadata:
-        total_files += 1  # Add metadata file
+        unique_metadata_urls = _resolve_unique_metadata_urls(bldg_ids)
+        total_files += len(unique_metadata_urls)  # Add metadata file
     if file_type_obj.load_curve_15min:
         total_files += len(bldg_ids)  # Add 15-minute load curve files
+    if file_type_obj.load_curve_hourly:
+        total_files += len(bldg_ids)  # Add hourly load curve files
     if file_type_obj.load_curve_monthly:
-        total_files += len(bldg_ids)  # Add 15-minute load curve files
+        total_files += len(bldg_ids)  # Add monthly load curve files
     if file_type_obj.load_curve_annual:
         total_files += len(bldg_ids)  # Add annual load curve files
+    if file_type_obj.weather:
+        available_bldg_ids = [bldg_id for bldg_id in bldg_ids if bldg_id.state in weather_states]
+        total_files += len(available_bldg_ids) * len(weather_states)  # Add weather map files

     console.print(f"\n[bold blue]Starting download of {total_files} files...[/bold blue]")

@@ -1425,45 +1755,103 @@ def fetch_bldg_data(
         console=console,
         transient=False,
     ) as progress:
-        # Download building data if requested.
-        if file_type_obj.hpxml or file_type_obj.schedule:
-            _download_building_data_parallel(
-                bldg_ids, file_type_obj, output_dir, max_workers, progress, downloaded_paths, failed_downloads, console
-            )
+        _execute_downloads(
+            file_type_obj,
+            bldg_ids,
+            output_dir,
+            max_workers,
+            progress,
+            downloaded_paths,
+            failed_downloads,
+            console,
+            weather_states,
+        )

-        # Get metadata if requested. Only one building is needed to get the metadata.
-        if file_type_obj.metadata:
-            _download_metadata_single(bldg_ids, output_dir, progress, downloaded_paths)
+    _print_download_summary(downloaded_paths, failed_downloads, console)

-        # Get 15 min load profile timeseries if requested.
-        if file_type_obj.load_curve_15min:
-            _download_15min_load_curves_parallel(
-                bldg_ids, output_dir, max_workers, progress, downloaded_paths, failed_downloads, console
-            )
+    return downloaded_paths, failed_downloads

-        if file_type_obj.load_curve_monthly:
-            aggregate_time_step = "monthly"
-            _download_aggregate_load_curves_parallel(
-                bldg_ids,
-                output_dir,
-                aggregate_time_step,
-                max_workers,
-                progress,
-                downloaded_paths,
-                failed_downloads,
-                console,
-            )

-        # Get annual load curve if requested.
-        if file_type_obj.load_curve_annual:
-            _download_annual_load_curves_parallel(
-                bldg_ids, output_dir, max_workers, progress, downloaded_paths, failed_downloads, console
-            )
+def _execute_downloads(
+    file_type_obj: RequestedFileTypes,
+    bldg_ids: list[BuildingID],
+    output_dir: Path,
+    max_workers: int,
+    progress: Progress,
+    downloaded_paths: list[Path],
+    failed_downloads: list[str],
+    console: Console,
+    weather_states: Union[list[str], None] = None,
+) -> None:
+    """Execute all requested downloads based on file type configuration."""
+    # Initialize weather_states to empty list if None
+    if weather_states is None:
+        weather_states = []
+
+    # Download building data if requested.
+    if file_type_obj.hpxml or file_type_obj.schedule:
+        _download_building_data_parallel(
+            bldg_ids, file_type_obj, output_dir, max_workers, progress, downloaded_paths, failed_downloads, console
+        )

-    _print_download_summary(downloaded_paths, failed_downloads, console)
+    # Get metadata if requested. Only one building is needed to get the metadata.
+    if file_type_obj.metadata:
+        _download_metadata(bldg_ids, output_dir, progress, downloaded_paths, failed_downloads, console)

-    return downloaded_paths, failed_downloads
+    # Get 15 min load profile timeseries if requested.
+    if file_type_obj.load_curve_15min:
+        _download_15min_load_curves_parallel(
+            bldg_ids, output_dir, max_workers, progress, downloaded_paths, failed_downloads, console
+        )
+
+    if file_type_obj.load_curve_hourly:
+        aggregate_time_step = "hourly"
+        _download_aggregate_load_curves_parallel(
+            bldg_ids,
+            output_dir,
+            aggregate_time_step,
+            max_workers,
+            progress,
+            downloaded_paths,
+            failed_downloads,
+            console,
+        )
+
+    if file_type_obj.load_curve_monthly:
+        aggregate_time_step = "monthly"
+        _download_aggregate_load_curves_parallel(
+            bldg_ids,
+            output_dir,
+            aggregate_time_step,
+            max_workers,
+            progress,
+            downloaded_paths,
+            failed_downloads,
+            console,
+        )
+
+    # Get annual load curve if requested.
+    if file_type_obj.load_curve_annual:
+        _download_annual_load_curves_parallel(
+            bldg_ids, output_dir, max_workers, progress, downloaded_paths, failed_downloads, console
+        )
+
+    # Get weather files if requested.
+    if file_type_obj.weather:
+        _download_weather_files_parallel(
+            bldg_ids, output_dir, max_workers, progress, downloaded_paths, failed_downloads, console, weather_states
+        )


 if __name__ == "__main__":  # pragma: no cover
-    print(fetch_bldg_ids("comstock", "2021", "tmy3", "1", "MA", "0")[:3])
+    bldg_ids = [
+        BuildingID(
+            bldg_id=67, release_year="2024", res_com="comstock", weather="tmy3", upgrade_id="0", release_number="2"
+        ),
+    ]
+    file_type = ("weather",)
+    output_dir = Path("data")
+    weather_states: list[str] = []
+    downloaded_paths, failed_downloads = fetch_bldg_data(bldg_ids, file_type, output_dir, weather_states=weather_states)
+    print(downloaded_paths)
+    print(failed_downloads)
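Taken together, the new `weather` file type only downloads files for buildings whose state appears in `weather_states`; anything else is reported under failed downloads rather than fetched. A hedged usage sketch (the release and building values are illustrative and assume that release is actually queryable through `fetch_bldg_ids`):

    from pathlib import Path

    from buildstock_fetch.main import fetch_bldg_data, fetch_bldg_ids

    # Grab a few Massachusetts buildings, then request their weather files and metadata.
    bldg_ids = fetch_bldg_ids("resstock", "2022", "tmy3", "1", "MA", "0")[:3]

    downloaded, failed = fetch_bldg_data(
        bldg_ids,
        file_type=("weather", "metadata"),
        output_dir=Path("data"),
        weather_states=["MA"],  # buildings in states not listed here end up in `failed`
    )
    print(downloaded)
    print(failed)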