buildstock-fetch 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff reflects the changes between publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.

This version of buildstock-fetch has been flagged as potentially problematic.

buildstock_fetch/main.py CHANGED
@@ -1,15 +1,20 @@
1
1
  import concurrent.futures
2
+ import gc
2
3
  import json
4
+ import os
3
5
  import tempfile
4
6
  import zipfile
5
7
  from dataclasses import asdict, dataclass
6
8
  from datetime import timedelta
7
9
  from importlib.resources import files
8
10
  from pathlib import Path
9
- from typing import Optional, Union
11
+ from typing import Any, Optional, Union
10
12
 
13
+ import boto3
11
14
  import polars as pl
12
15
  import requests
16
+ from botocore import UNSIGNED
17
+ from botocore.config import Config
13
18
  from rich.console import Console
14
19
  from rich.progress import (
15
20
  BarColumn,
@@ -23,6 +28,8 @@ from rich.progress import (
23
28
  TransferSpeedColumn,
24
29
  )
25
30
 
31
+ # from buildstock_fetch.main_cli import _get_all_available_releases
32
+
26
33
 
27
34
  class InvalidProductError(ValueError):
28
35
  """Raised when an invalid product is provided."""
@@ -96,6 +103,7 @@ class RequestedFileTypes:
96
103
  load_curve_daily: bool = False
97
104
  load_curve_monthly: bool = False
98
105
  load_curve_annual: bool = False
106
+ trip_schedules: bool = False
99
107
  weather: bool = False
100
108
 
101
109
 
@@ -193,10 +201,14 @@ class BuildingID:
193
201
  return f"{self.base_url}metadata/upgrade{str(int(self.upgrade_id)).zfill(2)}.parquet"
194
202
  elif self.release_year == "2024":
195
203
  if self.res_com == "comstock" and self.weather == "amy2018" and self.release_number == "2":
196
- return ""
197
- # This release does not have a single national metadata file.
198
- # Instead, it has a metadata file for each county.
199
- # We need a way to download them all and combine based on the state
204
+ if self.upgrade_id == "0":
205
+ upgrade_filename = "baseline"
206
+ else:
207
+ upgrade_filename = f"upgrade{str(int(self.upgrade_id)).zfill(2)}"
208
+ return (
209
+ f"{self.base_url}metadata_and_annual_results/by_state_and_county/full/parquet/"
210
+ f"state={self.state}/county={self._get_county_name()}/{self.state}_{self._get_county_name()}_{upgrade_filename}.parquet"
211
+ )
200
212
  else:
201
213
  if self.upgrade_id == "0":
202
214
  return f"{self.base_url}metadata/baseline.parquet"
@@ -206,12 +218,12 @@ class BuildingID:
206
218
  self.release_year == "2025"
207
219
  and self.res_com == "comstock"
208
220
  and self.weather == "amy2018"
209
- and self.release_number == "1"
221
+ and (self.release_number == "1" or self.release_number == "2")
210
222
  ):
211
- return ""
212
- # This release does not have a single national metadata file.
213
- # Instead, it has a metadata file for each county.
214
- # We need a way to download them all and combine based on the state
223
+ return (
224
+ f"{self.base_url}metadata_and_annual_results/by_state_and_county/full/parquet/"
225
+ f"state={self.state}/county={self._get_county_name()}/{self.state}_{self._get_county_name()}_upgrade{self.upgrade_id}.parquet"
226
+ )
215
227
  else:
216
228
  return ""
217
229
 
@@ -630,6 +642,21 @@ def _download_with_progress(url: str, output_file: Path, progress: Progress, tas
630
642
  return downloaded_size
631
643
 
632
644
 
645
+ def _extract_metadata_columns_to_keep(metadata_file: Path) -> list[str]:
646
+ """Extract metadata columns from a schema."""
647
+ schema = pl.scan_parquet(metadata_file).collect_schema()
648
+
649
+ columns_to_keep = []
650
+ for col in schema:
651
+ if (
652
+ any(keyword in col for keyword in ["upgrade", "bldg_id", "metadata_index"])
653
+ or col.startswith("in.")
655
+ ):
656
+ columns_to_keep.append(col)
657
+ return columns_to_keep
658
+
659
+
633
660
  def _download_with_progress_metadata(url: str, output_file: Path, progress: Progress, task_id: TaskID) -> int:
634
661
  """Download a metadata file with progress tracking and append to existing file if it exists."""
635
662
  # Get file size first
@@ -646,36 +673,26 @@ def _download_with_progress_metadata(url: str, output_file: Path, progress: Prog
646
673
 
647
674
  # Check if output file already exists
648
675
  if output_file.exists():
649
- # Read existing parquet file
650
- existing_df = pl.read_parquet(output_file)
651
-
652
- # Download new data to temporary file
653
676
  with tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") as temp_file:
654
677
  temp_path = Path(temp_file.name)
678
+ with open(temp_path, "wb") as file:
679
+ for chunk in response.iter_content(chunk_size=8192):
680
+ if chunk:
681
+ file.write(chunk)
682
+ downloaded_size += len(chunk)
683
+ if total_size > 0:
684
+ progress.update(task_id, completed=downloaded_size)
685
+ _process_single_metadata_file(temp_path)
655
686
 
656
- try:
657
- # Download to temp file
658
- with open(temp_path, "wb") as file:
659
- for chunk in response.iter_content(chunk_size=8192):
660
- if chunk:
661
- file.write(chunk)
662
- downloaded_size += len(chunk)
663
- if total_size > 0:
664
- progress.update(task_id, completed=downloaded_size)
665
-
666
- # Read new data
667
- new_df = pl.read_parquet(temp_path)
668
-
669
- # Concatenate existing and new data, removing duplicates
670
- combined_df = pl.concat([existing_df, new_df]).unique()
671
-
672
- # Write combined data back to original file
673
- combined_df.write_parquet(output_file)
687
+ existing_file = pl.scan_parquet(output_file)
688
+ new_file = pl.scan_parquet(temp_path)
689
+ combined_file = pl.concat([existing_file, new_file])
690
+ # Remove duplicate rows based on bldg_id column
691
+ deduplicated_file = combined_file.collect().unique(subset=["bldg_id"], keep="first")
692
+ deduplicated_file.write_parquet(output_file)
693
+ gc.collect()
694
+ os.remove(temp_path)
674
695
 
675
- finally:
676
- # Clean up temp file
677
- if temp_path.exists():
678
- temp_path.unlink()
679
696
  else:
680
697
  # File doesn't exist, download normally
681
698
  with open(str(output_file), "wb") as file:
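
The rewritten metadata append path above no longer reads both parquet files eagerly; it scans them lazily, concatenates, and drops duplicate bldg_id rows before overwriting the output file. A standalone sketch of that concat-and-dedup step, with placeholder paths:

```python
# Concat-and-dedup sketch mirroring the append path above; paths are placeholders.
from pathlib import Path

import polars as pl


def append_parquet_dedup(existing: Path, incoming: Path) -> None:
    """Append rows from `incoming` to `existing`, keeping one row per bldg_id."""
    combined = pl.concat([pl.scan_parquet(existing), pl.scan_parquet(incoming)])
    # Materialize once, keep the first occurrence of each bldg_id, then overwrite in place.
    combined.collect().unique(subset=["bldg_id"], keep="first").write_parquet(existing)
```
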
@@ -686,9 +703,47 @@ def _download_with_progress_metadata(url: str, output_file: Path, progress: Prog
686
703
  if total_size > 0:
687
704
  progress.update(task_id, completed=downloaded_size)
688
705
 
706
+ _process_single_metadata_file(output_file)
707
+
689
708
  return downloaded_size
690
709
 
691
710
 
711
+ def _process_single_metadata_file(metadata_file: Path) -> None:
712
+ """Process a single metadata file to keep only columns containing specified keywords."""
713
+ # First, get column names without loading data into memory
714
+ schema = pl.scan_parquet(metadata_file).collect_schema()
715
+
716
+ # Keep columns containing "bldg_id", "upgrade", or "metadata_index",
717
+ # plus building-characteristic columns that start with "in."
718
+ columns_to_keep = []
719
+ for col in schema:
720
+ if any(keyword in col for keyword in ["bldg_id", "upgrade", "metadata_index"]) or col.startswith("in."):
721
+ columns_to_keep.append(col)
722
+
723
+ # Use streaming operations to avoid loading entire file into memory
724
+ # Create a temporary file to write the filtered data
725
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") as temp_file:
726
+ temp_file_path = temp_file.name
727
+
728
+ try:
729
+ # Stream the data: select columns and write in one operation
730
+ filtered_metadata_file = pl.scan_parquet(metadata_file).select(columns_to_keep).collect()
731
+ filtered_metadata_file.write_parquet(temp_file_path)
732
+
733
+ # Replace the original file with the filtered one
734
+ os.replace(temp_file_path, metadata_file)
735
+
736
+ # Force garbage collection to free memory immediately
737
+ gc.collect()
738
+
739
+ except Exception:
740
+ # Clean up temp file if something goes wrong
741
+ if os.path.exists(temp_file_path):
742
+ os.remove(temp_file_path)
743
+ raise
744
+ return
745
+
746
+
692
747
  def _get_time_step_grouping_key(aggregate_time_step: str) -> tuple[str, str]:
693
748
  """Get the grouping key and format string for a given time step.
694
749
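
_process_single_metadata_file above rewrites a parquet file in place by streaming the selected columns into a temporary file and then swapping it over the original with os.replace. A generic sketch of that write-then-replace pattern; the column filter here is illustrative, not the package's exact rule.

```python
# Write-then-replace sketch of the in-place rewrite pattern above; the column filter is illustrative.
import os
import tempfile
from pathlib import Path

import polars as pl


def rewrite_parquet_with_columns(path: Path, columns: list[str]) -> None:
    """Rewrite `path` keeping only `columns`, swapping the finished file over the original."""
    # Create the temp file next to the target so os.replace stays on one filesystem.
    fd, tmp_path = tempfile.mkstemp(suffix=".parquet", dir=path.parent)
    os.close(fd)  # polars writes to the path itself; we only needed a unique name
    try:
        pl.scan_parquet(path).select(columns).collect().write_parquet(tmp_path)
        os.replace(tmp_path, path)
    except Exception:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
```
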
 
@@ -749,10 +804,7 @@ def _create_aggregation_expressions(load_curve: pl.DataFrame, column_aggregation
749
804
  def _aggregate_load_curve_aggregate(
750
805
  load_curve: pl.DataFrame, aggregate_time_step: str, release_year: str
751
806
  ) -> pl.DataFrame:
752
- """Aggregate the 15-minute load curve to specified time step based on aggregation rules.
753
-
754
- Removes the last row to ensure complete aggregation periods.
755
- """
807
+ """Aggregate the 15-minute load curve to specified time step based on aggregation rules."""
756
808
  # Read the aggregation rules from CSV
757
809
  if release_year == "2024":
758
810
  load_curve_map = LOAD_CURVE_COLUMN_AGGREGATION.joinpath("2024_resstock_load_curve_columns.csv")
@@ -833,6 +885,7 @@ def _download_and_process_aggregate(
833
885
  # Process with Polars
834
886
  load_curve_15min = pl.read_parquet(temp_path)
835
887
  load_curve_aggregate = _aggregate_load_curve_aggregate(load_curve_15min, aggregate_time_step, release_year)
888
+ _add_time_aggregation_columns(load_curve_aggregate, aggregate_time_step)
836
889
 
837
890
  # Save processed file to final destination
838
891
  load_curve_aggregate.write_parquet(output_file)
@@ -1013,6 +1066,39 @@ def download_15min_load_curve_with_progress(
1013
1066
  return output_file
1014
1067
 
1015
1068
 
1069
+ def _add_time_aggregation_columns(load_curve_aggregate: pl.DataFrame, aggregate_time_step: str) -> None:
1070
+ """Add time-based columns to the dataframe based on aggregation type.
1071
+
1072
+ Args:
1073
+ load_curve_aggregate: Polars DataFrame with a 'timestamp' column (modified in place)
1074
+ aggregate_time_step: Type of aggregation ('hourly', 'daily', 'monthly')
1075
+ """
1076
+ if aggregate_time_step == "hourly":
1077
+ # Add year, month, day, and hour columns
1078
+ new_df = load_curve_aggregate.with_columns([
1079
+ pl.col("timestamp").dt.year().alias("year"),
1080
+ pl.col("timestamp").dt.month().alias("month"),
1081
+ pl.col("timestamp").dt.day().alias("day"),
1082
+ pl.col("timestamp").dt.hour().alias("hour"),
1083
+ ])
1084
+ load_curve_aggregate.__dict__.update(new_df.__dict__)
1085
+ elif aggregate_time_step == "daily":
1086
+ # Add year, month, and day columns
1087
+ new_df = load_curve_aggregate.with_columns([
1088
+ pl.col("timestamp").dt.year().alias("year"),
1089
+ pl.col("timestamp").dt.month().alias("month"),
1090
+ pl.col("timestamp").dt.day().alias("day"),
1091
+ ])
1092
+ load_curve_aggregate.__dict__.update(new_df.__dict__)
1093
+ elif aggregate_time_step == "monthly":
1094
+ # Add year and month columns
1095
+ new_df = load_curve_aggregate.with_columns([
1096
+ pl.col("timestamp").dt.year().alias("year"),
1097
+ pl.col("timestamp").dt.month().alias("month"),
1098
+ ])
1099
+ load_curve_aggregate.__dict__.update(new_df.__dict__)
1100
+
1101
+
1016
1102
  def download_aggregate_time_step_load_curve_with_progress(
1017
1103
  bldg_id: BuildingID,
1018
1104
  output_dir: Path,
@@ -1067,6 +1153,7 @@ def download_aggregate_time_step_load_curve_with_progress(
1067
1153
  load_curve_aggregate = _aggregate_load_curve_aggregate(
1068
1154
  load_curve_15min, aggregate_time_step, bldg_id.release_year
1069
1155
  )
1156
+ _add_time_aggregation_columns(load_curve_aggregate, aggregate_time_step)
1070
1157
 
1071
1158
  # Save processed file to final destination
1072
1159
  load_curve_aggregate.write_parquet(output_file)
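
_add_time_aggregation_columns above derives year/month/day/hour columns from the timestamp and then copies the new frame's state back into the caller's object via __dict__.update, because polars with_columns returns a new DataFrame rather than mutating in place. A sketch of the same derivation in the more usual return-a-new-frame style (an alternative, not the package's code):

```python
# Same column derivation, returning the new frame instead of mutating via __dict__.update.
import polars as pl

_TIME_PARTS = {
    "hourly": ["year", "month", "day", "hour"],
    "daily": ["year", "month", "day"],
    "monthly": ["year", "month"],
}


def with_time_parts(df: pl.DataFrame, aggregate_time_step: str) -> pl.DataFrame:
    parts = _TIME_PARTS.get(aggregate_time_step, [])
    # getattr(... .dt, "year")() etc. selects the matching datetime accessor for each part.
    return df.with_columns([getattr(pl.col("timestamp").dt, p)().alias(p) for p in parts])
```

A caller would then write load_curve = with_time_parts(load_curve, "daily") instead of relying on in-place mutation.
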
@@ -1080,28 +1167,32 @@ def download_aggregate_time_step_load_curve_with_progress(
1080
1167
  def _parse_requested_file_type(file_type: tuple[str, ...]) -> RequestedFileTypes:
1081
1168
  """Parse the file type string into a RequestedFileTypes object."""
1082
1169
  file_type_obj = RequestedFileTypes()
1083
- if "hpxml" in file_type:
1084
- file_type_obj.hpxml = True
1085
- if "schedule" in file_type:
1086
- file_type_obj.schedule = True
1087
- if "metadata" in file_type:
1088
- file_type_obj.metadata = True
1089
- if "load_curve_15min" in file_type:
1090
- file_type_obj.load_curve_15min = True
1091
- if "load_curve_hourly" in file_type:
1092
- file_type_obj.load_curve_hourly = True
1093
- if "load_curve_daily" in file_type:
1094
- file_type_obj.load_curve_daily = True
1095
- if "load_curve_monthly" in file_type:
1096
- file_type_obj.load_curve_monthly = True
1097
- if "load_curve_annual" in file_type:
1098
- file_type_obj.load_curve_annual = True
1099
- if "weather" in file_type:
1100
- file_type_obj.weather = True
1170
+
1171
+ # Map file type strings to their corresponding attributes
1172
+ type_mapping = {
1173
+ "hpxml": "hpxml",
1174
+ "schedule": "schedule",
1175
+ "metadata": "metadata",
1176
+ "load_curve_15min": "load_curve_15min",
1177
+ "load_curve_hourly": "load_curve_hourly",
1178
+ "load_curve_daily": "load_curve_daily",
1179
+ "load_curve_monthly": "load_curve_monthly",
1180
+ "load_curve_annual": "load_curve_annual",
1181
+ "trip_schedules": "trip_schedules",
1182
+ "weather": "weather",
1183
+ }
1184
+
1185
+ # Set attributes based on what's in the file_type tuple
1186
+ for type_str, attr_name in type_mapping.items():
1187
+ if type_str in file_type:
1188
+ setattr(file_type_obj, attr_name, True)
1189
+
1101
1190
  return file_type_obj
1102
1191
 
1103
1192
 
1104
- def _process_metadata_results(bldg_ids: list[BuildingID], output_dir: Path, downloaded_paths: list[Path]) -> None:
1193
+ def _filter_metadata_requested_bldg_ids(
1194
+ bldg_ids: list[BuildingID], output_dir: Path, downloaded_paths: list[Path]
1195
+ ) -> None:
1105
1196
  """Process the results of a completed metadata download."""
1106
1197
  metadata_to_bldg_id_mapping: dict[Path, list[int]] = {}
1107
1198
  for bldg_id in bldg_ids:
@@ -1120,14 +1211,69 @@ def _process_metadata_results(bldg_ids: list[BuildingID], output_dir: Path, down
1120
1211
  metadata_to_bldg_id_mapping[output_file] = [bldg_id.bldg_id]
1121
1212
 
1122
1213
  for metadata_file, bldg_id_list in metadata_to_bldg_id_mapping.items():
1123
- # Use scan_parquet for lazy evaluation and better memory efficiency
1124
- metadata_df_filtered = pl.scan_parquet(metadata_file).filter(pl.col("bldg_id").is_in(bldg_id_list)).collect()
1125
- # Write the filtered dataframe back to the same file
1126
- metadata_df_filtered.write_parquet(metadata_file)
1214
+ # Use streaming operations to avoid loading entire file into memory
1215
+ # Stream the data: filter rows, select columns, and write in one operation
1216
+ filtered_metadata_file = pl.scan_parquet(metadata_file).filter(pl.col("bldg_id").is_in(bldg_id_list)).collect()
1217
+
1218
+ # Replace the original file with the filtered one
1219
+ filtered_metadata_file.write_parquet(metadata_file)
1220
+
1221
+ # Force garbage collection to free memory immediately
1222
+ gc.collect()
1127
1223
 
1128
1224
  return
1129
1225
 
1130
1226
 
1227
+ def _process_annual_load_curve_file(file_path: Path) -> None:
1228
+ """Process an annual load curve file to keep only columns containing specified keywords.
1229
+
1230
+ Args:
1231
+ file_path: Path to the annual load curve parquet file to process.
1232
+ """
1233
+ # First, get column names without loading data into memory
1234
+ schema = pl.scan_parquet(file_path).collect_schema()
1235
+
1236
+ # Filter columns to only keep those containing "bldg_id", "upgrade", "metadata_index", or "out."
1237
+ # and remove columns that start with "in."
1238
+ columns_to_keep = []
1239
+ for col in schema:
1240
+ if (
1241
+ any(keyword in col for keyword in ["bldg_id", "upgrade", "metadata_index"]) or col.startswith("out.")
1242
+ ) and not col.startswith("in."):
1243
+ columns_to_keep.append(col)
1244
+
1245
+ # Use streaming operations to avoid loading entire file into memory
1246
+ # Create a temporary file to write the filtered data
1247
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") as temp_file:
1248
+ temp_file_path = temp_file.name
1249
+
1250
+ # Stream the data: select columns and write in one operation
1251
+ filtered_file = pl.scan_parquet(file_path).select(columns_to_keep).collect()
1252
+ filtered_file.write_parquet(temp_file_path)
1253
+
1254
+ # Replace the original file with the filtered one
1255
+ os.replace(temp_file_path, file_path)
1256
+
1257
+ # Force garbage collection to free memory immediately
1258
+ gc.collect()
1259
+
1260
+
1261
+ def _process_annual_load_curve_results(downloaded_paths: list[Path]) -> None:
1262
+ """Process all downloaded annual load curve files to filter columns.
1263
+
1264
+ Args:
1265
+ downloaded_paths: List of all downloaded file paths.
1266
+ """
1267
+ # Filter for annual load curve files
1268
+ annual_load_curve_files = [
1269
+ path for path in downloaded_paths if "load_curve_annual" in str(path) and path.suffix == ".parquet"
1270
+ ]
1271
+
1272
+ # Process each annual load curve file
1273
+ for file_path in annual_load_curve_files:
1274
+ _process_annual_load_curve_file(file_path)
1275
+
1276
+
1131
1277
  def _process_download_results(
1132
1278
  future: concurrent.futures.Future,
1133
1279
  bldg_id: BuildingID,
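
_filter_metadata_requested_bldg_ids above trims each downloaded metadata file down to the building IDs that were actually requested. A small sketch of that per-file row filter, with placeholder path and IDs:

```python
# Row-filter sketch matching the per-file step above; the path and IDs are placeholders.
from pathlib import Path

import polars as pl


def keep_requested_buildings(metadata_file: Path, bldg_ids: list[int]) -> None:
    """Keep only rows whose bldg_id is in the requested list, rewriting the file in place."""
    filtered = pl.scan_parquet(metadata_file).filter(pl.col("bldg_id").is_in(bldg_ids)).collect()
    filtered.write_parquet(metadata_file)
```
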
@@ -1184,14 +1330,9 @@ def _download_metadata_with_progress(
1184
1330
  if download_url in metadata_urls:
1185
1331
  metadata_urls.remove(download_url)
1186
1332
  metadata_task = progress.add_task(
1187
- f"[yellow]Downloading metadata: {download_url}",
1333
+ f"[yellow]Downloading metadata: {bldg_id.get_release_name()} - (upgrade {bldg_id.upgrade_id}) - {bldg_id.state}",
1188
1334
  total=0, # Will be updated when we get the file size
1189
1335
  )
1190
- # Get file size first
1191
- response = requests.head(download_url, timeout=30)
1192
- response.raise_for_status()
1193
- total_size = int(response.headers.get("content-length", 0))
1194
- progress.update(metadata_task, total=total_size)
1195
1336
 
1196
1337
  output_file.parent.mkdir(parents=True, exist_ok=True)
1197
1338
  try:
@@ -1272,62 +1413,64 @@ def _download_15min_load_curves_parallel(
1272
1413
  console: Console,
1273
1414
  ) -> None:
1274
1415
  """Download 15-minute load curves in parallel with progress tracking."""
1275
- # Create progress tasks for 15-minute load curve downloads
1276
- load_curve_tasks = {}
1277
- for i, bldg_id in enumerate(bldg_ids):
1278
- task_id = progress.add_task(
1279
- f"[magenta]Load curve {bldg_id.bldg_id} (upgrade {bldg_id.upgrade_id})",
1280
- total=0, # Will be updated when we get the file size
1281
- )
1282
- load_curve_tasks[i] = task_id
1283
1416
 
1284
- # Create a modified version of the download function that uses the specific task IDs
1417
+ # Create progress tasks based on dataset size
1418
+ if len(bldg_ids) > 500:
1419
+ load_curve_tasks = _create_batch_progress_tasks_15min(bldg_ids, progress, console)
1420
+ else:
1421
+ load_curve_tasks = _create_individual_progress_tasks_15min(bldg_ids, progress)
1422
+
1423
+ # Create download functions
1285
1424
  def download_15min_with_task_id(bldg_id: BuildingID, output_dir: Path, task_id: TaskID) -> Path:
1286
1425
  return download_15min_load_curve_with_progress(bldg_id, output_dir, progress, task_id)
1287
1426
 
1288
1427
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
1289
- future_to_bldg = {
1290
- executor.submit(download_15min_with_task_id, bldg_id, output_dir, load_curve_tasks[i]): bldg_id
1291
- for i, bldg_id in enumerate(bldg_ids)
1292
- }
1428
+ if len(bldg_ids) > 500:
1429
+ # Process in batches for large datasets
1430
+ num_batches = 20
1431
+ batch_size = ((len(bldg_ids) + num_batches - 1) // num_batches + 99) // 100 * 100
1432
+ future_to_bldg = {}
1293
1433
 
1434
+ for batch_idx in range(0, len(bldg_ids), batch_size):
1435
+ batch = bldg_ids[batch_idx : batch_idx + batch_size]
1436
+ # Skip empty batches
1437
+ if not batch:
1438
+ break
1439
+
1440
+ task_id = load_curve_tasks[batch_idx // batch_size]
1441
+
1442
+ for bldg_id in batch:
1443
+ future = executor.submit(
1444
+ _download_15min_with_batch_progress,
1445
+ bldg_id,
1446
+ output_dir,
1447
+ task_id,
1448
+ progress,
1449
+ )
1450
+ future_to_bldg[future] = bldg_id
1451
+ else:
1452
+ # Original behavior for smaller datasets
1453
+ future_to_bldg = {
1454
+ executor.submit(download_15min_with_task_id, bldg_id, output_dir, load_curve_tasks[i]): bldg_id
1455
+ for i, bldg_id in enumerate(bldg_ids)
1456
+ }
1457
+
1458
+ # Process completed futures
1294
1459
  for future in concurrent.futures.as_completed(future_to_bldg):
1295
1460
  bldg_id = future_to_bldg[future]
1296
- try:
1297
- output_file = future.result()
1298
- downloaded_paths.append(output_file)
1299
- except No15minLoadCurveError:
1300
- output_file = (
1301
- output_dir
1302
- / bldg_id.get_release_name()
1303
- / "load_curve_15min"
1304
- / f"state={bldg_id.state}"
1305
- / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
1306
- / f"bldg{str(bldg_id.bldg_id).zfill(7)}_load_curve_15min.parquet"
1307
- )
1308
- failed_downloads.append(str(output_file))
1309
- console.print(f"[red]15 min load curve not available for {bldg_id.get_release_name()}[/red]")
1310
- raise
1311
- except Exception as e:
1312
- output_file = (
1313
- output_dir
1314
- / bldg_id.get_release_name()
1315
- / "load_curve_15min"
1316
- / f"state={bldg_id.state}"
1317
- / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
1318
- / f"bldg{str(bldg_id.bldg_id).zfill(7)}_load_curve_15min.parquet"
1319
- )
1320
- failed_downloads.append(str(output_file))
1321
- console.print(f"[red]Download failed for 15 min load curve {bldg_id.bldg_id}: {e}[/red]")
1461
+ _process_download_future_15min(future, bldg_id, output_dir, downloaded_paths, failed_downloads, console)
1322
1462
 
1323
1463
 
1324
1464
  def _create_batch_progress_tasks(
1325
1465
  bldg_ids: list[BuildingID], aggregate_time_step: str, progress: Progress, console: Console
1326
1466
  ) -> dict[int, TaskID]:
1327
1467
  """Create progress tasks for batch processing."""
1328
- batch_size = 100
1329
- num_batches = (len(bldg_ids) + batch_size - 1) // batch_size
1330
- console.print(f"[blue]Using batch processing: {len(bldg_ids)} buildings split into {num_batches} batches[/blue]")
1468
+ num_batches = 20
1469
+ # Calculate batch size rounded up to nearest 100
1470
+ batch_size = ((len(bldg_ids) + num_batches - 1) // num_batches + 99) // 100 * 100
1471
+ console.print(
1472
+ f"[blue]Using batch processing: {len(bldg_ids)} buildings split into {num_batches} batches of up to {batch_size} buildings each[/blue]"
1473
+ )
1331
1474
 
1332
1475
  load_curve_tasks = {}
1333
1476
  for i in range(num_batches):
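
The batching logic above now fixes the number of progress batches at 20 and rounds the per-batch size up to the nearest 100, which is why the later loops guard against empty batches. A quick worked example of that arithmetic:

```python
# Worked example of the batch-size arithmetic used above.
def batch_size_for(n_buildings: int, num_batches: int = 20) -> int:
    # Ceiling-divide across num_batches, then round up to the nearest 100.
    return ((n_buildings + num_batches - 1) // num_batches + 99) // 100 * 100


# 1,250 buildings -> ceil(1250 / 20) = 63 -> rounded up to 100 per batch,
# so only 13 of the 20 batches receive work and the rest stay empty.
assert batch_size_for(1250) == 100
assert batch_size_for(30000) == 1500
```
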
@@ -1336,6 +1479,10 @@ def _create_batch_progress_tasks(
1336
1479
  end_idx = min(start_idx + batch_size, len(bldg_ids))
1337
1480
  batch_count = end_idx - start_idx
1338
1481
 
1482
+ # Skip empty or negative batches
1483
+ if batch_count <= 0:
1484
+ break
1485
+
1339
1486
  console.print(f"[blue]Batch {i + 1}/{num_batches}: {batch_count} buildings[/blue]")
1340
1487
 
1341
1488
  task_id = progress.add_task(
@@ -1347,6 +1494,39 @@ def _create_batch_progress_tasks(
1347
1494
  return load_curve_tasks
1348
1495
 
1349
1496
 
1497
+ def _create_batch_progress_tasks_15min(
1498
+ bldg_ids: list[BuildingID], progress: Progress, console: Console
1499
+ ) -> dict[int, TaskID]:
1500
+ """Create progress tasks for 15-minute load curve batch processing."""
1501
+ num_batches = 20
1502
+ # Calculate batch size rounded up to nearest 100
1503
+ batch_size = ((len(bldg_ids) + num_batches - 1) // num_batches + 99) // 100 * 100
1504
+ console.print(
1505
+ f"[blue]Using batch processing: {len(bldg_ids)} buildings split into {num_batches} batches of up to {batch_size} buildings each[/blue]"
1506
+ )
1507
+
1508
+ load_curve_tasks = {}
1509
+ for i in range(num_batches):
1510
+ # Calculate how many buildings are in this batch
1511
+ start_idx = i * batch_size
1512
+ end_idx = min(start_idx + batch_size, len(bldg_ids))
1513
+ batch_count = end_idx - start_idx
1514
+
1515
+ # Skip empty or negative batches
1516
+ if batch_count <= 0:
1517
+ break
1518
+
1519
+ console.print(f"[blue]Batch {i + 1}/{num_batches}: {batch_count} buildings[/blue]")
1520
+
1521
+ task_id = progress.add_task(
1522
+ f"[magenta]Batch {i + 1}/{num_batches} (15min)",
1523
+ total=batch_count, # Set total to the number of buildings in this batch
1524
+ )
1525
+ load_curve_tasks[i] = task_id
1526
+
1527
+ return load_curve_tasks
1528
+
1529
+
1350
1530
  def _create_individual_progress_tasks(bldg_ids: list[BuildingID], progress: Progress) -> dict[int, TaskID]:
1351
1531
  """Create progress tasks for individual building processing."""
1352
1532
  load_curve_tasks = {}
@@ -1359,6 +1539,18 @@ def _create_individual_progress_tasks(bldg_ids: list[BuildingID], progress: Prog
1359
1539
  return load_curve_tasks
1360
1540
 
1361
1541
 
1542
+ def _create_individual_progress_tasks_15min(bldg_ids: list[BuildingID], progress: Progress) -> dict[int, TaskID]:
1543
+ """Create progress tasks for individual 15-minute load curve processing."""
1544
+ load_curve_tasks = {}
1545
+ for i, bldg_id in enumerate(bldg_ids):
1546
+ task_id = progress.add_task(
1547
+ f"[magenta]Load curve {bldg_id.bldg_id} (upgrade {bldg_id.upgrade_id})",
1548
+ total=0, # Will be updated when we get the file size
1549
+ )
1550
+ load_curve_tasks[i] = task_id
1551
+ return load_curve_tasks
1552
+
1553
+
1362
1554
  def _download_aggregate_with_batch_progress(
1363
1555
  bldg_id: BuildingID, output_dir: Path, task_id: TaskID, aggregate_time_step: str, progress: Progress
1364
1556
  ) -> Path:
@@ -1370,6 +1562,17 @@ def _download_aggregate_with_batch_progress(
1370
1562
  return result
1371
1563
 
1372
1564
 
1565
+ def _download_15min_with_batch_progress(
1566
+ bldg_id: BuildingID, output_dir: Path, task_id: TaskID, progress: Progress
1567
+ ) -> Path:
1568
+ """Download 15-minute load curve with batch progress tracking."""
1569
+ # Download the file without individual progress tracking
1570
+ result = download_15min_load_curve_with_progress(bldg_id, output_dir, None, None)
1571
+ # Update batch progress by 1
1572
+ progress.update(task_id, advance=1)
1573
+ return result
1574
+
1575
+
1373
1576
  def _process_download_future(
1374
1577
  future: concurrent.futures.Future,
1375
1578
  bldg_id: BuildingID,
@@ -1406,6 +1609,43 @@ def _process_download_future(
1406
1609
  console.print(f"[red]Download failed for monthly load curve {bldg_id.bldg_id}: {e}[/red]")
1407
1610
 
1408
1611
 
1612
+ def _process_download_future_15min(
1613
+ future: concurrent.futures.Future,
1614
+ bldg_id: BuildingID,
1615
+ output_dir: Path,
1616
+ downloaded_paths: list[Path],
1617
+ failed_downloads: list[str],
1618
+ console: Console,
1619
+ ) -> None:
1620
+ """Process a completed 15-minute download future."""
1621
+ try:
1622
+ output_file = future.result()
1623
+ downloaded_paths.append(output_file)
1624
+ except No15minLoadCurveError:
1625
+ output_file = (
1626
+ output_dir
1627
+ / bldg_id.get_release_name()
1628
+ / "load_curve_15min"
1629
+ / f"state={bldg_id.state}"
1630
+ / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
1631
+ / f"bldg{str(bldg_id.bldg_id).zfill(7)}_load_curve_15min.parquet"
1632
+ )
1633
+ failed_downloads.append(str(output_file))
1634
+ console.print(f"[red]15 min load curve not available for {bldg_id.get_release_name()}[/red]")
1635
+ raise
1636
+ except Exception as e:
1637
+ output_file = (
1638
+ output_dir
1639
+ / bldg_id.get_release_name()
1640
+ / "load_curve_15min"
1641
+ / f"state={bldg_id.state}"
1642
+ / f"upgrade={str(int(bldg_id.upgrade_id)).zfill(2)}"
1643
+ / f"bldg{str(bldg_id.bldg_id).zfill(7)}_load_curve_15min.parquet"
1644
+ )
1645
+ failed_downloads.append(str(output_file))
1646
+ console.print(f"[red]Download failed for 15 min load curve {bldg_id.bldg_id}: {e}[/red]")
1647
+
1648
+
1409
1649
  def _download_aggregate_load_curves_parallel(
1410
1650
  bldg_ids: list[BuildingID],
1411
1651
  output_dir: Path,
@@ -1416,7 +1656,7 @@ def _download_aggregate_load_curves_parallel(
1416
1656
  failed_downloads: list[str],
1417
1657
  console: Console,
1418
1658
  ) -> None:
1419
- """Download monthly load curves in parallel with progress tracking."""
1659
+ """Download aggregate load curves in parallel with progress tracking."""
1420
1660
 
1421
1661
  # Create progress tasks based on dataset size
1422
1662
  if len(bldg_ids) > 500:
@@ -1435,11 +1675,16 @@ def _download_aggregate_load_curves_parallel(
1435
1675
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
1436
1676
  if len(bldg_ids) > 500:
1437
1677
  # Process in batches for large datasets
1438
- batch_size = 100
1678
+ num_batches = 20
1679
+ batch_size = ((len(bldg_ids) + num_batches - 1) // num_batches + 99) // 100 * 100
1439
1680
  future_to_bldg = {}
1440
1681
 
1441
1682
  for batch_idx in range(0, len(bldg_ids), batch_size):
1442
1683
  batch = bldg_ids[batch_idx : batch_idx + batch_size]
1684
+ # Skip empty batches
1685
+ if not batch:
1686
+ break
1687
+
1443
1688
  task_id = load_curve_tasks[batch_idx // batch_size]
1444
1689
 
1445
1690
  for bldg_id in batch:
@@ -1481,7 +1726,8 @@ def _download_metadata(
1481
1726
  if not bldg_ids:
1482
1727
  return
1483
1728
  _download_metadata_with_progress(bldg_ids, output_dir, progress, downloaded_paths, failed_downloads, console)
1484
- _process_metadata_results(bldg_ids, output_dir, downloaded_paths)
1729
+ # Only keep the requested bldg_ids in the metadata file
1730
+ _filter_metadata_requested_bldg_ids(bldg_ids, output_dir, downloaded_paths)
1485
1731
 
1486
1732
 
1487
1733
  def download_annual_load_curve_with_progress(
@@ -1594,6 +1840,132 @@ def _download_annual_load_curves_parallel(
1594
1840
  console.print(f"[red]Download failed for annual load curve {bldg_id.bldg_id}: {e}[/red]")
1595
1841
 
1596
1842
 
1843
+ def _get_parquet_files_for_state(s3_client: Any, bucket: str, s3_prefix: str) -> list[str]:
1844
+ """Get list of parquet files for a given S3 prefix."""
1845
+ paginator = s3_client.get_paginator("list_objects_v2")
1846
+ parquet_files = []
1847
+ for page in paginator.paginate(Bucket=bucket, Prefix=s3_prefix):
1848
+ for obj in page.get("Contents", []):
1849
+ if obj["Key"].endswith(".parquet"):
1850
+ parquet_files.append(obj["Key"])
1851
+ return parquet_files
1852
+
1853
+
1854
+ def _download_and_read_parquet_files(
1855
+ s3_client: Any, bucket: str, parquet_files: list[str], output_dir: Path
1856
+ ) -> list[Any]:
1857
+ """Download and read parquet files, returning a list of dataframes."""
1858
+ # Ensure output directory exists
1859
+ output_dir.mkdir(parents=True, exist_ok=True)
1860
+
1861
+ state_dataframes = []
1862
+ for s3_key in parquet_files:
1863
+ temp_file = output_dir / f"temp_{s3_key.split('/')[-1]}"
1864
+ s3_client.download_file(bucket, s3_key, str(temp_file))
1865
+ df = pl.read_parquet(str(temp_file))
1866
+ state_dataframes.append(df)
1867
+ temp_file.unlink()
1868
+ return state_dataframes
1869
+
1870
+
1871
+ def _process_state_data(
1872
+ s3_client: Any, bucket: str, prefix: str, release: str, state: str, output_dir: Path
1873
+ ) -> tuple[list[Any], bool]:
1874
+ """Process data for a single state, returning (dataframes, has_data)."""
1875
+ s3_prefix = f"{prefix}release={release}/state={state}/"
1876
+ parquet_files = _get_parquet_files_for_state(s3_client, bucket, s3_prefix)
1877
+
1878
+ if not parquet_files:
1879
+ return [], False
1880
+
1881
+ state_dataframes = _download_and_read_parquet_files(s3_client, bucket, parquet_files, output_dir)
1882
+ if state_dataframes:
1883
+ state_combined_df = pl.concat(state_dataframes)
1884
+ return [state_combined_df], True
1885
+ return [], False
1886
+
1887
+
1888
+ def _save_filtered_state_data(
1889
+ state_df: Any, state: str, bldg_ids: list[BuildingID], release: str, output_dir: Path, downloaded_paths: list[Path]
1890
+ ) -> None:
1891
+ """Save filtered data for a specific state."""
1892
+ bldg_id_list = [str(bldg.bldg_id) for bldg in bldg_ids if bldg.state == state]
1893
+ if not bldg_id_list:
1894
+ return
1895
+
1896
+ filtered_df = state_df.filter(pl.col("bldg_id").is_in(bldg_id_list))
1897
+ if filtered_df.height == 0:
1898
+ return
1899
+
1900
+ output_file = output_dir / release / "trip_schedules" / f"state={state}" / "trip_schedules.parquet"
1901
+ output_file.parent.mkdir(parents=True, exist_ok=True)
1902
+ filtered_df.write_parquet(str(output_file))
1903
+ downloaded_paths.append(output_file)
1904
+
1905
+
1906
+ def _download_trip_schedules_data(
1907
+ bldg_ids: list[BuildingID],
1908
+ output_dir: Path,
1909
+ downloaded_paths: list[Path],
1910
+ bucket: str = "buildstock-fetch",
1911
+ prefix: str = "ev_demand/trip_schedules/",
1912
+ ) -> None:
1913
+ """
1914
+ Download and filter trip schedules data for specific building IDs.
1915
+
1916
+ Args:
1917
+ bldg_ids: List of BuildingID objects to filter for.
1918
+ output_dir: Directory to save the downloaded files.
1919
+ downloaded_paths: List to append successful download paths to.
1920
+ bucket: Name of the S3 bucket.
1921
+ prefix: S3 prefix for the trip schedules data.
1922
+
1923
+ Raises:
1924
+ NoBuildingDataError: If no buildings from bldg_ids are found in any available state data.
1925
+ """
1926
+ import warnings
1927
+
1928
+ release = bldg_ids[0].get_release_name()
1929
+ states_list = list({bldg.state for bldg in bldg_ids})
1930
+
1931
+ s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
1932
+
1933
+ all_dataframes = []
1934
+ available_states = []
1935
+ unavailable_states = []
1936
+
1937
+ # Process each state
1938
+ for state in states_list:
1939
+ state_dataframes, has_data = _process_state_data(s3, bucket, prefix, release, state, output_dir)
1940
+
1941
+ if has_data:
1942
+ available_states.append(state)
1943
+ all_dataframes.extend(state_dataframes)
1944
+ else:
1945
+ unavailable_states.append(state)
1946
+
1947
+ # Issue warnings for unavailable states
1948
+ if unavailable_states:
1949
+ warnings.warn(
1950
+ f"No trip schedules data found for {release} in states: {', '.join(unavailable_states)}. "
1951
+ f"Continuing with available states: {', '.join(available_states)}.",
1952
+ stacklevel=2,
1953
+ )
1954
+
1955
+ if not all_dataframes:
1956
+ msg = f"No trip schedules data found for {release} in any of the requested states: {', '.join(states_list)}"
1957
+ raise NoBuildingDataError(msg)
1958
+
1959
+ # Save filtered data for each available state separately
1960
+ for i, state_df in enumerate(all_dataframes):
1961
+ state = available_states[i]
1962
+ _save_filtered_state_data(state_df, state, bldg_ids, release, output_dir, downloaded_paths)
1963
+
1964
+ if not any(bldg.state in available_states for bldg in bldg_ids):
1965
+ msg = f"No trip schedules data found for buildings {[bldg.bldg_id for bldg in bldg_ids]} in {release} for any available state"
1966
+ raise NoBuildingDataError(msg)
1967
+
1968
+
1597
1969
  def _download_weather_files_parallel(
1598
1970
  bldg_ids: list[BuildingID],
1599
1971
  output_dir: Path,
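
The trip-schedules download above lists and fetches public parquet objects with an unsigned (anonymous) boto3 S3 client. A trimmed sketch of that pattern follows; the bucket and prefix mirror the defaults in the diff, and everything else is illustrative.

```python
# Anonymous S3 listing/downloading sketch, as used by the trip-schedules path above.
from pathlib import Path

import boto3
import polars as pl
from botocore import UNSIGNED
from botocore.config import Config


def read_public_parquet_prefix(bucket: str, prefix: str, work_dir: Path) -> list[pl.DataFrame]:
    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))  # no AWS credentials needed
    work_dir.mkdir(parents=True, exist_ok=True)
    frames = []
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            if not obj["Key"].endswith(".parquet"):
                continue
            tmp = work_dir / obj["Key"].split("/")[-1]
            s3.download_file(bucket, obj["Key"], str(tmp))
            frames.append(pl.read_parquet(tmp))
            tmp.unlink()  # keep only the in-memory frame
    return frames


# Hypothetical call:
# frames = read_public_parquet_prefix(
#     "buildstock-fetch", "ev_demand/trip_schedules/release=.../state=NY/", Path("data/tmp"))
```
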
@@ -1731,6 +2103,8 @@ def fetch_bldg_data(
1731
2103
  total_files += len(bldg_ids) # Add 15-minute load curve files
1732
2104
  if file_type_obj.load_curve_hourly:
1733
2105
  total_files += len(bldg_ids) # Add hourly load curve files
2106
+ if file_type_obj.load_curve_daily:
2107
+ total_files += len(bldg_ids) # Add daily load curve files
1734
2108
  if file_type_obj.load_curve_monthly:
1735
2109
  total_files += len(bldg_ids) # Add monthly load curve files
1736
2110
  if file_type_obj.load_curve_annual:
@@ -1767,6 +2141,13 @@ def fetch_bldg_data(
1767
2141
  weather_states,
1768
2142
  )
1769
2143
 
2144
+ # TODO: add EV related files
2145
+ # TODO: Write a function for downloading EV related files from SB's s3 bucket.
2146
+ # It should dynamically build the download url based on the release_name + state combo.
2147
+ # Make sure to follow the directory structure for downloading the files.
2148
+ if file_type_obj.trip_schedules:
2149
+ _download_trip_schedules_data(bldg_ids, output_dir, downloaded_paths)
2150
+
1770
2151
  _print_download_summary(downloaded_paths, failed_downloads, console)
1771
2152
 
1772
2153
  return downloaded_paths, failed_downloads
@@ -1817,6 +2198,19 @@ def _execute_downloads(
1817
2198
  console,
1818
2199
  )
1819
2200
 
2201
+ if file_type_obj.load_curve_daily:
2202
+ aggregate_time_step = "daily"
2203
+ _download_aggregate_load_curves_parallel(
2204
+ bldg_ids,
2205
+ output_dir,
2206
+ aggregate_time_step,
2207
+ max_workers,
2208
+ progress,
2209
+ downloaded_paths,
2210
+ failed_downloads,
2211
+ console,
2212
+ )
2213
+
1820
2214
  if file_type_obj.load_curve_monthly:
1821
2215
  aggregate_time_step = "monthly"
1822
2216
  _download_aggregate_load_curves_parallel(
@@ -1835,6 +2229,8 @@ def _execute_downloads(
1835
2229
  _download_annual_load_curves_parallel(
1836
2230
  bldg_ids, output_dir, max_workers, progress, downloaded_paths, failed_downloads, console
1837
2231
  )
2232
+ # Process annual load curve files to filter columns
2233
+ _process_annual_load_curve_results(downloaded_paths)
1838
2234
 
1839
2235
  # Get weather files if requested.
1840
2236
  if file_type_obj.weather:
@@ -1846,12 +2242,78 @@ def _execute_downloads(
1846
2242
  if __name__ == "__main__": # pragma: no cover
1847
2243
  bldg_ids = [
1848
2244
  BuildingID(
1849
- bldg_id=67, release_year="2024", res_com="comstock", weather="tmy3", upgrade_id="0", release_number="2"
2245
+ bldg_id=19713,
2246
+ release_year="2024",
2247
+ res_com="comstock",
2248
+ weather="amy2018",
2249
+ upgrade_id="0",
2250
+ release_number="2",
2251
+ state="NY",
2252
+ ),
2253
+ BuildingID(
2254
+ bldg_id=658,
2255
+ release_year="2024",
2256
+ res_com="comstock",
2257
+ weather="amy2018",
2258
+ upgrade_id="0",
2259
+ release_number="2",
2260
+ state="NY",
2261
+ ),
2262
+ BuildingID(
2263
+ bldg_id=659,
2264
+ release_year="2024",
2265
+ res_com="comstock",
2266
+ weather="amy2018",
2267
+ upgrade_id="0",
2268
+ release_number="2",
2269
+ state="NY",
1850
2270
  ),
1851
2271
  ]
1852
- file_type = ("weather",)
2272
+ file_type = ("metadata",)
1853
2273
  output_dir = Path("data")
1854
- weather_states: list[str] = []
1855
- downloaded_paths, failed_downloads = fetch_bldg_data(bldg_ids, file_type, output_dir, weather_states=weather_states)
2274
+
2275
+ downloaded_paths, failed_downloads = fetch_bldg_data(bldg_ids, file_type, output_dir)
1856
2276
  print(downloaded_paths)
1857
2277
  print(failed_downloads)
2278
+ bldg_ids = [
2279
+ BuildingID(
2280
+ bldg_id=21023,
2281
+ release_year="2024",
2282
+ res_com="comstock",
2283
+ weather="amy2018",
2284
+ upgrade_id="0",
2285
+ release_number="2",
2286
+ state="NY",
2287
+ ),
2288
+ BuildingID(
2289
+ bldg_id=18403,
2290
+ release_year="2024",
2291
+ res_com="comstock",
2292
+ weather="amy2018",
2293
+ upgrade_id="0",
2294
+ release_number="2",
2295
+ state="NY",
2296
+ ),
2297
+ BuildingID(
2298
+ bldg_id=70769,
2299
+ release_year="2024",
2300
+ res_com="comstock",
2301
+ weather="amy2018",
2302
+ upgrade_id="0",
2303
+ release_number="2",
2304
+ state="NV",
2305
+ ),
2306
+ BuildingID(
2307
+ bldg_id=68227,
2308
+ release_year="2024",
2309
+ res_com="comstock",
2310
+ weather="amy2018",
2311
+ upgrade_id="0",
2312
+ release_number="2",
2313
+ state="NV",
2314
+ ),
2315
+ ]
2316
+ file_type = ("metadata",)
2317
+ output_dir = Path("data")
2318
+
2319
+ downloaded_paths, failed_downloads = fetch_bldg_data(bldg_ids, file_type, output_dir)