PyPI - climagrid - Versions diffs - 0.1.0__py3-none-any.whl - Mend

climagrid 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

climagrid/__init__.py +17 -0
climagrid/assets/__init__.py +4 -0
climagrid/assets/joiner.py +177 -0
climagrid/assets/registry.py +153 -0
climagrid/cli.py +166 -0
climagrid/features/__init__.py +15 -0
climagrid/features/conductor_sag.py +136 -0
climagrid/features/freeze_thaw.py +83 -0
climagrid/features/ice_loading.py +110 -0
climagrid/features/soil.py +94 -0
climagrid/features/thermal.py +129 -0
climagrid/features/wildfire.py +157 -0
climagrid/outputs/__init__.py +9 -0
climagrid/outputs/exporters.py +165 -0
climagrid/pipeline/__init__.py +3 -0
climagrid/pipeline/orchestrator.py +190 -0
climagrid/schema.py +175 -0
climagrid/sources/__init__.py +21 -0
climagrid/sources/base.py +126 -0
climagrid/sources/nasa_power.py +118 -0
climagrid/sources/noaa_hrrr.py +206 -0
climagrid/sources/noaa_ncei.py +158 -0
climagrid/sources/usda_nrcs.py +204 -0
climagrid/sources/usfs_wfigs.py +201 -0
climagrid-0.1.0.dist-info/METADATA +254 -0
climagrid-0.1.0.dist-info/RECORD +29 -0
climagrid-0.1.0.dist-info/WHEEL +4 -0
climagrid-0.1.0.dist-info/entry_points.txt +2 -0
climagrid-0.1.0.dist-info/licenses/LICENSE +202 -0

climagrid/__init__.py ADDED Viewed

@@ -0,0 +1,17 @@
+"""
+climagrid — Climate data, grid-ready.
+Open-source Python toolkit that converts public NOAA, NASA, USDA, and
+U.S. Forest Service data into standardized environmental stress features
+for electric utility predictive maintenance systems.
+Designed for rural electric cooperatives and municipal utilities.
+License: Apache 2.0
+"""
+from climagrid.pipeline.orchestrator import run
+from climagrid.schema import schema_summary, validate_dataframe
+__version__ = "0.1.0"
+__all__ = ["run", "schema_summary", "validate_dataframe"]

climagrid/assets/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from climagrid.assets.joiner import AssetEnvironmentJoiner
+from climagrid.assets.registry import AssetRegistry
+__all__ = ["AssetRegistry", "AssetEnvironmentJoiner"]

climagrid/assets/joiner.py ADDED Viewed

@@ -0,0 +1,177 @@
+"""
+AssetEnvironmentJoiner — spatially joins environmental data to asset locations.
+For each asset in the registry, finds the nearest data point (grid cell or
+station) in the environmental DataFrame and extracts its time series.
+"""
+from __future__ import annotations
+import warnings
+import numpy as np
+import pandas as pd
+from scipy.spatial import cKDTree
+from climagrid.assets.registry import AssetRegistry
+class AssetEnvironmentJoiner:
+    """
+    Joins time-series environmental data to utility asset point locations.
+    Strategy: nearest-neighbor match in Euclidean lat/lon space (valid for
+    small regions, <500 km extents). For large extents consider haversine.
+    Distances are converted with a flat 1 degree ≈ 111 km factor, so they are
+    approximate (east-west separations are overestimated away from the equator).
+    Parameters
+    ----------
+    max_distance_km:
+        Reject matches farther than this distance. Points beyond this
+        threshold will have NaN environmental values. Default 100 km.
+    Example
+    -------
+    >>> registry = AssetRegistry("assets.csv")
+    >>> nasa = NasaPowerAdapter()
+    >>> env_df = nasa.fetch(bbox, start_dt, end_dt)
+    >>> joiner = AssetEnvironmentJoiner()
+    >>> result = joiner.join(registry, env_df)
+    >>> result.head()
+    """
+    def __init__(self, max_distance_km: float = 100.0):
+        self._max_distance_km = max_distance_km
+    def join(
+        self,
+        registry: AssetRegistry,
+        env_df: pd.DataFrame,
+        time_col: str = "timestamp",
+    ) -> pd.DataFrame:
+        """
+        Join environmental observations to each asset for every timestamp.
+        Parameters
+        ----------
+        registry:
+            AssetRegistry with asset locations. Only ``registry.assets`` is
+            read; it must provide 'asset_id', 'lat' and 'lon' columns.
+        env_df:
+            DataFrame returned by any adapter's fetch() method.
+            Must have 'lat', 'lon', and at least one timestamp.
+        time_col:
+            Name of the timestamp column in env_df. If the column is absent
+            the join still runs and the output simply has no time column.
+        Returns
+        -------
+        pd.DataFrame
+            One row per (asset_id, timestamp) with index columns and all
+            environmental columns present in env_df. The 'lat'/'lon' columns
+            hold the asset's own coordinates, not the matched grid point's.
+            If env_df is empty, a frame with only 'asset_id' is returned.
+        Raises
+        ------
+        ValueError
+            If env_df is non-empty but lacks a 'lat' or 'lon' column.
+        """
+        assets = registry.assets
+        if env_df.empty:
+            # Nothing to join against: hand back just the asset ids.
+            return pd.DataFrame(
+                {"asset_id": assets["asset_id"].values}
+            )
+        if "lat" not in env_df.columns or "lon" not in env_df.columns:
+            raise ValueError("env_df must contain 'lat' and 'lon' columns")
+        # Build KD-tree from unique environmental grid points
+        env_points = env_df[["lat", "lon"]].drop_duplicates().reset_index(drop=True)
+        tree = cKDTree(env_points[["lat", "lon"]].values)
+        asset_ids = assets["asset_id"].values
+        asset_lats = assets["lat"].values
+        asset_lons = assets["lon"].values
+        asset_coords = np.column_stack([asset_lats, asset_lons])
+        # Query nearest grid point for each asset; results are positional,
+        # i.e. element k belongs to the k-th row of `assets`.
+        distances_deg, indices = tree.query(asset_coords, k=1)
+        # Rough conversion: 1 degree ≈ 111 km
+        distances_km = distances_deg * 111.0
+        # Warn about far matches
+        too_far = distances_km > self._max_distance_km
+        if too_far.any():
+            n_far = too_far.sum()
+            warnings.warn(
+                f"{n_far} asset(s) are more than {self._max_distance_km} km "
+                "from any environmental data point. Those rows will have NaN values.",
+                UserWarning,
+                stacklevel=2,
+            )
+        # Map each asset to its nearest environmental grid point lat/lon
+        nearest_lats = env_points.loc[indices, "lat"].values
+        nearest_lons = env_points.loc[indices, "lon"].values
+        env_value_cols = [
+            c for c in env_df.columns
+            if c not in {"lat", "lon", time_col}
+        ]
+        slice_cols = env_value_cols + ([time_col] if time_col in env_df.columns else [])
+        # Many assets usually share a grid point, so each point's time series
+        # is extracted from env_df only once and copied per asset.
+        slice_cache: dict[int, pd.DataFrame] = {}
+        result_frames: list[pd.DataFrame] = []
+        # Iterate by position rather than by index label: labels may be
+        # non-contiguous (filtered registry) or even duplicated, and the
+        # KD-tree results above are aligned by position only.
+        asset_rows = zip(asset_ids, asset_lats, asset_lons)
+        for pos, (asset_id, asset_lat, asset_lon) in enumerate(asset_rows):
+            point_idx = int(indices[pos])
+            if point_idx not in slice_cache:
+                at_point = (
+                    (env_df["lat"] == nearest_lats[pos])
+                    & (env_df["lon"] == nearest_lons[pos])
+                )
+                slice_cache[point_idx] = env_df.loc[at_point, slice_cols]
+            env_slice = slice_cache[point_idx].copy()
+            env_slice["asset_id"] = asset_id
+            env_slice["lat"] = asset_lat
+            env_slice["lon"] = asset_lon
+            if too_far[pos]:
+                # Keep the timestamps but blank every environmental value.
+                for col in env_value_cols:
+                    env_slice[col] = float("nan")
+            result_frames.append(env_slice)
+        if not result_frames:
+            return pd.DataFrame()
+        result = pd.concat(result_frames, ignore_index=True)
+        # Reorder columns: asset_id, timestamp, lat, lon, then env columns
+        front_cols = ["asset_id"]
+        if time_col in result.columns:
+            front_cols.append(time_col)
+        front_cols += ["lat", "lon"]
+        remaining = [c for c in result.columns if c not in front_cols]
+        return result[front_cols + remaining].reset_index(drop=True)
+    def join_point(
+        self,
+        asset_lat: float,
+        asset_lon: float,
+        env_df: pd.DataFrame,
+        time_col: str = "timestamp",
+    ) -> pd.DataFrame:
+        """Convenience method: join env data for a single lat/lon point.
+        The point is written to a temporary one-row CSV with asset_id
+        "point" so it can be loaded through AssetRegistry, then joined with
+        :meth:`join`. The temporary file is always removed afterwards.
+        """
+        import os
+        import tempfile
+        tmp_data = pd.DataFrame([{
+            "asset_id": "point",
+            "lat": asset_lat,
+            "lon": asset_lon,
+        }])
+        # newline="" lets pandas control line endings (required by to_csv
+        # when it is handed an already-open text handle).
+        with tempfile.NamedTemporaryFile(
+            mode="w", suffix=".csv", delete=False, newline=""
+        ) as f:
+            tmp_data.to_csv(f, index=False)
+            tmp_path = f.name
+        try:
+            reg = AssetRegistry(tmp_path)
+            return self.join(reg, env_df, time_col)
+        finally:
+            os.unlink(tmp_path)

climagrid/assets/registry.py ADDED Viewed

@@ -0,0 +1,153 @@
+"""
+AssetRegistry — loads utility asset records from CSV or GeoJSON.
+Each asset must have at minimum: asset_id, lat, lon.
+Optional fields: asset_type, voltage_kv, install_year, manufacturer.
+"""
+from __future__ import annotations
+from pathlib import Path
+import geopandas as gpd
+import pandas as pd
+# Columns every asset file must provide; enforced by
+# AssetRegistry._check_required_columns.
+REQUIRED_COLUMNS = {"asset_id", "lat", "lon"}
+# Known asset_type labels.
+# NOTE(review): nothing in this module validates against this set — confirm
+# whether other modules rely on it before treating it as an enforced enum.
+ASSET_TYPE_VALUES = {
+    "transformer",
+    "circuit_breaker",
+    "transmission_line",
+    "distribution_line",
+    "substation",
+    "capacitor_bank",
+    "recloser",
+    "other",
+}
+class AssetRegistry:
+    """
+    Loads and validates a utility asset registry from CSV or GeoJSON.
+    Parameters
+    ----------
+    path:
+        Path to a CSV file (must have asset_id, lat, lon columns) or
+        a GeoJSON file (must have asset_id and Point geometry).
+    asset_type_filter:
+        If provided (and non-empty), only include assets of these types.
+        The file must then contain an ``asset_type`` column; otherwise the
+        column lookup raises ``KeyError``.
+    Raises
+    ------
+    ValueError
+        If the file extension is unsupported or required columns are missing.
+    Example
+    -------
+    >>> registry = AssetRegistry("my_coop_assets.csv")
+    >>> registry.assets.head()
+    """
+    def __init__(
+        self,
+        path: str | Path,
+        asset_type_filter: list[str] | None = None,
+    ):
+        self._path = Path(path)
+        self._gdf = self._load(self._path)
+        if asset_type_filter:
+            self._gdf = self._gdf[
+                self._gdf["asset_type"].isin(asset_type_filter)
+            ]
+    @property
+    def assets(self) -> gpd.GeoDataFrame:
+        """GeoDataFrame with one row per asset, CRS=EPSG:4326."""
+        return self._gdf
+    @property
+    def count(self) -> int:
+        """Number of assets currently held (after any type filter)."""
+        return len(self._gdf)
+    @property
+    def bounding_box(self) -> tuple[float, float, float, float]:
+        """(min_lat, max_lat, min_lon, max_lon) covering all assets."""
+        # total_bounds is x/y ordered, i.e. lon/lat; reorder to lat-first.
+        min_lon, min_lat, max_lon, max_lat = self._gdf.total_bounds
+        return min_lat, max_lat, min_lon, max_lon
+    def __len__(self) -> int:
+        return self.count
+    def __repr__(self) -> str:
+        return f"AssetRegistry(n={self.count}, path={self._path.name!r})"
+    # ------------------------------------------------------------------
+    # Internal loading
+    # ------------------------------------------------------------------
+    def _load(self, path: Path) -> gpd.GeoDataFrame:
+        """Dispatch to the CSV or GeoJSON loader based on the file suffix."""
+        suffix = path.suffix.lower()
+        if suffix == ".csv":
+            return self._load_csv(path)
+        if suffix in {".geojson", ".json"}:
+            return self._load_geojson(path)
+        raise ValueError(
+            f"Unsupported file type: {suffix!r}. "
+            "Use .csv or .geojson."
+        )
+    def _load_csv(self, path: Path) -> gpd.GeoDataFrame:
+        """Read a CSV, coerce lat/lon to numbers and build point geometry.
+        Rows whose lat or lon cannot be parsed as a number are dropped with
+        a UserWarning.
+        """
+        df = pd.read_csv(path, dtype={"asset_id": str})
+        self._check_required_columns(df, path)
+        df["lat"] = pd.to_numeric(df["lat"], errors="coerce")
+        df["lon"] = pd.to_numeric(df["lon"], errors="coerce")
+        n_before = len(df)
+        df = df.dropna(subset=["lat", "lon"])
+        if len(df) < n_before:
+            import warnings
+            # Frames: _load_csv (1) <- _load (2) <- __init__ (3) <- caller (4).
+            # Level 4 attributes the warning to the code that built the
+            # registry instead of to this class's own constructor.
+            warnings.warn(
+                f"Dropped {n_before - len(df)} rows with null lat/lon",
+                UserWarning,
+                stacklevel=4,
+            )
+        gdf = gpd.GeoDataFrame(
+            df,
+            geometry=gpd.points_from_xy(df["lon"], df["lat"]),
+            crs="EPSG:4326",
+        )
+        return gdf
+    def _load_geojson(self, path: Path) -> gpd.GeoDataFrame:
+        """Read a GeoJSON file, normalise it to EPSG:4326 and add lat/lon."""
+        gdf = gpd.read_file(path)
+        # Files without a CRS are assumed to already be EPSG:4326; anything
+        # else is reprojected.
+        gdf = gdf.set_crs("EPSG:4326") if gdf.crs is None else gdf.to_crs("EPSG:4326")
+        # Extract lat/lon from geometry if not present
+        if "lat" not in gdf.columns:
+            gdf["lat"] = gdf.geometry.y
+        if "lon" not in gdf.columns:
+            gdf["lon"] = gdf.geometry.x
+        df_check = pd.DataFrame(gdf.drop(columns="geometry"))
+        self._check_required_columns(df_check, path)
+        return gdf
+    @staticmethod
+    def _check_required_columns(df: pd.DataFrame, path: Path) -> None:
+        """Raise ValueError naming any REQUIRED_COLUMNS absent from df."""
+        missing = REQUIRED_COLUMNS - set(df.columns)
+        if missing:
+            raise ValueError(
+                f"Asset file {path.name!r} is missing required columns: "
+                f"{sorted(missing)}. "
+                f"Required: {sorted(REQUIRED_COLUMNS)}"
+            )
+def load_sample_assets() -> AssetRegistry:
+    """Return an AssetRegistry built from the bundled 50-asset sample CSV.
+    The file is looked up at ``examples/data/sample_assets.csv`` four
+    directory levels above this module; FileNotFoundError is raised when
+    it is not there.
+    """
+    root = Path(__file__)
+    for _ in range(4):
+        # Climb one directory level per iteration.
+        root = root.parent
+    sample_path = root / "examples" / "data" / "sample_assets.csv"
+    if sample_path.exists():
+        return AssetRegistry(sample_path)
+    raise FileNotFoundError(
+        f"Sample asset file not found at {sample_path}. "
+        "Has the repository been cloned fully?"
+    )

climagrid/cli.py ADDED Viewed

@@ -0,0 +1,166 @@
+"""
+Command-line interface for climagrid.
+Usage examples
+--------------
+  climagrid fetch --assets assets.csv --start 2024-07-01 --end 2024-07-08
+  climagrid fetch --assets assets.csv --start 2024-07-01 --end 2024-07-08 \\
+      --sources nasa_power,usfs_wfigs --output features.parquet
+  climagrid schema
+"""
+from __future__ import annotations
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+import click
+# Root command group of the CLI. It has no behaviour of its own; the
+# subcommands below attach to it via @main.command().
+# NOTE(review): click presumably renders the docstring as --help text, so
+# treat its wording as user-facing output.
+@click.group()
+@click.version_option(package_name="climagrid")
+def main() -> None:
+    """climagrid — climate data, grid-ready.
+    Fetch NOAA/NASA/USDA/USFS environmental data and compute
+    predictive-maintenance stress features for utility assets.
+    """
+@main.command()
+@click.option(
+    "--assets", "-a",
+    required=True,
+    type=click.Path(exists=True, path_type=Path),
+    help="Asset CSV or GeoJSON file (must have asset_id, lat, lon columns).",
+)
+@click.option(
+    "--start", "-s",
+    required=True,
+    metavar="YYYY-MM-DD",
+    help="Start date (UTC).",
+)
+@click.option(
+    "--end", "-e",
+    required=True,
+    metavar="YYYY-MM-DD",
+    help="End date (UTC, inclusive).",
+)
+@click.option(
+    "--sources",
+    default="nasa_power",
+    show_default=True,
+    help="Comma-separated data source names. "
+         "Valid: nasa_power, noaa_hrrr, noaa_ncei, usda_nrcs, usfs_wfigs.",
+)
+@click.option(
+    "--features",
+    default="all",
+    show_default=True,
+    help="Comma-separated feature names or 'all'. "
+         "Valid: thermal, conductor_sag, freeze_thaw, ice_loading, soil, wildfire.",
+)
+@click.option(
+    "--output", "-o",
+    default="climagrid_output.parquet",
+    show_default=True,
+    type=click.Path(path_type=Path),
+    help="Output file path (.parquet or .csv).",
+)
+@click.option(
+    "--long-form",
+    is_flag=True,
+    default=False,
+    help="Write long-form Parquet (feature_name, feature_value rows) instead of wide.",
+)
+@click.option(
+    "--bbox-radius",
+    default=50.0,
+    show_default=True,
+    metavar="KM",
+    help="Bounding box radius around asset centroid for data fetch.",
+)
+def fetch(
+    assets: Path,
+    start: str,
+    end: str,
+    sources: str,
+    features: str,
+    output: Path,
+    long_form: bool,
+    bbox_radius: float,
+) -> None:
+    """Fetch environmental data and compute stress features for utility assets."""
+    # Heavy imports are deferred so `climagrid --help` stays fast.
+    import climagrid
+    from climagrid.outputs import to_csv, to_long_parquet, to_parquet
+    # Parse both dates and stamp them as UTC (any offset in the input is
+    # overwritten, not converted).
+    try:
+        start_dt = datetime.fromisoformat(start).replace(tzinfo=timezone.utc)
+        end_dt = datetime.fromisoformat(end).replace(tzinfo=timezone.utc)
+    except ValueError as exc:
+        raise click.BadParameter(str(exc), param_hint="--start/--end") from exc
+    # Reject a reversed window up front instead of passing it to the sources.
+    if end_dt < start_dt:
+        raise click.BadParameter(
+            f"end date {end} is before start date {start}",
+            param_hint="--start/--end",
+        )
+    source_list = [s.strip() for s in sources.split(",") if s.strip()]
+    feature_list: list[str] | str = (
+        "all"
+        if features.strip().lower() == "all"
+        else [f.strip() for f in features.split(",") if f.strip()]
+    )
+    click.echo(f"Assets:   {assets}")
+    click.echo(f"Period:   {start} → {end}")
+    click.echo(f"Sources:  {', '.join(source_list)}")
+    click.echo(f"Features: {features}")
+    # Top-level CLI boundary: report any pipeline failure as a one-line
+    # error and exit non-zero rather than dumping a traceback.
+    try:
+        df = climagrid.run(
+            assets,
+            start_dt=start_dt,
+            end_dt=end_dt,
+            sources=source_list,
+            features=feature_list,
+            bbox_radius_km=bbox_radius,
+        )
+    except Exception as exc:
+        click.secho(f"Error: {exc}", fg="red", err=True)
+        sys.exit(1)
+    if df.empty:
+        click.secho("Warning: result is empty — check source availability.", fg="yellow")
+    suffix = output.suffix.lower()
+    # --long-form is only honoured for .parquet outputs; say so instead of
+    # silently writing the wide layout.
+    if long_form and suffix != ".parquet":
+        click.secho(
+            "Warning: --long-form only applies to .parquet output; writing wide format.",
+            fg="yellow",
+        )
+    if long_form and suffix == ".parquet":
+        out_path = to_long_parquet(df, output)
+        fmt = "long-form Parquet"
+    elif suffix == ".csv":
+        out_path = to_csv(df, output)
+        fmt = "CSV"
+    else:
+        # Any other suffix falls back to wide Parquet.
+        out_path = to_parquet(df, output)
+        fmt = "Parquet"
+    click.secho(
+        f"✓ {len(df):,} rows × {df.shape[1]} columns → {out_path} ({fmt})",
+        fg="green",
+    )
+@main.command()
+@click.option(
+    "--output", "-o",
+    default=None,
+    type=click.Path(path_type=Path),
+    help="Optional path to write schema JSON file.",
+)
+def schema(output: Path | None) -> None:
+    """Print the climagrid column schema."""
+    import climagrid
+    # Always print the tabular summary to stdout first.
+    click.echo(climagrid.schema_summary().to_string(index=False))
+    if not output:
+        return
+    # A path was given: additionally write the JSON schema file there.
+    from climagrid.outputs import to_json_schema
+    to_json_schema(output)
+    click.secho(f"✓ Schema written to {output}", fg="green")

climagrid/features/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+from climagrid.features.conductor_sag import ConductorSagIndex
+from climagrid.features.freeze_thaw import FreezeThawtCycleCounter
+from climagrid.features.ice_loading import IceLoadingRisk
+from climagrid.features.soil import SoilSaturationIndex
+from climagrid.features.thermal import ThermalStressIndex
+from climagrid.features.wildfire import WildfireProximityScore
+__all__ = [
+    "ThermalStressIndex",
+    "FreezeThawtCycleCounter",
+    "IceLoadingRisk",
+    "SoilSaturationIndex",
+    "WildfireProximityScore",
+    "ConductorSagIndex",
+]

climagrid/features/conductor_sag.py ADDED Viewed

@@ -0,0 +1,136 @@
+"""
+ConductorSagIndex — thermal sag estimation for overhead T&D lines.
+When conductor temperature rises, the aluminum/ACSR strands expand and
+the conductor sags downward, reducing ground clearance. Excessive sag
+causes regulatory violations and phase-to-ground faults.
+Thermal sag is governed by the IEEE 738-2012 (Standard for Calculating
+the Current-Temperature Relationship of Bare Overhead Conductors). This
+module implements a simplified version using ambient temperature and solar
+irradiance as primary inputs.
+The output is a normalized index [0, 1] representing sag relative to the
+maximum allowable sag (configurable), suitable as an ML feature.
+"""
+from __future__ import annotations
+import numpy as np
+import pandas as pd
+class ConductorSagIndex:
+    """
+    Computes normalized conductor thermal sag index from weather inputs.
+    Simplified IEEE 738-2012 heat balance:
+        T_conductor ≈ T_ambient + (I²R + Q_solar - Q_convective) / (thermal_capacity)
+    For stress *feature* purposes (not full current rating), we approximate
+    the conductor temperature as ambient + solar heating - convective cooling,
+    then compute sag as a fraction of the maximum design sag.
+    Resistive (I²R) heating and radiative cooling are not modelled here.
+    Parameters
+    ----------
+    temp_col:
+        Ambient temperature column in °C.
+    solar_col:
+        Global horizontal irradiance column in W/m².
+    wind_col:
+        Wind speed column in m/s. Wind is the primary cooling mechanism.
+    max_sag_temp_c:
+        Conductor temperature at which sag reaches the design maximum.
+        Default 75°C (typical for ACSR "Drake" conductor per IEEE 738).
+    conductor_absorptivity:
+        Solar absorptivity of the conductor surface (0–1). Default 0.5.
+    conductor_emissivity:
+        Emissivity for radiated cooling (0–1). Default 0.5. Stored but not
+        used by the current simplified model.
+    conductor_diameter_mm:
+        Conductor outer diameter for convective heat loss. Default 28.1 mm (Drake ACSR).
+    baseline_temp_c:
+        Conductor temperature that maps to an index of 0 (design reference).
+        Default 25°C.
+    Raises
+    ------
+    ValueError
+        If ``max_sag_temp_c`` is not greater than ``baseline_temp_c`` or
+        ``conductor_diameter_mm`` is not positive.
+    Example
+    -------
+    >>> csi = ConductorSagIndex()
+    >>> df = csi.compute(env_df)
+    >>> df["feat_conductor_sag_index"]
+    """
+    # Stefan-Boltzmann constant W/(m²·K⁴)
+    _SIGMA = 5.6704e-8
+    def __init__(
+        self,
+        temp_col: str = "hrrr_temperature_2m",
+        solar_col: str = "hrrr_solar_irradiance_ghi",
+        wind_col: str = "hrrr_wind_speed_10m",
+        max_sag_temp_c: float = 75.0,
+        conductor_absorptivity: float = 0.5,
+        conductor_emissivity: float = 0.5,
+        conductor_diameter_mm: float = 28.1,
+        baseline_temp_c: float = 25.0,
+    ):
+        # Both checks guard divisions in compute(); without them the index
+        # silently degrades to inf/NaN or an inverted scale.
+        if max_sag_temp_c <= baseline_temp_c:
+            raise ValueError(
+                f"max_sag_temp_c ({max_sag_temp_c}) must be greater than "
+                f"baseline_temp_c ({baseline_temp_c})"
+            )
+        if conductor_diameter_mm <= 0:
+            raise ValueError(
+                f"conductor_diameter_mm must be positive, got {conductor_diameter_mm}"
+            )
+        self._temp_col = temp_col
+        self._solar_col = solar_col
+        self._wind_col = wind_col
+        self._max_sag_temp_c = max_sag_temp_c
+        self._baseline_temp_c = baseline_temp_c
+        self._alpha = conductor_absorptivity
+        self._eps = conductor_emissivity
+        self._d = conductor_diameter_mm / 1000.0  # convert to metres
+    def compute(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Add feat_conductor_sag_index column [0, 1]. Returns a copy.
+        Each input is read from its configured column, falling back to the
+        NASA/NCEI equivalents. With no temperature column at all the feature
+        is NaN; missing solar or wind data are replaced by fixed defaults.
+        """
+        df = df.copy()
+        temp = self._resolve_col(df, self._temp_col, ["nasa_temperature_2m", "ncei_temperature_max"])
+        solar = self._resolve_col(df, self._solar_col, ["nasa_solar_irradiance_ghi"])
+        wind = self._resolve_col(df, self._wind_col, ["nasa_wind_speed_10m", "ncei_wind_speed"])
+        if temp is None:
+            df["feat_conductor_sag_index"] = float("nan")
+            return df
+        # Defaults when data not available
+        if solar is None:
+            solar = np.full_like(temp, 400.0)   # moderate irradiance
+        if wind is None:
+            wind = np.full_like(temp, 1.0)       # near-calm (conservative)
+        wind = np.maximum(wind, 0.5)  # prevent divide-by-zero in convection
+        # Solar heat gain per unit length (W/m)
+        q_solar = self._alpha * solar * self._d
+        # Convective cooling (simplified Morgan formula per IEEE 738 Eq. 3a)
+        # q_conv = (1.01 + 1.35 * Re^0.52) * k_f * (T_c - T_a)
+        # Simplified for feature purposes: linear approximation
+        # Full implementation would require air density, viscosity, thermal conductivity
+        # We use: q_conv ≈ h_c * d * delta_T where h_c ≈ 10 * sqrt(wind) W/(m²·K) (typical)
+        h_c = 10.0 * np.sqrt(wind)
+        q_conv_per_k = h_c * self._d  # W/(m·K) per unit temperature rise
+        # Steady-state conductor temperature rise above ambient (°C)
+        # Radiation delta omitted for simplicity (dominated by convection at typical loadings)
+        # The clamp at zero also neutralises negative irradiance readings.
+        delta_t = np.maximum(q_solar / q_conv_per_k, 0.0)
+        t_conductor = temp + delta_t
+        # Sag index: linear position of the conductor temperature between the
+        # baseline (index 0) and the max allowable temperature (index 1),
+        # clamped to [0, 1].
+        # (IEEE 738 Table B.1 confirms near-linear relationship for typical conductors)
+        span = self._max_sag_temp_c - self._baseline_temp_c
+        sag_index = (t_conductor - self._baseline_temp_c) / span
+        df["feat_conductor_sag_index"] = np.clip(sag_index, 0.0, 1.0)
+        return df
+    @property
+    def _sigma_val(self) -> float:
+        """Stefan-Boltzmann constant used by this class (W/(m²·K⁴))."""
+        return self._SIGMA
+    @staticmethod
+    def _resolve_col(
+        df: pd.DataFrame, primary: str, fallbacks: list[str]
+    ) -> np.ndarray | None:
+        """Return the first available column as a float array, else None.
+        ``primary`` is tried first, then each name in ``fallbacks`` in order.
+        """
+        for name in [primary, *fallbacks]:
+            if name in df.columns:
+                return df[name].values.astype(float)
+        return None