metdatapy-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metdatapy/__init__.py +23 -0
- metdatapy/cli.py +314 -0
- metdatapy/core.py +220 -0
- metdatapy/derive.py +393 -0
- metdatapy/io.py +350 -0
- metdatapy/manifest.py +345 -0
- metdatapy/mapper.py +214 -0
- metdatapy/mlprep.py +306 -0
- metdatapy/qc.py +318 -0
- metdatapy/units.py +53 -0
- metdatapy/utils.py +61 -0
- metdatapy-1.0.0.dist-info/METADATA +285 -0
- metdatapy-1.0.0.dist-info/RECORD +17 -0
- metdatapy-1.0.0.dist-info/WHEEL +5 -0
- metdatapy-1.0.0.dist-info/entry_points.txt +2 -0
- metdatapy-1.0.0.dist-info/licenses/LICENSE +23 -0
- metdatapy-1.0.0.dist-info/top_level.txt +1 -0
metdatapy/__init__.py
ADDED
@@ -0,0 +1,23 @@
"""MetDataPy package init."""

from .core import WeatherSet
from .mapper import Mapper, Detector
from .mlprep import make_supervised, time_split, fit_scaler, apply_scaler
from .qc import qc_range, qc_spike, qc_flatline, qc_consistency

__all__ = [
    "WeatherSet",
    "Mapper",
    "Detector",
    "make_supervised",
    "time_split",
    "fit_scaler",
    "apply_scaler",
    "qc_range",
    "qc_spike",
    "qc_flatline",
    "qc_consistency",
]

__version__ = "1.0.0"

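The package-level exports above expose both the fluent WeatherSet class and standalone QC helpers that operate on plain DataFrames. A minimal sketch of the DataFrame-level helpers, assuming an hourly frame with canonical column names; the keyword defaults mirror how the `qc run` command in cli.py below calls `qc_spike` and `qc_flatline`, and the sample values are made up:

# Usage sketch (illustrative, not part of the wheel contents)
import pandas as pd
from metdatapy import qc_range, qc_spike, qc_flatline

idx = pd.date_range("2024-01-01", periods=6, freq="h", tz="UTC")
df = pd.DataFrame({"temp_c": [1.0, 1.2, 25.0, 1.1, 1.1, 1.1],
                   "rh_pct": [80, 81, 82, 83, 83, 83]}, index=idx)

df = qc_range(df)                        # physical-range flags
df = qc_spike(df, window=9, thresh=6.0)  # rolling-window spike flags
df = qc_flatline(df, window=5, tol=0.0)  # stuck-sensor flags
print([c for c in df.columns if c.startswith("qc_")])
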
metdatapy/cli.py
ADDED
@@ -0,0 +1,314 @@
import json
from pathlib import Path
from typing import List, Optional

import click
import pandas as pd

from .mapper import Detector, Mapper
from .core import WeatherSet
from .io import to_parquet


@click.group()
def main():
    """MetDataPy command-line interface."""
    pass


@main.group()
def ingest():
    """Ingestion helpers."""
    pass


@ingest.command("detect")
@click.option("--csv", "csv_path", required=True, type=click.Path(exists=True, dir_okay=False))
@click.option("--save", "save_path", required=False, type=click.Path(dir_okay=False))
@click.option("--yes", is_flag=True, help="Accept detected mapping without interactive editing")
def ingest_detect(csv_path: str, save_path: Optional[str], yes: bool):
    det = Detector()
    # Read a sample for column choices
    df_head = pd.read_csv(csv_path, nrows=200)
    mapping = det.detect(df_head)

    if not yes:
        mapping = _interactive_mapping_wizard(mapping, list(df_head.columns))

    click.echo(json.dumps(mapping, indent=2))
    if save_path:
        Mapper.save(mapping, save_path)
        click.echo(f"Saved mapping to {save_path}")


def _interactive_mapping_wizard(mapping: dict, columns: List[str]) -> dict:
    """Interactive confirm/edit flow for detected mapping."""
    from .mapper import CANONICAL_FIELDS

    click.echo("Interactive mapping wizard (press Enter to accept defaults). Type 'none' to unset.")

    # Timestamp column
    ts_current = (mapping.get("ts") or {}).get("col")
    col_choices = [str(c) for c in columns]
    if ts_current is None:
        ts_current = col_choices[0] if col_choices else None
    ts_selected = click.prompt(
        "Timestamp column",
        default=ts_current or "",
        show_default=True,
    ).strip()
    if ts_selected.lower() == "none" or ts_selected == "":
        mapping["ts"] = {"col": None}
    else:
        mapping["ts"] = {"col": ts_selected}

    # Ensure fields dict exists
    if "fields" not in mapping or mapping["fields"] is None:
        mapping["fields"] = {}

    # Loop over canonical fields (union with detected keys)
    canonical_all = list({*CANONICAL_FIELDS, *mapping["fields"].keys()})
    for canon in canonical_all:
        current = mapping["fields"].get(canon, {})
        cur_col = current.get("col") or ""
        cur_unit = (current.get("unit") or "")
        conf = current.get("confidence")
        if conf is not None:
            click.echo(f"\n{canon}: (confidence={conf})")
        else:
            click.echo(f"\n{canon}:")
        new_col = click.prompt(
            f"  Source column for {canon}",
            default=cur_col,
            show_default=True,
        ).strip()
        if new_col.lower() == "none":
            if canon in mapping["fields"]:
                del mapping["fields"][canon]
            continue
        if new_col:
            # Ask for unit if applicable
            new_unit = click.prompt(
                f"  Unit for {canon} (e.g., C, F, m/s, km/h, hpa, mm)",
                default=cur_unit,
                show_default=True,
            ).strip()
            entry = {"col": new_col}
            if new_unit:
                entry["unit"] = new_unit
            # Preserve confidence if present
            if conf is not None:
                entry["confidence"] = conf
            mapping["fields"][canon] = entry

    return mapping


@ingest.command("apply")
@click.option("--csv", "csv_path", required=True, type=click.Path(exists=True, dir_okay=False))
@click.option("--map", "map_path", required=True, type=click.Path(exists=True, dir_okay=False))
@click.option("--out", "out_path", required=True, type=click.Path(dir_okay=False))
def ingest_apply(csv_path: str, map_path: str, out_path: str):
    mapping = Mapper.load(map_path)
    df = pd.read_csv(csv_path)
    ws = WeatherSet.from_mapping(df, mapping).to_utc().normalize_units(mapping)
    to_parquet(ws.to_dataframe(), out_path)
    click.echo(f"Wrote {out_path}")

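The `detect` and `apply` commands above form the ingest round trip: detect a column mapping, optionally edit it in the wizard, then apply it to produce a canonical Parquet file. A rough sketch driving them through click's test runner, assuming the package is installed; the file names here are hypothetical:

# Usage sketch (illustrative, not part of the wheel contents)
from click.testing import CliRunner
from metdatapy.cli import main

runner = CliRunner()
# Detect a column mapping non-interactively and save it
result = runner.invoke(main, ["ingest", "detect", "--csv", "station.csv", "--yes", "--save", "mapping.json"])
print(result.output)
# Apply the mapping: canonical columns, UTC index, normalized units, Parquet output
result = runner.invoke(main, ["ingest", "apply", "--csv", "station.csv", "--map", "mapping.json", "--out", "station.parquet"])
print(result.output)
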
@main.group()
def qc():
    """Quality control commands."""
    pass


@qc.command("run")
@click.option("--in", "in_path", required=True, type=click.Path(exists=True, dir_okay=False))
@click.option("--out", "out_path", required=True, type=click.Path(dir_okay=False))
@click.option("--report", "report_path", required=False, type=click.Path(dir_okay=False))
@click.option("--config", "config_path", required=False, type=click.Path(exists=True, dir_okay=False), help="YAML/JSON thresholds for QC")
def qc_run(in_path: str, out_path: str, report_path: Optional[str], config_path: Optional[str]):
    df = pd.read_parquet(in_path)
    ws = WeatherSet(df)
    cfg = None
    if config_path:
        text = Path(config_path).read_text(encoding="utf-8")
        try:
            import yaml as _yaml
            cfg = _yaml.safe_load(text)
        except Exception:
            try:
                cfg = json.loads(text)
            except Exception:
                cfg = None
    ws = ws.qc_range()
    from .qc import qc_spike as _sp, qc_flatline as _fl
    sp = cfg.get("spike", {}) if isinstance(cfg, dict) else {}
    fl = cfg.get("flatline", {}) if isinstance(cfg, dict) else {}
    ws.df = _sp(ws.df, window=int(sp.get("window", 9)), thresh=float(sp.get("thresh", 6.0)))
    ws.df = _fl(ws.df, window=int(fl.get("window", 5)), tol=float(fl.get("tol", 0.0)))
    ws = ws.qc_consistency()
    out_df = ws.to_dataframe()
    out_df.to_parquet(out_path)
    click.echo(f"Wrote {out_path}")
    if report_path:
        report = {}
        for col in out_df.columns:
            if col.startswith("qc_"):
                report[col] = int(out_df[col].fillna(False).sum())
        Path(report_path).write_text(json.dumps(report, indent=2), encoding="utf-8")
        click.echo(f"Saved report to {report_path}")

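The optional `--config` file for `qc run` is parsed as YAML first and falls back to JSON; only the `spike` and `flatline` blocks are read, with the defaults shown in the code above. A minimal sketch that writes such a config and runs the command (paths and threshold values are illustrative):

# Usage sketch (illustrative, not part of the wheel contents)
import json
from click.testing import CliRunner
from metdatapy.cli import main

qc_config = {
    "spike": {"window": 9, "thresh": 6.0},   # rolling window and spike threshold
    "flatline": {"window": 5, "tol": 0.0},   # repeated-value window and tolerance
}
with open("qc.json", "w", encoding="utf-8") as fh:
    json.dump(qc_config, fh, indent=2)

CliRunner().invoke(main, ["qc", "run", "--in", "station.parquet", "--out", "clean.parquet",
                          "--config", "qc.json", "--report", "qc_report.json"])
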
@ingest.command("template")
@click.option("--out", "out_path", required=False, type=click.Path(dir_okay=False))
@click.option("--minimal", is_flag=True, help="Exclude optional fields from template")
def ingest_template(out_path: Optional[str], minimal: bool):
    from .mapper import Mapper
    tpl = Mapper.template(include_optional=not minimal)
    s = json.dumps(tpl, indent=2)
    if out_path:
        Path(out_path).write_text(s, encoding="utf-8")
        click.echo(f"Wrote mapping template to {out_path}")
    else:
        click.echo(s)

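The exact template emitted by `Mapper.template` is defined in mapper.py (not shown in this diff), but from the wizard and `WeatherSet.from_mapping` the mapping is a `ts` block plus per-field `col`/`unit` entries. A hypothetical filled-in example; the source column names are invented and the unit strings follow `UNIT_CONVERTERS` in core.py:

# Illustrative mapping shape (hypothetical source columns; see mapper.py for the real schema)
mapping = {
    "ts": {"col": "Date_Time"},
    "fields": {
        "temp_c":   {"col": "OutdoorTemp", "unit": "F"},
        "rh_pct":   {"col": "Humidity"},
        "wspd_ms":  {"col": "WindSpeed", "unit": "km/h"},
        "pres_hpa": {"col": "Pressure", "unit": "hpa"},
        "rain_mm":  {"col": "RainTotal", "unit": "mm"},
    },
}
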
@main.group()
def manifest():
    """Manifest and reproducibility commands."""
    pass


@manifest.command("validate")
@click.argument("manifest_path", type=click.Path(exists=True, dir_okay=False))
@click.option("--verbose", "-v", is_flag=True, help="Show detailed validation results")
def manifest_validate(manifest_path: str, verbose: bool):
    """Validate a manifest.json file."""
    from .manifest import validate_manifest

    click.echo(f"Validating manifest: {manifest_path}")
    results = validate_manifest(manifest_path)

    if results["valid"]:
        click.secho("✓ Manifest is valid", fg="green", bold=True)

        if verbose:
            click.echo(f"\nManifest Details:")
            click.echo(f"  Version: {results['version']}")
            click.echo(f"  MetDataPy Version: {results['metdatapy_version']}")
            click.echo(f"  Pipeline Steps: {results['pipeline_steps']}")
            click.echo(f"  Pipeline Hash: {results['pipeline_hash']}")
            click.echo(f"  Has QC Report: {results['has_qc_report']}")
            click.echo(f"  Has Scaler: {results['has_scaler']}")
            click.echo(f"  Has Split: {results['has_split']}")

        if results.get("warnings"):
            click.echo(f"\nWarnings:")
            for warning in results["warnings"]:
                click.secho(f"  ⚠ {warning}", fg="yellow")
    else:
        click.secho("✗ Manifest is invalid", fg="red", bold=True)
        if results.get("errors"):
            click.echo(f"\nErrors:")
            for error in results["errors"]:
                click.secho(f"  ✗ {error}", fg="red")
        raise click.Abort()


@manifest.command("show")
@click.argument("manifest_path", type=click.Path(exists=True, dir_okay=False))
@click.option("--format", "output_format", type=click.Choice(["json", "yaml", "summary"]), default="summary")
def manifest_show(manifest_path: str, output_format: str):
    """Display manifest contents."""
    from .manifest import Manifest

    m = Manifest.from_json(manifest_path)

    if output_format == "json":
        click.echo(json.dumps(m.model_dump(), indent=2))
    elif output_format == "yaml":
        import yaml
        click.echo(yaml.dump(m.model_dump(), sort_keys=False))
    else:  # summary
        click.echo(f"Manifest Summary")
        click.echo(f"{'=' * 60}")
        click.echo(f"Created: {m.created_at}")
        click.echo(f"MetDataPy Version: {m.metdatapy_version}")
        click.echo(f"Pipeline Hash: {m.pipeline_hash}")

        click.echo(f"\nDataset:")
        click.echo(f"  Source: {m.dataset.source}")
        click.echo(f"  Rows: {m.dataset.rows:,}")
        click.echo(f"  Columns: {len(m.dataset.columns)}")
        click.echo(f"  Time Range: {m.dataset.start_time} to {m.dataset.end_time}")
        if m.dataset.frequency:
            click.echo(f"  Frequency: {m.dataset.frequency}")

        click.echo(f"\nPipeline Steps ({len(m.pipeline_steps)}):")
        for i, step in enumerate(m.pipeline_steps, 1):
            duration = f" ({step.duration_seconds:.2f}s)" if step.duration_seconds else ""
            click.echo(f"  {i}. {step.function}{duration}")

        click.echo(f"\nFeatures:")
        click.echo(f"  Original: {len(m.features.original_features)}")
        click.echo(f"  Derived: {len(m.features.derived_features)}")
        click.echo(f"  Lag: {len(m.features.lag_features)}")
        click.echo(f"  Calendar: {len(m.features.calendar_features)}")
        click.echo(f"  Target: {len(m.features.target_features)}")

        if m.qc_report:
            click.echo(f"\nQuality Control:")
            click.echo(f"  Total Flags: {m.qc_report.total_flags:,}")
            click.echo(f"  Flagged: {m.qc_report.flagged_percentage:.2f}%")
            if m.qc_report.flags_by_type:
                click.echo(f"  By Type:")
                for flag_type, count in sorted(m.qc_report.flags_by_type.items()):
                    click.echo(f"    {flag_type}: {count:,}")

        if m.scaler:
            click.echo(f"\nScaler:")
            click.echo(f"  Method: {m.scaler.method}")
            click.echo(f"  Columns: {len(m.scaler.columns)}")

        if m.split:
            click.echo(f"\nSplit Boundaries:")
            click.echo(f"  Train: {m.split.train_start} to {m.split.train_end}")
            if m.split.val_start:
                click.echo(f"  Val: {m.split.val_start} to {m.split.val_end}")
            if m.split.test_start:
                click.echo(f"  Test: {m.split.test_start} to {m.split.test_end}")


@manifest.command("compare")
@click.argument("manifest1", type=click.Path(exists=True, dir_okay=False))
@click.argument("manifest2", type=click.Path(exists=True, dir_okay=False))
def manifest_compare(manifest1: str, manifest2: str):
    """Compare two manifests for reproducibility."""
    from .manifest import Manifest

    m1 = Manifest.from_json(manifest1)
    m2 = Manifest.from_json(manifest2)

    results = m1.validate_reproducibility(m2)

    click.echo(f"Comparing Manifests")
    click.echo(f"{'=' * 60}")
    click.echo(f"Manifest 1: {manifest1}")
    click.echo(f"Manifest 2: {manifest2}")
    click.echo()

    all_match = all(results.values())

    for check, passed in results.items():
        status = "✓" if passed else "✗"
        color = "green" if passed else "red"
        click.secho(f"{status} {check.replace('_', ' ').title()}", fg=color)

    click.echo()
    if all_match:
        click.secho("✓ Manifests are compatible for reproducibility", fg="green", bold=True)
    else:
        click.secho("✗ Manifests differ - results may not be reproducible", fg="yellow", bold=True)

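The manifest group can likewise be exercised programmatically; a short sketch through click's runner (the manifest paths are hypothetical):

# Usage sketch (illustrative, not part of the wheel contents)
from click.testing import CliRunner
from metdatapy.cli import main

runner = CliRunner()
runner.invoke(main, ["manifest", "validate", "run_a/manifest.json", "--verbose"])
runner.invoke(main, ["manifest", "show", "run_a/manifest.json", "--format", "summary"])
runner.invoke(main, ["manifest", "compare", "run_a/manifest.json", "run_b/manifest.json"])
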
metdatapy/core.py
ADDED
@@ -0,0 +1,220 @@
from __future__ import annotations

from typing import Dict, Optional

import pandas as pd

from .utils import CANONICAL_INDEX, CANONICAL_VARS, ensure_datetime_utc
from .units import (
    fahrenheit_to_c,
    identity,
    mph_to_ms,
    kmh_to_ms,
    mbar_to_hpa,
    pa_to_hpa,
)
from .qc import qc_range
from .derive import dew_point_c, vpd_kpa


UNIT_CONVERTERS = {
    "temp_c": {"F": fahrenheit_to_c, "C": identity},
    "wspd_ms": {"mph": mph_to_ms, "km/h": kmh_to_ms, "m/s": identity},
    "gust_ms": {"mph": mph_to_ms, "km/h": kmh_to_ms, "m/s": identity},
    "pres_hpa": {"mbar": mbar_to_hpa, "hpa": identity, "pa": pa_to_hpa},
    "rain_mm": {"mm": identity, "inch": lambda x: x * 25.4},
}


class WeatherSet:
    def __init__(self, df: pd.DataFrame):
        self.df = df

    @classmethod
    def from_csv(cls, path: str, mapping: Dict) -> "WeatherSet":
        df = pd.read_csv(path)
        return cls.from_mapping(df, mapping)

    @classmethod
    def from_mapping(cls, df: pd.DataFrame, mapping: Dict) -> "WeatherSet":
        ts_col = mapping.get("ts", {}).get("col")
        if ts_col is None or ts_col not in df.columns:
            raise ValueError("Timestamp column not found in mapping or data")
        idx = ensure_datetime_utc(df[ts_col])
        df = df.copy()
        df.index = idx
        df.index.name = CANONICAL_INDEX

        out = pd.DataFrame(index=df.index)
        fields = mapping.get("fields", {})
        for canon, cfg in fields.items():
            if canon not in CANONICAL_VARS:
                continue
            src = cfg.get("col")
            if src not in df.columns:
                continue
            out[canon] = df[src]
        return cls(out)

    def to_utc(self) -> "WeatherSet":
        if self.df.index.tz is None:
            self.df.index = self.df.index.tz_localize("UTC")
        else:
            self.df.index = self.df.index.tz_convert("UTC")
        return self

    def normalize_units(self, mapping: Dict) -> "WeatherSet":
        fields = mapping.get("fields", {})
        for var, cfg in fields.items():
            if var not in self.df.columns:
                continue
            unit = (cfg or {}).get("unit")
            if unit is None:
                continue
            convs = UNIT_CONVERTERS.get(var)
            if not convs:
                continue
            func = convs.get(unit)
            if func is None:
                continue
            self.df[var] = func(self.df[var].astype(float))
        return self

    def insert_missing(self, frequency: Optional[str] = None) -> "WeatherSet":
        freq = frequency or pd.infer_freq(self.df.index)
        if freq is None:
            return self
        # Normalize deprecated frequency aliases (H->h, T->min, etc.)
        if freq == 'H':
            freq = 'h'
        elif freq and freq.endswith('H') and freq[:-1].isdigit():
            freq = freq[:-1] + 'h'
        full = pd.date_range(self.df.index.min(), self.df.index.max(), freq=freq, tz="UTC")
        before = self.df.index
        self.df = self.df.reindex(full)
        self.df.index.name = CANONICAL_INDEX
        # Mark gaps: True where index not in original
        gap_mask = ~self.df.index.isin(before)
        if "gap" in self.df.columns:
            self.df["gap"] = self.df["gap"].fillna(gap_mask)
        else:
            self.df["gap"] = gap_mask
        return self

    def fix_accum_rain(self) -> "WeatherSet":
        if "rain_mm" not in self.df.columns:
            return self
        s = self.df["rain_mm"].astype(float)
        ds = s.diff()
        # If negative diff, assume counter reset: use current value as new accumulation for that step
        reset_idx = ds[ds < 0].index
        ds.loc[reset_idx] = s.loc[reset_idx]
        # Negative tiny noise -> clamp to 0
        ds = ds.clip(lower=0.0)
        self.df["rain_mm"] = ds.fillna(0.0)
        return self

    def resample(self, rule: str, agg: Optional[dict] = None) -> "WeatherSet":
        agg = agg or {
            "temp_c": "mean",
            "rh_pct": "mean",
            "pres_hpa": "mean",
            "wspd_ms": "mean",
            "wdir_deg": "mean",
            "gust_ms": "max",
            "rain_mm": "sum",
            "solar_wm2": "mean",
            "uv_index": "max",
        }
        # Normalize frequency strings to use lowercase (pandas 2.0+ requirement)
        # Replace deprecated uppercase 'H' with lowercase 'h' for hours
        rule = rule.replace('H', 'h')
        # Filter aggregation dict to only include columns that exist
        agg = {k: v for k, v in agg.items() if k in self.df.columns}
        grouped = self.df.resample(rule)
        out = grouped.agg(agg) if agg else pd.DataFrame(index=grouped.groups.keys())
        # Propagate gap as True if any gap in period
        if "gap" in self.df.columns:
            out["gap"] = grouped["gap"].max()
        # Propagate qc_* flags as any over window
        for col in self.df.columns:
            if isinstance(col, str) and col.startswith("qc_"):
                try:
                    out[col] = grouped[col].max()
                except Exception:
                    pass
        self.df = out
        self.df.index = self.df.index.tz_convert("UTC") if self.df.index.tz is not None else self.df.index.tz_localize("UTC")
        self.df.index.name = CANONICAL_INDEX
        return self

    def calendar_features(self, cyclical: bool = True) -> "WeatherSet":
        idx = self.df.index.tz_convert("UTC") if self.df.index.tz is not None else self.df.index.tz_localize("UTC")
        self.df["hour"] = idx.hour
        self.df["weekday"] = idx.weekday
        self.df["month"] = idx.month
        if cyclical:
            import numpy as np
            self.df["hour_sin"] = np.sin(2 * np.pi * self.df["hour"] / 24.0)
            self.df["hour_cos"] = np.cos(2 * np.pi * self.df["hour"] / 24.0)
            self.df["doy"] = idx.dayofyear
            self.df["doy_sin"] = np.sin(2 * np.pi * self.df["doy"] / 365.25)
            self.df["doy_cos"] = np.cos(2 * np.pi * self.df["doy"] / 365.25)
        return self

    def add_exogenous(self, exo: pd.DataFrame, how: str = "left") -> "WeatherSet":
        # exo should have time index in UTC or tz-aware
        if exo.index.tz is None:
            exo.index = exo.index.tz_localize("UTC")
        else:
            exo.index = exo.index.tz_convert("UTC")
        self.df = self.df.join(exo, how=how)
        return self

    def qc_range(self) -> "WeatherSet":
        self.df = qc_range(self.df)
        return self

    def qc_spike(self) -> "WeatherSet":
        from .qc import qc_spike
        self.df = qc_spike(self.df)
        return self

    def qc_flatline(self) -> "WeatherSet":
        from .qc import qc_flatline
        self.df = qc_flatline(self.df)
        return self

    def qc_consistency(self) -> "WeatherSet":
        from .qc import qc_consistency, qc_any
        self.df = qc_consistency(self.df)
        self.df = qc_any(self.df)
        return self

    def to_dataframe(self) -> pd.DataFrame:
        return self.df

    def to_netcdf(
        self,
        path: str,
        metadata: Optional[Dict] = None,
        station_metadata: Optional[Dict] = None,
    ) -> None:
        """Export to CF-compliant NetCDF4 file."""
        from .io import to_netcdf
        to_netcdf(self.df, path, metadata, station_metadata)

    def derive(self, metrics: list[str]) -> "WeatherSet":
        if "dew_point" in metrics and {"temp_c", "rh_pct"}.issubset(self.df.columns):
            self.df["dew_point_c"] = dew_point_c(self.df["temp_c"], self.df["rh_pct"]).astype(float)
        if "vpd" in metrics and {"temp_c", "rh_pct"}.issubset(self.df.columns):
            self.df["vpd_kpa"] = vpd_kpa(self.df["temp_c"], self.df["rh_pct"]).astype(float)
        if "heat_index" in metrics and {"temp_c", "rh_pct"}.issubset(self.df.columns):
            from .derive import heat_index_c
            self.df["heat_index_c"] = heat_index_c(self.df["temp_c"], self.df["rh_pct"]).astype(float)
        if "wind_chill" in metrics and {"temp_c", "wspd_ms"}.issubset(self.df.columns):
            from .derive import wind_chill_c
            self.df["wind_chill_c"] = wind_chill_c(self.df["temp_c"], self.df["wspd_ms"]).astype(float)
        return self

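Taken together, the WeatherSet methods above chain into a single preparation pipeline. A rough end-to-end sketch, assuming a raw CSV and a mapping dict like the hypothetical one shown after `ingest template`; the file name is invented:

# Usage sketch (illustrative, not part of the wheel contents)
from metdatapy.core import WeatherSet

mapping = {
    "ts": {"col": "Date_Time"},
    "fields": {"temp_c": {"col": "OutdoorTemp", "unit": "F"},
               "rh_pct": {"col": "Humidity"},
               "rain_mm": {"col": "RainTotal", "unit": "mm"}},
}

ws = (
    WeatherSet.from_csv("station.csv", mapping)  # map raw columns to canonical names
    .to_utc()                                    # force a UTC DatetimeIndex
    .normalize_units(mapping)                    # e.g. F -> C via UNIT_CONVERTERS
    .insert_missing("h")                         # reindex to a full hourly grid, mark gaps
    .fix_accum_rain()                            # accumulated counter -> per-step totals
    .qc_range()                                  # add qc_* range flags
    .resample("1h")                              # mean/max/sum aggregation per variable
    .derive(["dew_point", "vpd"])                # add dew_point_c / vpd_kpa
    .calendar_features()                         # hour/weekday/month plus cyclical encodings
)
df = ws.to_dataframe()
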