pycontrails-0.47.3-cp311-cp311-win_amd64.whl → pycontrails-0.48.1-cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycontrails/__init__.py +2 -2
- pycontrails/_version.py +2 -2
- pycontrails/core/coordinates.py +17 -10
- pycontrails/core/datalib.py +155 -113
- pycontrails/core/flight.py +45 -28
- pycontrails/core/met.py +163 -39
- pycontrails/core/met_var.py +9 -9
- pycontrails/core/models.py +27 -0
- pycontrails/core/rgi_cython.cp311-win_amd64.pyd +0 -0
- pycontrails/core/vector.py +257 -33
- pycontrails/datalib/ecmwf/common.py +14 -65
- pycontrails/datalib/ecmwf/era5.py +22 -27
- pycontrails/datalib/ecmwf/hres.py +53 -88
- pycontrails/datalib/ecmwf/ifs.py +10 -2
- pycontrails/datalib/gfs/gfs.py +68 -106
- pycontrails/models/accf.py +181 -154
- pycontrails/models/cocip/cocip.py +205 -105
- pycontrails/models/cocip/cocip_params.py +0 -4
- pycontrails/models/cocip/wake_vortex.py +9 -7
- pycontrails/models/cocipgrid/cocip_grid.py +2 -6
- pycontrails/models/issr.py +29 -31
- pycontrails/models/pcr.py +5 -12
- pycontrails/models/sac.py +24 -27
- pycontrails/models/tau_cirrus.py +22 -5
- pycontrails/utils/types.py +1 -1
- {pycontrails-0.47.3.dist-info → pycontrails-0.48.1.dist-info}/METADATA +2 -2
- {pycontrails-0.47.3.dist-info → pycontrails-0.48.1.dist-info}/RECORD +31 -31
- {pycontrails-0.47.3.dist-info → pycontrails-0.48.1.dist-info}/WHEEL +1 -1
- {pycontrails-0.47.3.dist-info → pycontrails-0.48.1.dist-info}/LICENSE +0 -0
- {pycontrails-0.47.3.dist-info → pycontrails-0.48.1.dist-info}/NOTICE +0 -0
- {pycontrails-0.47.3.dist-info → pycontrails-0.48.1.dist-info}/top_level.txt +0 -0
pycontrails/core/vector.py
CHANGED
@@ -18,7 +18,7 @@ from pycontrails.core import coordinates, interpolation
 from pycontrails.core import met as met_module
 from pycontrails.physics import units
 from pycontrails.utils import dependencies
-from pycontrails.utils import json as
+from pycontrails.utils import json as json_utils

 logger = logging.getLogger(__name__)
@@ -492,7 +492,7 @@ class VectorDataset:
         return self.size

     def _display_attrs(self) -> dict[str, str]:
-        """Return properties used in `repr` constructions
+        """Return properties used in `repr` constructions.

         Returns
         -------
@@ -515,7 +515,7 @@ class VectorDataset:
         n_keys = len(self.data)
         _repr = f"{class_name} [{n_keys} keys x {self.size} length, {n_attrs} attributes]"

-        keys = list(self
+        keys = list(self)
         keys = keys[0:5] + ["..."] + keys[-1:] if len(keys) > 5 else keys
         _repr += f"\n\tKeys: {', '.join(keys)}"
@@ -633,6 +633,8 @@ class VectorDataset:
         8 15 18

         """
+        vectors = [v for v in vectors if v]  # remove empty vectors
+
         if not vectors:
             return cls()
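The new guard in this hunk is a plain truthiness filter: empty `VectorDataset` instances are falsy, so they drop out before the concatenation logic runs. A minimal standalone sketch of the pattern (a toy class, not the pycontrails API):

    import numpy as np

    class Vec:
        """Toy stand-in for VectorDataset: falsy when it holds no data."""

        def __init__(self, data=None):
            self.data = data or {}

        def __bool__(self):
            return bool(self.data)

    vectors = [Vec({"a": np.array([1.0])}), Vec(), Vec({"a": np.array([2.0])})]
    vectors = [v for v in vectors if v]  # empty instances are falsy and drop out
    assert len(vectors) == 2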
@@ -753,7 +755,7 @@ class VectorDataset:
         str
             Unique hash for flight instance (sha1)
         """
-        _hash = json.dumps(self.data, cls=
+        _hash = json.dumps(self.data, cls=json_utils.NumpyEncoder)
         return hashlib.sha1(bytes(_hash, "utf-8")).hexdigest()

     # ------------
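The hashing recipe here is: JSON-serialize the data (with an encoder that can handle numpy arrays), then sha1 the UTF-8 bytes. A sketch with the standard library only (the real code delegates array handling to `json_utils.NumpyEncoder`):

    import hashlib
    import json

    data = {"longitude": [-100.0, -110.0], "latitude": [40.0, 50.0]}
    payload = json.dumps(data)  # NumpyEncoder handles np.ndarray in the real code
    digest = hashlib.sha1(bytes(payload, "utf-8")).hexdigest()
    print(digest)  # stable for identical data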
@@ -951,11 +953,11 @@ class VectorDataset:
             ignore_keys = (ignore_keys,)

         # Somewhat brittle: Only checking for int or float type
-        numeric_attrs =
+        numeric_attrs = (
             attr
             for attr, val in self.attrs.items()
             if (isinstance(val, (int, float)) and attr not in ignore_keys)
-
+        )
         self.broadcast_attrs(numeric_attrs, overwrite)

     # ------------
@@ -982,6 +984,104 @@ class VectorDataset:
         df.attrs = self.attrs
         return df

+    def to_dict(self) -> dict[str, Any]:
+        """Create dictionary with :attr:`data` and :attr:`attrs`.
+
+        If geo-spatial coordinates (e.g. `"latitude"`, `"longitude"`, `"altitude"`)
+        are present, round to a reasonable precision. If a `"time"` variable is present,
+        round to unix seconds. When the instance is a :class:`GeoVectorDataset`,
+        disregard any `"altitude"` or `"level"` coordinate and only include
+        `"altitude_ft"` in the output.
+
+        Returns
+        -------
+        dict[str, Any]
+            Dictionary with :attr:`data` and :attr:`attrs`.
+
+        See Also
+        --------
+        :meth:`from_dict`
+
+        Examples
+        --------
+        >>> import pprint
+        >>> from pycontrails import Flight
+        >>> fl = Flight(
+        ...     longitude=[-100, -110],
+        ...     latitude=[40, 50],
+        ...     level=[200, 200],
+        ...     time=[np.datetime64("2020-01-01T09"), np.datetime64("2020-01-01T09:30")],
+        ...     aircraft_type="B737",
+        ... )
+        >>> fl = fl.resample_and_fill("5T")
+        >>> pprint.pprint(fl.to_dict())
+        {'aircraft_type': 'B737',
+         'altitude_ft': [38661.0, 38661.0, 38661.0, 38661.0, 38661.0, 38661.0, 38661.0],
+         'crs': 'EPSG:4326',
+         'latitude': [40.0, 41.724, 43.428, 45.111, 46.769, 48.399, 50.0],
+         'longitude': [-100.0,
+                       -101.441,
+                       -102.959,
+                       -104.563,
+                       -106.267,
+                       -108.076,
+                       -110.0],
+         'time': [1577869200,
+                  1577869500,
+                  1577869800,
+                  1577870100,
+                  1577870400,
+                  1577870700,
+                  1577871000]}
+        """
+        np_encoder = json_utils.NumpyEncoder()
+
+        # round latitude, longitude, and altitude
+        precision = {"longitude": 3, "latitude": 3, "altitude_ft": 0}
+
+        def encode(key: str, obj: Any) -> Any:
+            # Try to handle some pandas objects
+            if hasattr(obj, "to_numpy"):
+                obj = obj.to_numpy()
+
+            # Convert numpy objects to python objects
+            if isinstance(obj, (np.ndarray, np.generic)):
+
+                # round time to unix seconds
+                if key == "time":
+                    return np_encoder.default(obj.astype("datetime64[s]").astype(int))
+
+                # round specific keys in precision
+                try:
+                    d = precision[key]
+                except KeyError:
+                    return np_encoder.default(obj)
+
+                return np_encoder.default(obj.astype(float).round(d))
+
+            # Pass through everything else
+            return obj
+
+        data = {k: encode(k, v) for k, v in self.data.items()}
+        attrs = {k: encode(k, v) for k, v in self.attrs.items()}
+
+        # Only include one of the vertical coordinate keys
+        if isinstance(self, GeoVectorDataset):
+            data.pop("altitude", None)
+            data.pop("level", None)
+            if "altitude_ft" not in data:
+                data["altitude_ft"] = self.altitude_ft.round(precision["altitude_ft"]).tolist()
+
+        # Issue warning if any keys are duplicated
+        common_keys = data.keys() & attrs.keys()
+        if common_keys:
+            warnings.warn(
+                f"Found duplicate keys in data and attrs: {common_keys}. "
+                "Data keys will overwrite attrs keys in returned dictionary."
+            )
+
+        return {**attrs, **data}
+
     @classmethod
     def create_empty(
         cls: Type[VectorDatasetType],
@@ -1010,6 +1110,42 @@ class VectorDataset:
         """
         return cls(data=_empty_vector_dict(keys or set()), attrs=attrs, copy=False, **attrs_kwargs)

+    @classmethod
+    def from_dict(
+        cls: Type[VectorDatasetType], obj: dict[str, Any], copy: bool = True, **obj_kwargs: Any
+    ) -> VectorDatasetType:
+        """Create instance from dict representation containing data and attrs.
+
+        Parameters
+        ----------
+        obj : dict[str, Any]
+            Dict representation of VectorDataset (e.g. :meth:`to_dict`)
+        copy : bool, optional
+            Passed to VectorDataset constructor.
+            Defaults to True.
+        **obj_kwargs : Any
+            Additional properties passed as keyword arguments.
+
+        Returns
+        -------
+        VectorDatasetType
+            VectorDataset instance.
+
+        See Also
+        --------
+        :meth:`to_dict`
+        """
+        data = {}
+        attrs = {}
+
+        for k, v in {**obj, **obj_kwargs}.items():
+            if isinstance(v, (list, np.ndarray)):
+                data[k] = v
+            else:
+                attrs[k] = v
+
+        return cls(data=data, attrs=attrs, copy=copy)
+
     def generate_splits(
         self: VectorDatasetType, n_splits: int, copy: bool = True
     ) -> Generator[VectorDatasetType, None, None]:
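Together with `to_dict` above, this gives a simple round trip: list-like values are routed to :attr:`data` and everything else to :attr:`attrs`. A hedged usage sketch (the key routing follows the code in this hunk; exact constructor coercion may differ):

    import numpy as np
    from pycontrails import VectorDataset

    v = VectorDataset({"a": np.array([1.0, 2.0, 3.0])}, attrs={"source": "demo"})
    d = v.to_dict()                 # {'source': 'demo', 'a': [1.0, 2.0, 3.0]}
    w = VectorDataset.from_dict(d)  # lists -> data, scalars -> attrs
    assert w["a"].tolist() == [1.0, 2.0, 3.0]
    assert w.attrs["source"] == "demo"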
@@ -1182,7 +1318,7 @@ class GeoVectorDataset(VectorDataset):
         if not np.issubdtype(time.dtype, np.datetime64):
             warnings.warn("Time data is not np.datetime64. Attempting to coerce.")
             try:
-                pd_time = pd.
+                pd_time = _handle_time_column(pd.Series(self["time"]))
             except ValueError as e:
                 raise ValueError("Could not coerce time data to datetime64.") from e
             np_time = pd_time.to_numpy(dtype="datetime64[ns]")
@@ -1214,12 +1350,17 @@ class GeoVectorDataset(VectorDataset):
     @overrides
     def _display_attrs(self) -> dict[str, str]:
         try:
-            time0
-
-
-
+            time0 = pd.Timestamp(np.nanmin(self["time"]))
+            time1 = pd.Timestamp(np.nanmax(self["time"]))
+            lon0 = round(np.nanmin(self["longitude"]), 3)
+            lon1 = round(np.nanmax(self["longitude"]), 3)
+            lat0 = round(np.nanmin(self["latitude"]), 3)
+            lat1 = round(np.nanmax(self["latitude"]), 3)
+            alt0 = round(np.nanmin(self.altitude), 1)
+            alt1 = round(np.nanmax(self.altitude), 1)
+
             attrs = {
-                "time": f"[{
+                "time": f"[{time0}, {time1}]",
                 "longitude": f"[{lon0}, {lon1}]",
                 "latitude": f"[{lat0}, {lat1}]",
                 "altitude": f"[{alt0}, {alt1}]",
@@ -1785,7 +1926,7 @@ class GeoVectorDataset(VectorDataset):
         dict[str, Any]
             Python representation of GeoJSON FeatureCollection
         """
-        return
+        return json_utils.dataframe_to_geojson_points(self.dataframe)

     def to_pseudo_mercator(self: GeoVectorDatasetType, copy: bool = True) -> GeoVectorDatasetType:
         """Convert data from :attr:`attrs["crs"]` to Pseudo Mercator (EPSG:3857).
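`to_geojson_points` now delegates to `json_utils.dataframe_to_geojson_points`. A generic sketch of building a GeoJSON FeatureCollection of points from a dataframe, approximating what such a helper does (the real helper's property handling may differ):

    import pandas as pd

    df = pd.DataFrame({"longitude": [-100.0], "latitude": [40.0], "altitude": [11000.0]})
    features = [
        {
            "type": "Feature",
            "geometry": {
                "type": "Point",
                "coordinates": [row.longitude, row.latitude, row.altitude],
            },
            "properties": {},
        }
        for row in df.itertuples()
    ]
    geojson = {"type": "FeatureCollection", "features": features}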
@@ -1908,29 +2049,112 @@ def vector_to_lon_lat_grid(


 def _handle_time_column(time: pd.Series) -> pd.Series:
+    """Ensure that pd.Series has compatible Timestamps.
+
+    Parameters
+    ----------
+    time : pd.Series
+        Pandas dataframe column labeled "time".
+
+    Returns
+    -------
+    pd.Series
+        Parsed pandas time series.
+
+    Raises
+    ------
+    ValueError
+        When time series can't be parsed, or is not timezone naive.
+    """
     if not hasattr(time, "dt"):
-
-        # If it fails (for example, a unix integer time), we raise an error
-        # and let the user figure it out.
-        try:
-            return pd.to_datetime(time)
-        except ValueError as exc:
-            raise ValueError(
-                "The 'time' field must hold datetime-like values. "
-                'Try data["time"] = pd.to_datetime(data["time"], unit=...) '
-                "with the appropriate unit."
-            ) from exc
+        time = _parse_pandas_time(time)

+    # Translate all times to UTC and then remove timezone.
     # If the time column contains a timezone, the call to `to_numpy`
-    # will convert it to an array of object.
-    #
-    # and so it is better for the user to handle them rather than try
-    # to address them here.
+    # will convert it to an array of object.
+    # Note `.tz_convert(None)` automatically converts to UTC first.
     if time.dt.tz is not None:
-
-            "The 'time' field must be timezone naive. "
-            "This can be achieved with: "
-            'data["time"] = data["time"].dt.tz_localize(None)'
-        )
+        time = time.dt.tz_convert(None)

     return time
+
+
+def _parse_pandas_time(time: pd.Series) -> pd.Series:
+    """Parse pandas dataframe column labelled "time".
+
+    Parameters
+    ----------
+    time : pd.Series
+        Time series
+
+    Returns
+    -------
+    pd.Series
+        Parsed time series
+
+    Raises
+    ------
+    ValueError
+        When series values can't be inferred.
+    """
+    try:
+        # If the time series is a string, try to convert it to a datetime
+        if time.dtype == "O":
+            return pd.to_datetime(time)
+
+        # If the time is an int, try to parse it as unix time
+        if np.issubdtype(time.dtype, np.integer):
+            return _parse_unix_time(time)
+
+        raise ValueError("Unsupported time format")
+
+    except ValueError as exc:
+        raise ValueError(
+            "The 'time' field must hold datetime-like values. "
+            'Try data["time"] = pd.to_datetime(data["time"], unit=...) '
+            "with the appropriate unit."
+        ) from exc
+
+
+def _parse_unix_time(time: list[int] | npt.NDArray[np.int_] | pd.Series) -> pd.Series:
+    """Parse array of int times as unix epoch timestamps.
+
+    Attempts to parse the time in "s", "ms", "us", "ns".
+
+    Parameters
+    ----------
+    time : list[int] | npt.NDArray[np.int_] | pd.Series
+        Sequence of unix timestamps
+
+    Returns
+    -------
+    pd.Series
+        Series of timezone naive pandas Timestamps
+
+    Raises
+    ------
+    ValueError
+        When unable to parse time as unix epoch timestamp
+    """
+    units = "s", "ms", "us", "ns"
+    for unit in units:
+        try:
+            out = pd.to_datetime(time, unit=unit, utc=True)
+        except ValueError:
+            continue
+
+        # make timezone naive
+        out = out.dt.tz_convert(None)
+
+        # make sure time is reasonable
+        if (pd.Timestamp("1980-01-01") <= out).all() and (out <= pd.Timestamp("2030-01-01")).all():
+            return out
+
+    raise ValueError(
+        f"Unable to parse time parameter '{time}' as unix epoch timestamp between "
+        "1980-01-01 and 2030-01-01"
+    )
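The unit-inference loop in `_parse_unix_time` works because only one epoch unit places a given integer inside the 1980-2030 sanity window; out-of-range candidates either raise or fail the bounds check. A standalone sketch of the idea:

    import pandas as pd

    def parse_unix(values):
        """Try each epoch unit; accept the first that lands in a sane window."""
        lo, hi = pd.Timestamp("1980-01-01"), pd.Timestamp("2030-01-01")
        for unit in ("s", "ms", "us", "ns"):
            try:
                out = pd.to_datetime(pd.Series(values), unit=unit, utc=True)
            except ValueError:  # includes OutOfBoundsDatetime
                continue
            out = out.dt.tz_convert(None)  # make timezone naive
            if ((lo <= out) & (out <= hi)).all():
                return out
        raise ValueError("could not parse as unix epoch in any unit")

    print(parse_unix([1577869200]).iloc[0])     # 2020-01-01 09:00:00, parsed as seconds
    print(parse_unix([1577869200000]).iloc[0])  # same instant, parsed as milliseconds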
pycontrails/datalib/ecmwf/common.py
CHANGED

@@ -2,48 +2,20 @@

 from __future__ import annotations

-import datetime
 import logging
-
+import os
 from typing import Any

 LOG = logging.getLogger(__name__)

 import numpy as np
+import pandas as pd
 import xarray as xr
 from overrides import overrides

 from pycontrails.core import datalib, met
-
-
-
-def rad_accumulated_to_average(mds: met.MetDataset, key: str, dt_accumulation: int) -> None:
-    """Convert accumulated radiation value to instantaneous average.
-
-    Parameters
-    ----------
-    mds : MetDataset
-        MetDataset containing the accumulated value at ``key``
-    key : str
-        Data variable key
-    dt_accumulation : int
-        Accumulation time in seconds, [:math:`s`]
-    """
-    if key in mds.data and not mds.data[key].attrs.get("_pycontrails_modified", False):
-        if not np.all(np.diff(mds.data["time"]) == np.timedelta64(dt_accumulation, "s")):
-            raise ValueError(
-                f"Dataset expected to have time interval of {dt_accumulation} seconds when"
-                " converting accumulated parameters"
-            )
-
-        mds.data[key] = mds.data[key] / dt_accumulation
-        mds.data[key].attrs["units"] = "W m**-2"
-        mds.data[key].attrs[
-            "_pycontrails_modified"
-        ] = "Accumulated value converted to average instantaneous value"
-
-
-# TODO: Remove this in favor of functional implementation
+
+
 class ECMWFAPI(datalib.MetDataSource):
     """Abstract class for all ECMWF data accessed remotely through CDS / MARS."""
@@ -58,7 +30,6 @@ class ECMWFAPI(datalib.MetDataSource):
         """
         return [v.ecmwf_id for v in self.variables if v.ecmwf_id is not None]

-    # TODO: this could be functional, but there many properties utilized
     def _process_dataset(self, ds: xr.Dataset, **kwargs: Any) -> met.MetDataset:
         """Process the :class:`xr.Dataset` opened from cache or local files.
@@ -88,8 +59,8 @@ class ECMWFAPI(datalib.MetDataSource):
             ds = ds.sel(time=self.timesteps)
         except KeyError:
             # this snippet shows the missing times for convenience
-            np_timesteps =
-            missing_times = sorted(
+            np_timesteps = {np.datetime64(t, "ns") for t in self.timesteps}
+            missing_times = sorted(np_timesteps.difference(ds["time"].values))
             raise KeyError(
                 f"Input dataset is missing time coordinates {[str(t) for t in missing_times]}"
             )
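The rewritten error path computes the missing timesteps with a set difference against the dataset's time coordinate. A standalone sketch of the same computation:

    import numpy as np

    timesteps = [np.datetime64("2022-03-01T00"), np.datetime64("2022-03-01T01")]
    available = np.array(["2022-03-01T00"], dtype="datetime64[ns]")

    np_timesteps = {np.datetime64(t, "ns") for t in timesteps}
    missing_times = sorted(np_timesteps.difference(available))
    print(missing_times)  # missing hour: 2022-03-01T01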
@@ -111,22 +82,6 @@ class ECMWFAPI(datalib.MetDataSource):
         # harmonize variable names
         ds = met.standardize_variables(ds, self.variables)

-        # modify values
-
-        # rescale relative humidity from % -> dimensionless if its in dataset
-        if "relative_humidity" in ds and not ds["relative_humidity"].attrs.get(
-            "_pycontrails_modified", False
-        ):
-            ds["relative_humidity"] = ds["relative_humidity"] / 100
-            ds["relative_humidity"].attrs["long_name"] = "Relative humidity"
-            ds["relative_humidity"].attrs["standard_name"] = "relative_humidity"
-            ds["relative_humidity"].attrs["units"] = "[0 - 1]"
-            ds["relative_humidity"].attrs[
-                "_pycontrails_modified"
-            ] = "Relative humidity rescaled to [0 - 1] instead of %"
-
-        ds.attrs["met_source"] = type(self).__name__
-
         kwargs.setdefault("cachestore", self.cachestore)
         return met.MetDataset(ds, **kwargs)
@@ -136,18 +91,12 @@ class ECMWFAPI(datalib.MetDataSource):
             LOG.debug("Cache is turned off, skipping")
             return

-
-
-
-
-
-
+        for t, ds_t in dataset.groupby("time", squeeze=False):
+            cache_path = self.create_cachepath(pd.Timestamp(t).to_pydatetime())
+            if os.path.exists(cache_path):
+                LOG.debug(f"Overwriting existing cache file {cache_path}")
+                # This may raise a PermissionError if the file is already open
+                # If this is the case, the user should explicitly close the file and try again
+                os.remove(cache_path)

-
-            cache_path = [
-                self.create_cachepath(
-                    datetime.datetime.fromtimestamp(tg.tolist() / 1e9, datetime.timezone.utc)
-                )
-                for tg in time_group
-            ]
-            self.cachestore.put_multiple(xarray_temp_filenames, cache_path)
+            ds_t.to_netcdf(cache_path)
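Caching is now one netCDF file per timestep: the dataset is grouped on its time coordinate and each single-time slice is written to its own cache path. A minimal sketch of the pattern with plain xarray (the cache-path naming here is hypothetical; pycontrails derives it from `create_cachepath`):

    import os

    import numpy as np
    import pandas as pd
    import xarray as xr

    ds = xr.Dataset(
        {"t": (("time", "x"), np.zeros((2, 3)))},
        coords={"time": pd.date_range("2022-03-01", periods=2, freq="1h"), "x": [0, 1, 2]},
    )

    for t, ds_t in ds.groupby("time", squeeze=False):
        cache_path = f"cache-{pd.Timestamp(t):%Y%m%d%H}.nc"  # hypothetical naming
        if os.path.exists(cache_path):
            os.remove(cache_path)  # may raise PermissionError if the file is open elsewhere
        ds_t.to_netcdf(cache_path)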
pycontrails/datalib/ecmwf/era5.py
CHANGED

@@ -21,16 +21,9 @@ from overrides import overrides
 import pycontrails
 from pycontrails.core import cache, datalib
 from pycontrails.core.met import MetDataset, MetVariable
-from pycontrails.datalib.ecmwf.common import ECMWFAPI
-from pycontrails.datalib.ecmwf.variables import
-
-    SURFACE_VARIABLES,
-    TOAIncidentSolarRadiation,
-    TopNetSolarRadiation,
-    TopNetThermalRadiation,
-)
-from pycontrails.utils import dependencies
-from pycontrails.utils.temp import temp_file
+from pycontrails.datalib.ecmwf.common import ECMWFAPI
+from pycontrails.datalib.ecmwf.variables import PRESSURE_LEVEL_VARIABLES, SURFACE_VARIABLES
+from pycontrails.utils import dependencies, temp

 if TYPE_CHECKING:
     import cdsapi
@@ -96,7 +89,7 @@ class ERA5(ECMWFAPI):
     ERA5 parameter list:
     https://confluence.ecmwf.int/pages/viewpage.action?pageId=82870405#ERA5:datadocumentation-Parameterlistings

-    All
+    All radiative quantities are accumulated.
     See https://www.ecmwf.int/sites/default/files/elibrary/2015/18490-radiation-quantities-ecmwf-model-and-mars.pdf
     for more information.
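Accumulated ERA5 radiation fields are reported in J m**-2 over the accumulation window, so dividing by the window length in seconds recovers an average flux in W m**-2; this is what the removed `rad_accumulated_to_average` helper in common.py did. A worked sketch (the accumulated value is hypothetical):

    # Accumulated value over one hour -> average instantaneous flux.
    dt_accumulation = 60 * 60   # 1 h for reanalysis, 3 h for ensemble products
    accumulated = 1.08e6        # J m**-2, hypothetical accumulated radiation
    average = accumulated / dt_accumulation
    assert average == 300.0     # W m**-2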
@@ -176,7 +169,7 @@ class ERA5(ECMWFAPI):
         if time is None and paths is None:
             raise ValueError("The parameter 'time' must be defined if 'paths' is None")

-        supported =
+        supported = ("reanalysis", "ensemble_mean", "ensemble_members", "ensemble_spread")
         if product_type not in supported:
             raise ValueError(
                 f"Unknown product_type {product_type}. "
@@ -388,28 +381,30 @@ class ERA5(ECMWFAPI):
         # run MetDataset constructor
         ds = self.open_dataset(disk_cachepaths, **xr_kwargs)

-        # TODO: corner case
         # If any files are already cached, they will not have the version attached
         ds.attrs.setdefault("pycontrails_version", pycontrails.__version__)

         # run the same ECMWF-specific processing on the dataset
         mds = self._process_dataset(ds, **kwargs)

-
-
-
-        # accumulations are 3 hours for ensembles, 1 hour for reanalysis
-        dt_accumulation = 60 * 60 if self.product_type == "reanalysis" else 3 * 60 * 60
+        self.set_metadata(mds)
+        return mds

-
-
-
-
-        ):
-
-
+    @overrides
+    def set_metadata(self, ds: xr.Dataset | MetDataset) -> None:
+        if self.product_type == "reanalysis":
+            product = "reanalysis"
+        elif self.product_type.startswith("ensemble"):
+            product = "ensemble"
+        else:
+            msg = f"Unknown product type {self.product_type}"
+            raise ValueError(msg)

-
+        ds.attrs.update(
+            provider="ECMWF",
+            dataset="ERA5",
+            product=product,
+        )

     def _open_and_cache(self, xr_kwargs: dict[str, Any]) -> xr.Dataset:
         """Open and cache :class:`xr.Dataset` from :attr:`self.paths`.
@@ -478,7 +473,7 @@ class ERA5(ECMWFAPI):
         # Open ExitStack to control temp_file context manager
         with ExitStack() as stack:
             # hold downloaded file in named temp file
-            cds_temp_filename = stack.enter_context(temp_file())
+            cds_temp_filename = stack.enter_context(temp.temp_file())
             LOG.debug(f"Performing CDS request: {request} to dataset {self.dataset}")
             if not hasattr(self, "cds"):
                 self._set_cds()