PyPI - mlda - Versions diffs - 2024.11.22__tar.gz - Mend

mlda 2024.11.22__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

mlda-2024.11.22/LICENSE +28 -0
mlda-2024.11.22/PKG-INFO +25 -0
mlda-2024.11.22/README.md +4 -0
mlda-2024.11.22/mlda/__init__.py +11 -0
mlda-2024.11.22/mlda/da.py +333 -0
mlda-2024.11.22/mlda/obs.py +80 -0
mlda-2024.11.22/mlda/prior.py +329 -0
mlda-2024.11.22/mlda/psm.py +147 -0
mlda-2024.11.22/mlda/utils.py +103 -0
mlda-2024.11.22/mlda.egg-info/PKG-INFO +25 -0
mlda-2024.11.22/mlda.egg-info/SOURCES.txt +15 -0
mlda-2024.11.22/mlda.egg-info/dependency_links.txt +1 -0
mlda-2024.11.22/mlda.egg-info/not-zip-safe +1 -0
mlda-2024.11.22/mlda.egg-info/requires.txt +7 -0
mlda-2024.11.22/mlda.egg-info/top_level.txt +1 -0
mlda-2024.11.22/setup.cfg +4 -0
mlda-2024.11.22/setup.py +33 -0

mlda-2024.11.22/LICENSE ADDED Viewed

@@ -0,0 +1,28 @@
+BSD 3-Clause License
+Copyright (c) 2024, Feng Zhu
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

mlda-2024.11.22/PKG-INFO ADDED Viewed

@@ -0,0 +1,25 @@
+Metadata-Version: 2.1
+Name: mlda
+Version: 2024.11.22
+Summary: mlda: A Python package for Machine Learning-based Data Assimilation
+Home-page: https://github.com/fzhu2e/mlda
+Author: Feng Zhu, Weimin Si
+Author-email: fengzhu@ucar.edu, weimin_si@brown.edu
+License: BSD-3
+Keywords: Machine Learning,Data Assimilation
+Classifier: Natural Language :: English
+Classifier: Programming Language :: Python :: 3.12
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: netCDF4
+Requires-Dist: xarray
+Requires-Dist: dask
+Requires-Dist: nc-time-axis
+Requires-Dist: colorama
+Requires-Dist: tqdm
+Requires-Dist: x4c-exp
+# mlda: A Python package for Machine Learning-based Data Assimilation
+`mlda` is a Python package for Machine Learning-based Data Assimilation (DA).
+It aims to provide a universal framework and the corresponding utilities for conducting reproducible data assimilation experiments using novel machine learning-based DA methods.

mlda-2024.11.22/README.md ADDED Viewed

@@ -0,0 +1,4 @@
+# mlda: A Python package for Machine Learning-based Data Assimilation
+`mlda` is a Python package for Machine Learning-based Data Assimilation (DA).
+It aims to provide a universal framework and the corresponding utilities for conducting reproducible data assimilation experiments using novel machine learning-based DA methods.

mlda-2024.11.22/mlda/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+# get the version
+from importlib.metadata import version
+__version__ = version('cpda')
+import warnings
+warnings.filterwarnings("ignore", category=UserWarning)
+from . import utils, cesm
+from .prior import Prior, PriorMember
+from .obs import Obs
+from .da import Solver

mlda-2024.11.22/mlda/da.py ADDED Viewed

@@ -0,0 +1,333 @@
+import numpy as np
+import xarray as xr
+from scipy.linalg import cholesky, sqrtm
+from . import utils
+def gaspari_cohn(dist, loc_radius):
+    '''
+    Vectorized Gaspari-Cohn localization function.
+    Args:
+        dist (ndarray): Distance(s) between model state and observation.
+        loc_radius (float): Localization radius (distance beyond which covariance is set to zero).
+    Reference:
+        Gaspari, G., Cohn, S.E., 1999. Construction of correlation functions in two and three dimensions.
+        Quarterly Journal of the Royal Meteorological Society 125, 723-757. https://doi.org/10.1002/qj.49712555417
+    '''
+    # Normalize the distances
+    r = np.abs(dist) / loc_radius
+    # Initialize the result array with zeros
+    f = np.zeros_like(r)
+    # Eq. (4.10) in Gaaspari & Coh (1999)
+    mask1 = r <= 1
+    f[mask1] = -r[mask1]**5 / 4 + r[mask1]**4 / 2 + 5/8 * r[mask1]**3 - 5/3 * r[mask1]**2 + 1
+    mask2 = (r > 1) & (r <= 2)
+    f[mask2] = r[mask2]**5 / 12 - r[mask2]**4 / 2 + 5/8 * r[mask2]**3  + 5/3 * r[mask2]**2 - 5 * r[mask2] + 4 - 2/3 / r[mask2]
+    f[f<0] = 0 # force f >= 0
+    return f
+def gaspari_cohn_dash(dist, loc_radius, scale=0.5):
+    """
+    Implements a Gaspari-Cohn 5th order polynomial localization function.
+    Parameters:
+        dist (ndarray): An array of distances.
+        loc_radius (float): The cutoff radius, beyond which weights are zero.
+        scale (float or str, optional): The length scale for the polynomial.
+            Must be on the interval 0 < scale <= 0.5, or 'optimal' to use the optimal
+            length scale as described by Lorenc (2003). Default is 0.5.
+    Returns:
+        weights (ndarray): Covariance localization weights with the same shape as distances.
+    """
+    # Set the scale if 'optimal' is specified
+    if isinstance(scale, str) and scale == 'optimal':
+        scale = np.sqrt(10 / 3)
+    # Define length scale and localization radius
+    c = scale * loc_radius
+    # Preallocate weights array with ones
+    weights = np.ones_like(dist)
+    # Calculate mask arrays for the different distance ranges
+    outside_radius = dist > loc_radius
+    inside_scale = dist <= c
+    in_between = ~inside_scale & ~outside_radius
+    # Apply Gaspari-Cohn polynomial
+    X = dist / c
+    weights[outside_radius] = 0
+    weights[in_between] = X[in_between]**5 / 12 - 0.5 * X[in_between]**4 + 0.625 * X[in_between]**3 + (5 / 3) * X[in_between]**2 - 5 * X[in_between] + 4 - 2 / (3 * X[in_between])
+    weights[inside_scale] = -0.25 * X[inside_scale]**5 + 0.5 * X[inside_scale]**4 + 0.625 * X[inside_scale]**3 - (5 / 3) * X[inside_scale]**2 + 1
+    # Ensure weights are non-negative due to rounding errors
+    weights[weights < 0] = 0
+    return weights
+class EnSRF:
+    def __init__(self, X=None, Y=None, y=None, R=None, L=None, Lobs=None):
+        self.X = X            # ensemble of the prior state vectors (n x N)
+        self.Y = Y            # ensemble of the forward estimates (m x N); Y=H(X)
+        self.y = y            # observations (m x 1)
+        self.R = R            # obs err matrix (m x m)
+        self.L = L            # localization matrix (n x m)
+        self.Lobs = Lobs      # localization matrix (m x m)
+    def update(self, debug=False):
+        ''' Perform an EnSRF update with localization. '''
+        N = self.X.shape[1]  # Ensemble size
+        # Compute the ensemble mean
+        Xm = np.mean(self.X, axis=1, keepdims=True)
+        Xp = self.X - Xm
+        Ym = np.mean(self.Y, axis=1, keepdims=True)
+        Yp = self.Y - Ym
+        # Observation error covariance matrix
+        Ycov = (Yp @ Yp.T) / (N - 1)
+        # Localize the obs err covariance matrix
+        if self.Lobs is not None:
+            Ycov_loc = Ycov * self.Lobs
+        else:
+            Ycov_loc = Ycov
+        C =  Ycov_loc + self.R
+        # Kalman gain matrix
+        XYcov = (Xp @ Yp.T) / (N - 1)
+        # Localize the Kalman gain
+        if self.L is not None:
+            XYcov_loc = XYcov * self.L
+        else:
+            XYcov_loc = XYcov
+        K = XYcov_loc @ np.linalg.inv(C)
+        # Observation innovation
+        d = self.y - Ym
+        # Update the ensemble mean
+        Xm_updated = Xm + K @ d
+        # Update the ensemble perturbations
+        T = np.eye(N) - (Yp.T @ np.linalg.inv(C)) @ Yp / (N - 1)
+        Xp_updated = Xp @ T
+        # Combine updated mean and perturbations
+        self.X_updated = Xm_updated + Xp_updated
+        if debug:
+            self.Xm = Xm
+            self.Xp = Xp
+            self.Ym = Ym
+            self.Yp = Yp
+            self.C = C
+            self.K = K
+            self.d = d
+            self.T = T
+class EnSRF_DASH:
+    def __init__(self, X=None, Y=None, y=None, R=None, L=None, Lobs=None):
+        self.X = X            # ensemble of the prior state vectors (n x N)
+        self.Y = Y            # ensemble of the forward estimates (m x N); Y=H(X)
+        self.y = y            # observations (m x 1)
+        self.R = R            # obs err matrix (m x m)
+        self.L = L            # localization matrix (n x m)
+        self.Lobs = Lobs      # localization matrix (m x m)
+    def update(self, debug=False):
+        ''' Perform an EnSRF update with localization. '''
+        N = self.X.shape[1]  # Ensemble size
+        # Compute the ensemble mean
+        Xm = np.mean(self.X, axis=1, keepdims=True)
+        Xp = self.X - Xm
+        Ym = np.mean(self.Y, axis=1, keepdims=True)
+        Yp = self.Y - Ym
+        # Observation error covariance matrix
+        Ycov = (Yp @ Yp.T) / (N - 1)
+        # Localize the obs err covariance matrix
+        if self.Lobs is not None:
+            Ycov_loc = self.Lobs * Ycov
+        else:
+            Ycov_loc = Ycov
+        C =  Ycov_loc + self.R
+        # Kalman gain matrix
+        XYcov = (Xp @ Yp.T) / (N - 1)
+        # Localize the Kalman gain
+        if self.L is not None:
+            XYcov_loc = self.L * XYcov
+        else:
+            XYcov_loc = XYcov
+        K = XYcov_loc @ np.linalg.inv(C)
+        # Observation innovation
+        d = self.y - Ym
+        # Update the ensemble mean
+        Xm_updated = Xm + K @ d
+        # Update the ensemble perturbations
+        Ksqrt = sqrtm(C)
+        Ksqrt_inv_transpose = np.linalg.inv(Ksqrt).T
+        Rcov_sqrt = sqrtm(self.R)
+        Ka = K @ Ksqrt_inv_transpose @ np.linalg.inv(Ksqrt + Rcov_sqrt)
+        Xp_updated = Xp - Ka @ Yp
+        # Combine updated mean and perturbations
+        self.X_updated = Xm_updated + Xp_updated
+        if debug:
+            self.Xm = Xm
+            self.Xp = Xp
+            self.Ym = Ym
+            self.Yp = Yp
+            self.C = C
+            self.K = K
+            self.d = d
+class EnOI:
+    def __init__(self, X_target=None, X=None, Y=None, y=None, R=None, L=None):
+        self.X_target = X_target   # the **monthly** prior state vectors (n x 1)
+        self.X = X         # ensemble of the prior state vectors (n x N)
+        self.Y = Y         # ensemble of the forward estimates (m x N); Y=H(X)
+        self.y = y         # observations (m x 1)
+        self.R = R         # obs err matrix (m x m)
+        self.L = L         # localization matrix (n x m)
+    def update(self, debug=False):
+        ''' Perform an EnOI update with localization. '''
+        N = self.X.shape[1]  # Ensemble size
+        # Compute the ensemble mean
+        Xm = np.mean(self.X, axis=1, keepdims=True)
+        Xp = self.X - Xm
+        Ym = np.mean(self.Y, axis=1, keepdims=True)
+        Yp = self.Y - Ym
+        # Observation error covariance matrix
+        C = (Yp @ Yp.T) / (N - 1) + self.R
+        # Kalman gain matrix
+        K = (Xp @ Yp.T) / (N - 1) @ np.linalg.inv(C)
+        # Localize the Kalman gain
+        if self.L is not None:
+            K_loc = K * self.L
+        else:
+            K_loc = K
+        # Observation innovation
+        d = self.y - Ym
+        # the increment
+        inc = K_loc @ d
+        # update
+        self.X_target_updated = self.X_target + inc
+        if debug:
+            self.Xm = Xm
+            self.Xp = Xp
+            self.Ym = Ym
+            self.Yp = Yp
+            self.C = C
+            self.K = K
+            self.K_loc = K_loc
+            self.d = d
+class Solver:
+    def __init__(self, prior=None, obs=None, prior_target=None):
+        self.prior = prior.copy() if prior is not None else None
+        self.obs = obs.copy() if obs is not None else None
+        self.prior_target = prior_target.copy() if prior_target is not None else None
+    def prep(self, localize=True, loc_radius=2500, dist_vsf=1, dlat=1, dlon=1, loc_method='dash',
+             recon_season=list(range(1, 13)), startover=False, nearest_valid_radius=5, **fwd_kws):
+        ''' Prepare Y=H(X) and the localization matrix for DA
+        Args:
+            dist_vsf (float, list of float): the vertical scaling factor of the distance
+        '''
+        if not hasattr(self.prior, 'ds_rgd'):
+            utils.p_header(f'>>> Regridding the prior (dlat={dlat}, dlon={dlon})')
+            self.prior.regrid(dlat=dlat, dlon=dlon)
+        if startover or not hasattr(self.prior, 'Y'):
+            utils.p_header('>>> Proxy System Modeling: Y = H(X)')
+            self.prior.get_Y(self.obs, nearest_valid_radius=nearest_valid_radius, **fwd_kws)
+        if not hasattr(self.prior, 'ds_ann'):
+            utils.p_header(f'>>> Annualizing prior w/ season: {recon_season}')
+            self.prior.annualize(months=recon_season)
+        if localize and not hasattr(self.prior, 'dist'):
+            loc_func = {
+                'cpda': gaspari_cohn,
+                'dash': gaspari_cohn_dash,
+            }
+            utils.p_header('>>> Computing the localization matrix')
+            self.prior.get_dist(self.prior.obs_assim, dist_vsf)
+            self.L = loc_func[loc_method](self.prior.dist, loc_radius)
+            self.obs.get_dist()
+            self.Lobs = loc_func[loc_method](self.obs.dist, loc_radius)
+        else:
+            self.L = None
+            self.Lobs = None
+    def run(self, method='EnSRF', debug=False):
+        algo = {
+            'EnSRF': EnSRF,
+            'EnSRF_DASH': EnSRF_DASH,
+            'EnOI': EnOI,
+        }
+        kws = {}
+        for m in algo.keys():
+            kws[m] = {
+                'X': self.prior.X,
+                'Y': self.prior.Y,
+                'y': self.obs.y,
+                'R': self.obs.R,
+                'L': self.L,
+                'Lobs': self.Lobs,
+            }
+        if self.prior_target is not None:
+            kws['EnOI']['X_target'] = self.prior_target.X
+        self.S = algo[method](**kws[method])
+        utils.p_header('>>> DA update')
+        self.S.update(debug=debug)
+        utils.p_header('>>> Formatting the posterior')
+        if method in ['EnSRF', 'EnSRF_DASH']:
+            self.post = utils.states2ds(self.S.X_updated, self.prior.ds_ann)
+        elif method == 'EnOI':
+            self.post = utils.states2ds(self.S.X_target_updated, self.prior_target.ds_ann)

mlda-2024.11.22/mlda/obs.py ADDED Viewed

@@ -0,0 +1,80 @@
+from copy import deepcopy
+import numpy as np
+import pandas as pd
+import xarray as xr
+from . import utils
+class Obs:
+    def __init__(self, df:pd.DataFrame):
+        self.df = df
+        self.df['lon'] = (df['lon'] + 360) % 360
+        self.nobs = len(df)
+        self.pids = df['pid'].values
+        self.records = {}
+        for pid in self.pids:
+            self.records[pid] = self[pid]
+    @property
+    def y(self):
+        return self.df['value'].values[..., np.newaxis]
+    @property
+    def y_locs(self):
+        return self.df[['lat', 'lon']].values
+    @property
+    def R(self):
+        return np.diag(self.df['R'].values)
+    def copy(self):
+        return deepcopy(self)
+    def __getitem__(self, pid:str):
+        mask = self.df['pid'] == pid
+        row = self.df[mask].iloc[0]
+        rec = ProxyRecord(row)
+        return rec
+    def get_dist(self):
+        lats = self.df['lat'].values
+        lons = self.df['lon'].values
+        lat1, lat2 = np.meshgrid(lats, lats)
+        lon1, lon2 = np.meshgrid(lons, lons)
+        self.dist = utils.gcd(lat1, lon1, lat2, lon2)
+        return self.dist
+class ProxyRecord:
+    def __init__(self, data:pd.Series):
+        self.data = data.copy()
+        if 'time' in data: self.data['time'] = np.array(data['time'])
+        if 'value' in data: self.data['value'] = np.array(data['value'])
+        if 'seasonality' in data:
+            if isinstance(data['seasonality'], str):
+                self.data['seasonality'] = utils.str2list(data['seasonality'])
+            elif isinstance(data['seasonality'], list):
+                self.data['seasonality'] = data['seasonality']
+            else:
+                raise ValueError('Wrong seasonality type; should be a string or a list.')
+    def get_clim(self, clim_ds, vns:list=None, verbose=False):
+        if vns is None:
+            vns = clim_ds.data_vars
+        else:
+            vns = [vn for vn in vns if vn in clim_ds.data_vars]
+        self.clim = xr.Dataset()
+        for vn in vns:
+            self.clim[vn] = clim_ds[vn].x.nearest2d(
+            # filled_da = clim_ds[vn].ffill(dim='lon').bfill(dim='lon').ffill(dim='lat').bfill(dim='lat')
+            # self.clim[vn] = filled_da.sel(
+                lat=self.data.lat,
+                lon=self.data.lon,
+                method='nearest',
+            ).sel(month=self.data.seasonality).mean(dim='month')
+            if verbose: utils.p_success(f'>>> ProxyRecord.clim["{vn}"] created')
+        self.clim.attrs['seasonality'] = self.data.seasonality

mlda-2024.11.22/mlda/prior.py ADDED Viewed

@@ -0,0 +1,329 @@
+import numpy as np
+import xarray as xr
+from tqdm import tqdm
+from scipy.stats import norm
+from tqdm import tqdm
+from copy import deepcopy
+from . import psm
+from . import utils
+class PriorMember:
+    def __init__(self, ds):
+        if isinstance(ds, xr.DataArray): ds = ds.to_dataset()
+        self.ds = ds
+        self.vns = list(ds.data_vars)
+    def gen_samples_Gaussian(self, local_sigma:dict, global_sigma:dict, nens:int=100, seed:int=2333):
+        ''' Generate samples following Gaussian
+        Args:
+            sigma (dict): Dictionary with standard deviation (sigma) for each variable.
+            nens (int): Number of ensemble members to generate.
+            seed (int): Seed for reproducibility.
+        '''
+        rng = np.random.default_rng(seed)
+        self.samples = xr.Dataset()
+        for vn in self.vns:
+            mean = self.ds[vn].values
+            samples_shape = (*mean.shape, nens)
+            global_perturbation = norm.rvs(loc=0, scale=global_sigma[vn], size=nens, random_state=rng)
+            local_perturbation = norm.rvs(loc=0, scale=local_sigma[vn], size=samples_shape, random_state=rng)
+            samples = mean[..., np.newaxis] + global_perturbation + local_perturbation
+            samples_da = xr.DataArray(samples, dims=(*self.ds[vn].dims, 'ens'), coords=self.ds[vn].coords)
+            samples_da.attrs = self.ds[vn].attrs
+            self.samples[vn] = samples_da
+    def gen_samples_bootstrap(self, nens:int=30, clim_yrs:int=50, seed:int=0, dim='time'):
+        ''' Generate samples from the prior pool
+        Args:
+        '''
+        nt = len(self.ds[dim])
+        pool_idx = list(range(nt))
+        sample_list = []
+        if seed is None: seed = 0
+        for i in range(nens):
+            seed += 1
+            rng = np.random.default_rng(seed)
+            sample_idx = rng.choice(pool_idx, size=clim_yrs, replace=False)
+            sample = self.ds.isel({dim: sample_idx}).mean(dim)
+            sample_list.append(sample)
+        samples = xr.concat(sample_list, dim=dim)
+        self.samples = xr.Dataset(samples).rename({dim: 'ens'})
+class Prior:
+    def __init__(self, members, lat_name='TLAT', lon_name='TLONG', depth_name='z_t'):
+        if not isinstance(members, list): members = [members]
+        ds_list = []
+        for m in members:
+            if hasattr(m, 'samples'):
+                ds_list.append(m.samples)
+            else:
+                if 'ens' not in m.ds.dims:
+                    ds_list.append(m.ds.expand_dims({'ens': 1}))
+                else:
+                    ds_list.append(m.ds)
+        self.ds = xr.concat(ds_list, dim='ens').transpose(..., 'ens')
+        self.lat_name = lat_name
+        self.lon_name = lon_name
+        self.depth_name = depth_name
+        if depth_name is not None:
+            self.nz = len(self.ds[depth_name])
+        self.nlat, self.nlon = self.ds[lat_name].shape, self.ds[lon_name].shape
+        self.nens = len(self.ds.ens)
+        self.nvar = len(self.ds.data_vars)
+    def regrid(self, dlat=1, dlon=1, verbose=False):
+        ''' Regrid every data variable of `self.ds` and collect the results in `self.ds_rgd`.
+        Args:
+            dlat, dlon: passed on to the regridding routine
+            verbose (bool): print a message for each variable
+        '''
+        self.ds_rgd = xr.Dataset()
+        for vn in tqdm(self.ds.data_vars, desc=f'Regridding variables to {dlat}x{dlon}'):
+            # NOTE(review): `.x` is an accessor defined outside this file
+            # (presumably provided by the x4c dependency); confirm there.
+            self.ds_rgd[vn] = self.ds.x[vn].x.regrid(dlat=dlat, dlon=dlon)
+            if verbose: utils.p_success(f'>>> Prior.ds_rgd["{vn}"] created')
+    def annualize(self, months=list(range(1, 13))):
+        ''' Store in `self.ds_ann` the mean of `self.ds` over the selected months.
+        Args:
+            months (list): values of the `month` coordinate to average (only read, never modified)
+        '''
+        # Note that the average is taken from `self.ds`, not from the regridded `self.ds_rgd`
+        self.ds_ann = self.ds.sel(month=months).mean('month')
+    def inflate(self, factor=2):
+        self.ds_raw = self.ds.copy()
+        ens_mean = self.ds.mean('ens')
+        ens_pert = self.ds - self.ds.mean('ens')
+        inflated_pert = ens_pert * factor
+        self.ds = ens_mean + inflated_pert
+    def copy(self):
+        ''' Return a deep copy of this object. '''
+        return deepcopy(self)
+    @property
+    def X(self):
+        res = []
+        for vn in self.ds_ann.data_vars:
+            res.append(self.ds_ann[vn].values.reshape(-1, self.nens))
+        res = np.array(res).reshape(-1, self.nens)
+        return res
+    # def get_Y(self, obs, **fwd_kws):
+    #     self.obs_assim = obs.copy()
+    #     lats = obs.df['lat'].values
+    #     lons = obs.df['lon'].values
+    #     pids = obs.df['pid'].values
+    #     depths = obs.df['depth'].values
+    #     if 'clean' in obs.df.columns: cleans = obs.df['clean'].values
+    #     if 'species' in obs.df.columns: specs = obs.df['species'].values
+    #     psms = obs.df['psm'].values
+    #     psm_names = list(set(psms))
+    #     pseudo_obs = np.empty((len(obs.df), self.nens))
+    #     # Loop over PSM types (psm_names)
+    #     for psm_name in psm_names:
+    #         mask = psms == psm_name
+    #         idx = np.where(mask)[0]
+    #         if np.any(mask):
+    #             lat_lon_pairs = xr.Dataset({
+    #                 'lat': (('obs',), lats[mask]),
+    #                 'lon': (('obs',), lons[mask]),
+    #             })
+    #             self.clim_proxy_locs = xr.Dataset()
+    #             for vn in self.ds_rgd.data_vars:
+    #                 filled_da = self.ds_rgd[vn].ffill(dim='lat').bfill(dim='lat').ffill(dim='lon').bfill(dim='lon')
+    #                 self.clim_proxy_locs[vn] = filled_da.sel(
+    #                     lat=lat_lon_pairs['lat'],
+    #                     lon=lat_lon_pairs['lon'],
+    #                     method='nearest',
+    #                 ).transpose(..., 'ens')
+    #             for i in tqdm(range(len(idx)), desc=f'>>> Looping over sites w/ PSM - {psm_name}'):
+    #                 pid = pids[idx[i]]
+    #                 lat = lats[idx[i]]
+    #                 lon = lons[idx[i]]
+    #                 depth = depths[idx[i]]
+    #                 if np.isnan(depth): depth = 0
+    #                 obs_meta = {
+    #                     'pid': pid,
+    #                     'lat': lat,
+    #                     'lon': lon,
+    #                     'depth': depth,
+    #                 }
+    #                 if 'clean' in obs.df.columns:
+    #                     clean = cleans[idx[i]]
+    #                     if np.isnan(clean): clean = 0
+    #                     obs_meta['clean'] = clean
+    #                 if 'species' in obs.df.columns:
+    #                     species = specs[idx[i]]
+    #                     if not isinstance(species, str): species = 'all'
+    #                     obs_meta['species'] = species
+    #                 mdl = psm.__dict__[psm_name](obs_meta, self.clim_proxy_locs.isel({'obs': i}))
+    #                 _fwd_kws = {}
+    #                 _fwd_kws[psm_name] = {}
+    #                 if psm_name in fwd_kws:
+    #                     _fwd_kws[psm_name].update(fwd_kws[psm_name])
+    #                 res = mdl.forward(**_fwd_kws[psm_name])
+    #                 if res is None:
+    #                     utils.p_warning(f'>>> Dropping proxy: {pid}')
+    #                     self.obs_assim.df = obs.df.drop(obs.df[obs.df['pid'] == pid].index)
+    #                     pseudo_obs[idx[i]] = np.nan
+    #                 else:
+    #                     pseudo_obs[idx[i]] = res
+    #     self.obs_assim.nobs = len(self.obs_assim.df)
+    #     pseudo_obs = pseudo_obs[~np.isnan(pseudo_obs).any(axis=1)]
+    #     self.Y = pseudo_obs
+    #     self.obs_assim.df['Ym'] = self.Y.mean(axis=1)
+    # def get_Y(self, obs, **fwd_kws):
+    #     self.obs_assim = obs.copy()
+    #     pseudo_obs = np.empty((len(obs.df), self.nens))
+    #     for i, (pid, rec) in tqdm(enumerate(obs.records.items()), total=obs.nobs, desc='Looping over records'):
+    #         mdl = psm.__dict__[rec.data.psm](rec)
+    #         mdl.record.get_clim(self.ds_rgd, vns=mdl.clim_vns)
+    #         _fwd_kws = {}
+    #         _fwd_kws[rec.data.psm] = {}
+    #         if rec.data.psm in fwd_kws:
+    #             _fwd_kws[rec.data.psm].update(fwd_kws[rec.data.psm])
+    #         res = mdl.forward(**_fwd_kws[rec.data.psm])
+    #         if res is None:
+    #             utils.p_warning(f'>>> Dropping proxy: {pid}')
+    #             self.obs_assim.df = obs.df.drop(obs.df[obs.df['pid'] == pid].index)
+    #             pseudo_obs[i] = np.nan
+    #         else:
+    #             pseudo_obs[i] = res
+    #     self.obs_assim.nobs = len(self.obs_assim.df)
+    #     pseudo_obs = pseudo_obs[~np.isnan(pseudo_obs).any(axis=1)]
+    #     self.Y = pseudo_obs
+    #     self.obs_assim.df['Ym'] = self.Y.mean(axis=1)
+    def get_Y(self, obs, nearest_valid_radius=5, **fwd_kws):
+        self.obs_assim = obs.copy()
+        pseudo_obs = np.empty((len(obs.df), self.nens))
+        psm_names = set(obs.df['psm_name'])
+        clim_vns = list({
+            vn for psm_name in psm_names
+            for vn in psm.__dict__[psm_name]().clim_vns
+            if vn in self.ds_rgd.data_vars
+        })
+        lat_lon_pairs = xr.Dataset({
+            'lat': (('sites',), obs.df['lat'].values),
+            'lon': (('sites',), obs.df['lon'].values),
+        })
+        self.ds_proxy_locs = xr.Dataset()
+        for vn in clim_vns:
+            # filled_da = self.ds_rgd[vn].ffill(dim='lon').bfill(dim='lon').ffill(dim='lat').bfill(dim='lat')
+            # ds_proxy_locs[vn] = filled_da.sel(
+            #     lat=lat_lon_pairs['lat'],
+            #     lon=lat_lon_pairs['lon'],
+            #     method='nearest',
+            # ).transpose(..., 'ens')
+            self.ds_proxy_locs[vn] = self.ds_rgd[vn].x.nearest2d(
+                lat=lat_lon_pairs['lat'],
+                lon=lat_lon_pairs['lon'],
+                r=nearest_valid_radius,
+                extra_dim='ens',
+            ).transpose(..., 'ens')
+        if 'sites' not in self.ds_proxy_locs.dims:
+            self.ds_proxy_locs = self.ds_proxy_locs.expand_dims({'sites': [0]})
+            # if ds_proxy_locs[vn].isnull().any():
+            #     for idx in obs.df.index:
+            #         if ds_proxy_locs[vn].sel(sites=idx).isnull().any():
+            #             utils.p_warning(f"NaN detected for {vn}: {obs.df.iloc[idx][['pid', 'lat', 'lon']].values}")
+            #             print(ds_proxy_locs[vn].sel(sites=idx).dims)
+            #             print(ds_proxy_locs[vn].sel(sites=idx).values)
+            #             utils.p_warning('------------------------------------')
+            #     raise ValueError('Some of the nearest gridcell values are NaN.')
+        nearest_lats, nearest_lons = [], []
+        for i, (pid, rec) in tqdm(enumerate(obs.records.items()), total=obs.nobs, desc='Looping over records'):
+            # nearest_clim = self.ds_proxy_locs.isel({'sites': i}).sel(month=rec.data.seasonality).mean(dim='month')
+            # nearest_lat = nearest_clim.lat.values.mean()
+            # nearest_lon = nearest_clim.lon.values.mean()
+            # nearest_lats.append(nearest_lat)
+            # nearest_lons.append(nearest_lon)
+            # rec.data.lat = nearest_lat
+            # rec.data.lon = nearest_lon
+            mdl = psm.__dict__[rec.data.psm_name](rec)
+            mdl.record.clim = self.ds_proxy_locs.isel({'sites': i}).sel(month=rec.data.seasonality).mean(dim='month')
+            for vn in clim_vns:
+                if mdl.record.clim[vn].isnull().any():
+                    # print(i, ds_proxy_locs[vn].isel({'sites': i}))
+                    # print(ds_proxy_locs[vn].isel({'sites': i, 'ens': 6}))
+                    # print(ds_proxy_locs.sel(month=rec.data.seasonality).mean(dim='month')[vn].values[i])
+                    # print(vn, rec.data.pid, rec.data.lat, rec.data.lon, rec.data.seasonality)
+                    # print(mdl.record.clim[vn].values)
+                    raise ValueError(f'NaN values detected in input climate for forward modeling of: {pid}')
+            obs.records[pid].psm = mdl  # for debugging purposes
+            _fwd_kws = {}
+            _fwd_kws[rec.data.psm_name] = {}
+            if rec.data.psm_name in fwd_kws:
+                _fwd_kws[rec.data.psm_name].update(fwd_kws[rec.data.psm_name])
+            mdl.forward(**_fwd_kws[rec.data.psm_name])
+            if mdl.output is None:
+                utils.p_warning(f'>>> Dropping proxy: {pid}')
+                self.obs_assim.df = obs.df.drop(obs.df[obs.df['pid'] == pid].index)
+                pseudo_obs[i] = np.nan
+            else:
+                pseudo_obs[i] = mdl.output
+        self.obs_assim.nobs = len(self.obs_assim.df)
+        # self.obs_assim.df['lat'] = nearest_lats
+        # self.obs_assim.df['lon'] = nearest_lons
+        pseudo_obs = pseudo_obs[~np.isnan(pseudo_obs).any(axis=1)]
+        self.Y = pseudo_obs
+        self.obs_assim.df['Ym'] = self.Y.mean(axis=1)
+    def get_dist(self, obs, s=1):
+        ''' Compute the distance between every state element and every observation site.
+        The result is stored in `self.dist` with one row per state element and
+        one column per observation.
+        Args:
+            obs: observations; `obs.df['lat']`, `obs.df['lon']` and `obs.nobs` are read
+            s (float or array of length nz): factor(s) multiplied onto the horizontal
+                distance for each depth level; only used when `self.nz` exists
+        '''
+        # Extract grid latitudes and longitudes as 2D arrays
+        lat_grid = self.ds[self.lat_name].values  # shape: (nlat, nlon)
+        lon_grid = self.ds[self.lon_name].values  # shape: (nlat, nlon)
+        if lat_grid.ndim == 1 and lon_grid.ndim == 1:
+            # If lat and lon are 1D, create a meshgrid
+            lon_grid, lat_grid = np.meshgrid(lon_grid, lat_grid)
+        # Flatten the grid arrays to 1D
+        lat_grid_flat = lat_grid.ravel()  # shape: (nlat * nlon,)
+        lon_grid_flat = lon_grid.ravel()  # shape: (nlat * nlon,)
+        # Get the observation lat/lon as 1D arrays
+        lats2 = obs.df['lat'].values  # shape: (nobs,)
+        lons2 = obs.df['lon'].values  # shape: (nobs,)
+        # Broadcast the grid cells to all observation points
+        lats1 = np.repeat(lat_grid_flat, obs.nobs)  # shape: (nlat * nlon * nobs,)
+        lons1 = np.repeat(lon_grid_flat, obs.nobs)  # shape: (nlat * nlon * nobs,)
+        # Repeat observation points for every grid point
+        lats2 = np.tile(lats2, len(lat_grid_flat))  # shape: (nlat * nlon * nobs,)
+        lons2 = np.tile(lons2, len(lon_grid_flat))  # shape: (nlat * nlon * nobs,)
+        # Horizontal distances, one row per grid cell and one column per observation
+        dist0 = utils.gcd(lats1, lons1, lats2, lons2).reshape((-1, obs.nobs))
+        if hasattr(self, 'nz'):
+            # 3D localization
+            # One factor per depth level; a scalar `s` is repeated for all levels
+            s = (np.ones(self.nz)*s).reshape(-1, 1, 1)
+            dist1 = (dist0[None, :, :] * s).reshape((-1, obs.nobs))
+            # The same distances are repeated for each of the nvar variables
+            # NOTE(review): this assumes every variable has the same (nz, nlat, nlon) layout
+            self.dist = dist1[np.newaxis, :].repeat(self.nvar, axis=0).reshape(-1, obs.nobs)
+        else:
+            # No depth dimension: repeat the horizontal distances for each variable
+            self.dist = dist0[None, :, :].repeat(self.nvar, axis=0).reshape(-1, obs.nobs)

mlda-2024.11.22/mlda/psm.py ADDED Viewed

@@ -0,0 +1,147 @@
+import xarray as xr
+import pybaywatch as pb
+import numpy as np
+from . import utils
+from . import obs
+class IdenticalSST:
+    def __init__(self, record:obs.ProxyRecord=None):
+        self.record = record
+    @property
+    def clim_vns(self):
+        return ['TEMP']
+    def forward(self):
+        self.output = self.record.clim['TEMP'].isel(z_t=0).values
+class IdenticalSSS:
+    def __init__(self, record:obs.ProxyRecord=None):
+        self.record = record
+    @property
+    def clim_vns(self):
+        return ['SALT']
+    def forward(self):
+        self.output = self.record.clim['SALT'].isel(z_t=0).values
+class IdenticalSSTSSS:
+    def __init__(self, record:obs.ProxyRecord=None):
+        self.record = record
+    @property
+    def clim_vns(self):
+        return ['TEMP', 'SALT']
+    def forward(self):
+        self.output = self.record.clim['TEMP'].isel(z_t=0).values+self.record.clim['SALT'].isel(z_t=0).values
+class TEX86:
+    def __init__(self, record:obs.ProxyRecord=None):
+        self.record = record
+    @property
+    def clim_vns(self):
+        return ['TEMP', 'tos', 'sst']
+    def forward(self, seed=2333, mode='analog', type='SST', tolerance=1):
+        if 'TEMP' in self.record.clim:
+            sst = self.record.clim['TEMP'].isel(z_t=0).values
+        elif 'tos' in self.record.clim:
+            sst = self.record.clim['tos'].values
+        elif 'sst' in self.record.clim:
+            sst = self.record.clim['sst'].values
+        lat = self.record.data.lat
+        lon = self.record.data.lon
+        lon180 = utils.lon180(lon)
+        # run
+        self.params = {
+            'lat': lat,
+            'lon': lon180,
+            'temp': sst,
+            'seed': seed,
+            'type': type,
+            'mode': mode,
+            'tolerance': tolerance,
+        }
+        res = pb.TEX_forward(**self.params)
+        if res['status'] == 'FAIL':
+            utils.p_warning(f'>>> Forward modeling failed for proxy: {self.meta["pid"]}')
+            self.output = None
+        else:
+            self.output = np.median(res['values'], axis=1)
+class UK37:
+    def __init__(self, record:obs.ProxyRecord=None):
+        self.record = record
+    @property
+    def clim_vns(self):
+        return ['TEMP', 'tos', 'sst']
+    def forward(self, order=3, seed=2333):
+        if 'TEMP' in self.record.clim:
+            sst = self.clim['TEMP'].isel(z_t=0).values
+        elif 'tos' in self.record.clim:
+            sst = self.record.clim['tos'].values
+        elif 'sst' in self.record.clim:
+            sst = self.record.clim['sst'].values
+        # run
+        self.params = {
+            'sst': sst,
+            'order': order,
+            'seed': seed,
+        }
+        res = pb.UK_forward(**self.params)
+        self.output = np.median(res['values'], axis=1)
+class MgCa:
+    def __init__(self, record:obs.ProxyRecord=None):
+        self.record = record
+    @property
+    def clim_vns(self):
+        return ['TEMP', 'tos', 'sst', 'SALT', 'sos', 'sss']
+    def forward(self, age, omega=None, pH=None, clean=None, species=None, sw=2, H=1, seed=2333):
+        if 'TEMP' in self.record.clim and 'SALT' in self.record.clim:
+            sst = self.record.clim['TEMP'].isel(z_t=0).values
+            sss = self.record.clim['SALT'].isel(z_t=0).values
+        elif 'tos' in self.record.clim and 'sos' in self.record.clim:
+            sst = self.record.clim['tos'].values
+            sss = self.record.clim['sos'].values
+        elif 'sst' in self.record.clim and 'sss' in self.record.clim:
+            sst = self.record.clim['sst'].values
+            sss = self.record.clim['sss'].values
+        # get omega and pH
+        lat = self.record.data.lat
+        lon = self.record.data.lon
+        depth = self.record.data.depth
+        if omega is None and pH is None:
+            lon180 = np.mod(lon + 180, 360) - 180
+            omega, pH = pb.core.omgph(lat, lon180, depth)
+        if clean is None: clean = self.record.data.clean
+        if species is None: species = self.record.data.species
+        # run
+        self.params = {
+            'age': age,
+            'sst': sst,
+            'salinity': sss,
+            'pH': pH,
+            'omega': omega,
+            'species': species,
+            'clean': clean,
+            'sw': sw,
+            'H': H,
+            'seed': seed,
+        }
+        res = pb.MgCa_forward(**self.params)
+        self.output = np.median(res['values'], axis=1)

mlda-2024.11.22/mlda/utils.py ADDED Viewed

@@ -0,0 +1,103 @@
+import numpy as np
+import xarray as xr
+import colorama as ca
+def p_header(text):
+    print(ca.Fore.CYAN + ca.Style.BRIGHT + text + ca.Style.RESET_ALL)
+def p_hint(text):
+    print(ca.Fore.LIGHTBLACK_EX + ca.Style.BRIGHT + text + ca.Style.RESET_ALL)
+def p_success(text):
+    print(ca.Fore.GREEN + ca.Style.BRIGHT + text + ca.Style.RESET_ALL)
+def p_fail(text):
+    print(ca.Fore.RED + ca.Style.BRIGHT + text + ca.Style.RESET_ALL)
+def p_warning(text):
+    print(ca.Fore.YELLOW + ca.Style.BRIGHT + text + ca.Style.RESET_ALL)
+def gcd(lat1, lon1, lat2, lon2, radius=6378.137):
+    ''' 2D Great Circle Distance [km]
+    Args:
+        radius (float): Earth radius
+    '''
+    # Convert degrees to radians
+    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
+    dlat, dlon = lat2 - lat1, lon2 - lon1
+    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
+    c = 2 * np.arcsin(np.sqrt(a))
+    dist = radius * c
+    return dist
+def states2ds(states, ds):
+    original_shapes = {vn: ds[vn].shape for vn in ds.data_vars}
+    original_dims = {vn: ds[vn].dims for vn in ds.data_vars}
+    original_coords = {vn: ds[vn].coords for vn in ds.data_vars}
+    ds_out = xr.Dataset()
+    start_loc = 0
+    for vn in ds.data_vars:
+        if 'ens' in original_dims[vn]:
+            end_loc = start_loc + np.prod(original_shapes[vn][:-1])
+            # p_hint(f'{np.prod(original_shapes[vn][:-1]) = }')
+        else:
+            end_loc = start_loc + np.prod(original_shapes[vn])
+            # p_hint(f'{np.prod(original_shapes[vn]) = }')
+        # p_hint(f'{vn = }')
+        # p_hint(f'{start_loc = }')
+        # p_hint(f'{end_loc = }')
+        # p_hint(f'{np.shape(states) = }')
+        # p_hint(f'{np.shape(states[start_loc:end_loc]) = }')
+        data = states[start_loc:end_loc].reshape(original_shapes[vn])
+        nan_mask = np.isnan(ds[vn].values)
+        data[nan_mask] = np.nan
+        ds_out[vn] = xr.DataArray(
+            data,
+            dims=original_dims[vn],
+            coords=original_coords[vn],
+        )
+        start_loc = end_loc
+        ds_out[vn].attrs = ds[vn].attrs
+    return ds_out
+# def gcd_3d(loc1, loc2, radius=6371.0):
+#     ''' 3D Great Circle Distance [km]
+#     Args:
+#         loc1 (tuple): lat1 [degree], lon1 [degree], depth1 [km]
+#         loc2 (tuple): lat2 [degree], lon2 [degree], depth2 [km]
+#         radius (float): Earth radius
+#     '''
+#     lat1, lon1, depth1 = loc1
+#     lat2, lon2, depth2 = loc2
+#     # Convert degrees to radians
+#     lat1, lon1 = np.radians(lat1), np.radians(lon1)
+#     lat2, lon2 = np.radians(lat2), np.radians(lon2)
+#     # Calculate radial distances (Earth's radius minus depth)
+#     r1 = radius - depth1
+#     r2 = radius - depth2
+#     # Compute central angle component
+#     central_angle = np.sin(lat1) * np.sin(lat2) + np.cos(lat1) * np.cos(lat2) * np.cos(lon2 - lon1)
+#     # Compute the 3D distance
+#     distance_3d = np.sqrt(r1**2 + r2**2 - 2 * r1 * r2 * central_angle)
+#     return distance_3d
+def str2list(s, sep=','):
+    l = [int(ss.strip()) for ss in s.split(sep)]
+    return l
+def lon360(lon180):
+    return np.mod(lon180, 360)
+def lon180(lon360):
+    return np.mod(lon360 + 180, 360) - 180

mlda-2024.11.22/mlda.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,25 @@
+Metadata-Version: 2.1
+Name: mlda
+Version: 2024.11.22
+Summary: mlda: A Python package for Machine Learning-based Data Assimilation
+Home-page: https://github.com/fzhu2e/mlda
+Author: Feng Zhu, Weimin Si
+Author-email: fengzhu@ucar.edu, weimin_si@brown.edu
+License: BSD-3
+Keywords: Machine Learning,Data Assimilation
+Classifier: Natural Language :: English
+Classifier: Programming Language :: Python :: 3.12
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: netCDF4
+Requires-Dist: xarray
+Requires-Dist: dask
+Requires-Dist: nc-time-axis
+Requires-Dist: colorama
+Requires-Dist: tqdm
+Requires-Dist: x4c-exp
+# mlda: A Python package for Machine Learning-based Data Assimilation
+`mlda` is a Python package for Machine Learning-based Data Assimilation (DA).
+It aims to provide a universal framework and the corresponding utilities for conducting reproducible data assimilation experiments using novel machine learning-based DA methods.

mlda-2024.11.22/mlda.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,15 @@
+LICENSE
+README.md
+setup.py
+mlda/__init__.py
+mlda/da.py
+mlda/obs.py
+mlda/prior.py
+mlda/psm.py
+mlda/utils.py
+mlda.egg-info/PKG-INFO
+mlda.egg-info/SOURCES.txt
+mlda.egg-info/dependency_links.txt
+mlda.egg-info/not-zip-safe
+mlda.egg-info/requires.txt
+mlda.egg-info/top_level.txt

mlda-2024.11.22/mlda.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

mlda-2024.11.22/mlda.egg-info/not-zip-safe ADDED Viewed

	@@ -0,0 +1 @@
1	+

mlda-2024.11.22/mlda.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,7 @@
+netCDF4
+xarray
+dask
+nc-time-axis
+colorama
+tqdm
+x4c-exp

mlda-2024.11.22/mlda.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ mlda

mlda-2024.11.22/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

mlda-2024.11.22/setup.py ADDED Viewed

@@ -0,0 +1,33 @@
+from setuptools import setup, find_packages
+with open('README.md', 'r') as fh:
+    long_description = fh.read()
+setup(
+    name='mlda',  # required
+    version='2024.11.22',
+    description='mlda: A Python package for Machine Learning-base Data Assimilation',
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    author='Feng Zhu, Weimin Si',
+    author_email='fengzhu@ucar.edu, weimin_si@brown.edu',
+    url='https://github.com/fzhu2e/mlda',
+    packages=find_packages(),
+    include_package_data=True,
+    license='BSD-3',
+    zip_safe=False,
+    keywords=['Machine Learning', 'Data Assimilation'],
+    classifiers=[
+        'Natural Language :: English',
+        'Programming Language :: Python :: 3.12',
+    ],
+    install_requires=[
+        'netCDF4',
+        'xarray',
+        'dask',
+        'nc-time-axis',
+        'colorama',
+        'tqdm',
+        'x4c-exp',
+    ],
+)