pydartdiags-0.5.0-py3-none-any.whl → pydartdiags-0.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


--- a/pydartdiags/stats/stats.py
+++ b/pydartdiags/stats/stats.py
@@ -2,8 +2,7 @@
 import pandas as pd
 import numpy as np
 from functools import wraps
-
-# from pydartdiags.obs_sequence import obs_sequence as obsq
+from datetime import datetime, timedelta
 
 
 def apply_to_phases_in_place(func):
@@ -39,20 +38,30 @@ def apply_to_phases_by_type_return_df(func):
             result = func(df, phase, *args, **kwargs)
             results.append(result)
 
-        if "midpoint" in result.columns:
-            if len(results) == 2:
-                return pd.merge(
-                    results[0],
-                    results[1],
-                    on=["midpoint", "vlevels", "type", "vert_unit"],
-                )
-            else:
-                return results[0]
+        if not results:
+            return (
+                pd.DataFrame()
+            )  # Return an empty DataFrame if no results are generated
+
+        # Dynamically determine merge keys based on common columns
+        common_columns = set(results[0].columns)
+        for result in results[1:]:
+            common_columns &= set(result.columns)
+
+        # Exclude phase-specific columns from the merge keys
+        phase_specific_columns = {
+            f"{phase}_sq_err",
+            f"{phase}_bias",
+            f"{phase}_totalvar",
+            f"{phase}_rmse",
+            f"{phase}_totalspread",
+        }
+        merge_keys = list(common_columns - phase_specific_columns)
+
+        if len(results) == 2:
+            return pd.merge(results[0], results[1], on=merge_keys)
         else:
-            if len(results) == 2:
-                return pd.merge(results[0], results[1], on="type")
-            else:
-                return results[0]
+            return results[0]
 
     return wrapper
 
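Taken on its own, the new merge-key logic is easy to verify; below is a minimal standalone sketch of the same technique, with toy frames and column names that are not from the package:

```python
import pandas as pd

# Two per-phase result frames that share the non-statistic columns.
prior = pd.DataFrame({"type": ["RADIOSONDE_TEMPERATURE"], "prior_rmse": [1.2]})
posterior = pd.DataFrame({"type": ["RADIOSONDE_TEMPERATURE"], "posterior_rmse": [0.9]})
results = [prior, posterior]

# Intersect the column sets to find candidate join columns.
common_columns = set(results[0].columns)
for result in results[1:]:
    common_columns &= set(result.columns)

# Strip phase-specific statistic columns; whatever remains are the keys.
stats = ("sq_err", "bias", "totalvar", "rmse", "totalspread")
phase_specific = {f"{p}_{s}" for p in ("prior", "posterior") for s in stats}
merge_keys = list(common_columns - phase_specific)  # here: ["type"]

print(pd.merge(results[0], results[1], on=merge_keys))
```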
@@ -82,6 +91,12 @@ def calculate_rank(df, phase):
     """
     Calculate the rank of observations within an ensemble.
 
+    Note:
+
+        This function is decorated with @apply_to_phases_by_obs, which modifies its usage.
+        You should call it as calculate_rank(df), and the decorator will automatically apply the
+        function to all relevant phases ('prior' and 'posterior').
+
     This function takes a DataFrame containing ensemble predictions and observed values,
     adds sampling noise to the ensemble predictions, and calculates the rank of the observed
     value within the perturbed ensemble for each observation. The rank indicates the position
@@ -92,8 +107,6 @@ def calculate_rank(df, phase):
     Parameters:
         df (pd.DataFrame): A DataFrame with columns for rank and observation type.
 
-        phase (str): The phase for which to calculate the statistics ('prior' or 'posterior')
-
     Returns:
         DataFrame containing columns for 'rank' and observation 'type'.
     """
@@ -147,15 +160,20 @@ def diag_stats(df, phase):
     """
     Calculate diagnostic statistics for a given phase and add them to the DataFrame.
 
+    Note:
+        This function is decorated with @apply_to_phases_in_place, which modifies its usage.
+        You should call it as diag_stats(df), and the decorator will automatically apply the
+        function to all relevant phases ('prior' and 'posterior'), modifying the DataFrame
+        in place.
+
     Args:
         df (pandas.DataFrame): The input DataFrame containing observation data and ensemble statistics.
-            The DataFrame must include the following columns:
-            - 'observation': The actual observation values.
-            - 'obs_err_var': The variance of the observation error.
-            - 'prior_ensemble_mean' and/or 'posterior_ensemble_mean': The mean of the ensemble.
-            - 'prior_ensemble_spread' and/or 'posterior_ensemble_spread': The spread of the ensemble.
+            The DataFrame must include the following columns:
 
-        phase (str): The phase for which to calculate the statistics ('prior' or 'posterior')
+            - 'observation': The actual observation values.
+            - 'obs_err_var': The variance of the observation error.
+            - 'prior_ensemble_mean' and/or 'posterior_ensemble_mean': The mean of the ensemble.
+            - 'prior_ensemble_spread' and/or 'posterior_ensemble_spread': The spread of the ensemble.
 
     Returns:
         None: The function modifies the DataFrame in place by adding the following columns:
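The calling convention in the Note is worth seeing in reduced form. This sketch re-implements the decorator contract with a hypothetical toy_diag_stats and assumed formulas; the package's own diag_stats computes more than this:

```python
import pandas as pd
from functools import wraps

def apply_to_phases_in_place(func):
    # Sketch of the decorator contract described in the Note: run func once
    # per phase whose ensemble-mean column is present in the DataFrame.
    @wraps(func)
    def wrapper(df, *args, **kwargs):
        for phase in ("prior", "posterior"):
            if f"{phase}_ensemble_mean" in df.columns:
                func(df, phase, *args, **kwargs)
    return wrapper

@apply_to_phases_in_place
def toy_diag_stats(df, phase):
    # Hypothetical formulas for illustration only.
    df[f"{phase}_bias"] = df[f"{phase}_ensemble_mean"] - df["observation"]
    df[f"{phase}_sq_err"] = df[f"{phase}_bias"] ** 2
    df[f"{phase}_totalvar"] = df["obs_err_var"] + df[f"{phase}_ensemble_spread"] ** 2

df = pd.DataFrame(
    {
        "observation": [290.5],
        "obs_err_var": [0.25],
        "prior_ensemble_mean": [290.0],
        "prior_ensemble_spread": [1.0],
    }
)
toy_diag_stats(df)  # no phase argument; only 'prior' columns exist, so one pass
print(sorted(df.columns))
```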
@@ -192,9 +210,12 @@ def bin_by_layer(df, levels, verticalUnit="pressure (Pa)"):
     vertical level bin. Only observations (rows) with the specified vertical unit are binned.
 
     Args:
-        df (pandas.DataFrame): The input DataFrame containing observation data. The DataFrame must include the following columns:
+        df (pandas.DataFrame): The input DataFrame containing observation data.
+            The DataFrame must include the following columns:
+
             - 'vertical': The vertical coordinate values of the observations.
             - 'vert_unit': The unit of the vertical coordinate values.
+
         levels (list): A list of bin edges for the vertical levels.
         verticalUnit (str, optional): The unit of the vertical axis (e.g., 'pressure (Pa)'). Default is 'pressure (Pa)'.
 
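The documented columns drive a pd.cut binning like the one below, a toy sketch with invented pressure values (note that pd.cut requires monotonically increasing bin edges):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "vertical": [95000.0, 70000.0, 30000.0],  # Pa
        "vert_unit": ["pressure (Pa)"] * 3,
    }
)
levels = [10000, 50000, 80000, 100000]  # bin edges must increase for pd.cut

# Bin only the rows whose vertical unit matches, then take interval midpoints.
mask = df["vert_unit"] == "pressure (Pa)"
df.loc[mask, "vlevels"] = pd.cut(df.loc[mask, "vertical"], levels)
df["midpoint"] = df["vlevels"].apply(lambda x: x.mid)
print(df[["vertical", "vlevels", "midpoint"]])
```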
@@ -211,19 +232,67 @@ def bin_by_layer(df, levels, verticalUnit="pressure (Pa)"):
     df.loc[df["vert_unit"] == verticalUnit, "vlevels"] = pd.cut(
         df.loc[df["vert_unit"] == verticalUnit, "vertical"], levels
     )
-    if verticalUnit == "pressure (Pa)":
-        df.loc[:, "midpoint"] = df["vlevels"].apply(
-            lambda x: x.mid
-        )  # HK todo units HPa - change now or in plotting?
-        df.loc[:, "vlevels"] = df["vlevels"].apply(
-            lambda x: x
-        )  # HK todo units HPa - change now or in plotting?
-    else:
-        df.loc[:, "midpoint"] = df["vlevels"].apply(lambda x: x.mid)
+    df.loc[:, "midpoint"] = df["vlevels"].apply(lambda x: x.mid)
+
+
+def bin_by_time(df, time_value):
+    """
+    Bin observations by time and add 'time_bin' and 'time_bin_midpoint' columns to the DataFrame.
+    The first bin starts 1 second before the minimum time value, so the minimum time is included in the
+    first bin. The last bin is inclusive of the maximum time value.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame containing a 'time' column.
+        time_value (str): The width of each time bin (e.g., '3600S' for 1 hour).
+
+    Returns:
+        None: The function modifies the DataFrame in place by adding 'time_bin' and 'time_bin_midpoint' columns.
+    """
+    # Create time bins
+    start = df["time"].min() - timedelta(seconds=1)
+    end = df["time"].max()
+    # Determine if the end time aligns with the bin boundary
+    time_delta = pd.Timedelta(time_value)
+    aligned_end = (pd.Timestamp(end) + time_delta).floor(time_value)
+
+    time_bins = pd.date_range(
+        start=start,
+        end=aligned_end,
+        freq=time_value,
+    )
+
+    df["time_bin"] = pd.cut(df["time"], bins=time_bins)
+
+    # Calculate the midpoint of each time bin
+    df["time_bin_midpoint"] = df["time_bin"].apply(
+        lambda x: x.left + (x.right - x.left) / 2 if pd.notnull(x) else None
+    )
 
 
 @apply_to_phases_by_type_return_df
 def grand_statistics(df, phase):
+    """
+    Calculate grand statistics (RMSE, bias, total spread) for each observation type and phase.
+
+    This function assumes that diagnostic statistics (such as squared error, bias, and total variance)
+    have already been computed by :func:`diag_stats` and are present in the DataFrame. It groups the data by observation
+    type and computes the root mean square error (RMSE), mean bias, and total spread for the specified phase.
+
+    Note:
+        This function is decorated with @apply_to_phases_by_type_return_df, which modifies its usage.
+        You should call it as grand_statistics(df), and the decorator will automatically apply the function
+        to all relevant phases ('prior' and 'posterior') and return a merged DataFrame.
+
+    Args:
+        df (pandas.DataFrame): The input DataFrame containing diagnostic statistics for observations.
+
+    Returns:
+        pandas.DataFrame: A DataFrame with columns:
+            - 'type': The observation type.
+            - '{phase}_rmse': The root mean square error for the phase.
+            - '{phase}_bias': The mean bias for the phase.
+            - '{phase}_totalspread': The total spread for the phase.
+    """
 
     # assuming diag_stats has been called
     grand = (
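The bin_by_time recipe above can be exercised on a toy 'time' column; the times below are chosen so every observation lands in a bin, and the bin width is spelled '3600s' (lowercase) to avoid the deprecated 'S' alias in newer pandas:

```python
import pandas as pd
from datetime import timedelta

df = pd.DataFrame(
    {"time": pd.to_datetime(["2024-01-01 00:00", "2024-01-01 00:40", "2024-01-01 01:20"])}
)
time_value = "3600s"  # one-hour bins

# Same recipe as bin_by_time: pad the start by one second so the minimum
# time falls inside the first bin, then extend the end to a bin boundary.
start = df["time"].min() - timedelta(seconds=1)
aligned_end = (pd.Timestamp(df["time"].max()) + pd.Timedelta(time_value)).floor(time_value)

time_bins = pd.date_range(start=start, end=aligned_end, freq=time_value)
df["time_bin"] = pd.cut(df["time"], bins=time_bins)
df["time_bin_midpoint"] = df["time_bin"].apply(
    lambda x: x.left + (x.right - x.left) / 2 if pd.notnull(x) else None
)
print(df[["time", "time_bin_midpoint"]])
```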
@@ -246,6 +315,33 @@ def grand_statistics(df, phase):
 
 @apply_to_phases_by_type_return_df
 def layer_statistics(df, phase):
+    """
+    Calculate statistics (RMSE, bias, total spread) for each observation type and vertical layer.
+
+    This function assumes that diagnostic statistics (such as squared error, bias, and total variance)
+    have already been computed with :func:`diag_stats` and are present in the DataFrame. It groups the data by
+    vertical layer midpoint and observation type, and computes the root mean square error (RMSE),
+    mean bias, and total spread for the specified phase for each vertical layer.
+
+    Note:
+        This function is decorated with @apply_to_phases_by_type_return_df, which modifies its usage.
+        You should call it as layer_statistics(df), and the decorator will automatically apply the function
+        to all relevant phases ('prior' and 'posterior') and return a merged DataFrame.
+
+    Args:
+        df (pandas.DataFrame): The input DataFrame containing diagnostic statistics for observations.
+        phase (str): The phase for which to calculate the statistics ('prior' or 'posterior').
+
+    Returns:
+        pandas.DataFrame: A DataFrame with columns:
+            - 'midpoint': The midpoint of the vertical layer.
+            - 'type': The observation type.
+            - '{phase}_rmse': The root mean square error for the phase.
+            - '{phase}_bias': The mean bias for the phase.
+            - '{phase}_totalspread': The total spread for the phase.
+            - 'vert_unit': The vertical unit.
+            - 'vlevels': The categorized vertical level.
+    """
 
     # assuming diag_stats has been called
     layer_stats = (
@@ -270,13 +366,65 @@ def layer_statistics(df, phase):
     return layer_stats
 
 
+@apply_to_phases_by_type_return_df
+def time_statistics(df, phase):
+    """
+    Calculate time-based statistics (RMSE, bias, total spread) for each observation type and time bin.
+
+    This function assumes that diagnostic statistics (such as squared error, bias, and total variance)
+    have already been computed by :func:`diag_stats` and are present in the DataFrame. It groups the data
+    by time bin midpoint and observation type, and computes the root mean square error (RMSE), mean bias,
+    and total spread for the specified phase for each time bin.
+
+    Note:
+        This function is decorated with @apply_to_phases_by_type_return_df.
+        You should call it as time_statistics(df), and the decorator will automatically apply the function
+        to all relevant phases ('prior' and 'posterior') and return a merged DataFrame.
+
+    Args:
+        df (pandas.DataFrame): The input DataFrame containing diagnostic statistics for observations.
+        phase (str): The phase for which to calculate the statistics ('prior' or 'posterior').
+
+    Returns:
+        pandas.DataFrame: A DataFrame with columns:
+            - 'time_bin_midpoint': The midpoint of the time bin.
+            - 'type': The observation type.
+            - '{phase}_rmse': The root mean square error for the phase.
+            - '{phase}_bias': The mean bias for the phase.
+            - '{phase}_totalspread': The total spread for the phase.
+            - 'time_bin': The time bin interval.
+            - 'time': The first time value in the bin.
+    """
+    # Assuming diag_stats has been called
+    time_stats = (
+        df.groupby(["time_bin_midpoint", "type"], observed=False)
+        .agg(
+            {
+                f"{phase}_sq_err": mean_then_sqrt,
+                f"{phase}_bias": "mean",
+                f"{phase}_totalvar": mean_then_sqrt,
+                "time_bin": "first",
+                "time": "first",
+            }
+        )
+        .reset_index()
+    )
+
+    time_stats.rename(columns={f"{phase}_sq_err": f"{phase}_rmse"}, inplace=True)
+    time_stats.rename(
+        columns={f"{phase}_totalvar": f"{phase}_totalspread"}, inplace=True
+    )
+
+    return time_stats
+
+
 def possible_vs_used(df):
     """
     Calculates the count of possible vs. used observations by type.
 
     This function takes a DataFrame containing observation data, including a 'type' column for the observation
     type and an 'observation' column. The number of used observations ('used') is the total number
-    minus the observations that failed quality control checks (as determined by the `select_failed_qcs` function).
+    of assimilated observations (as determined by the `select_used_qcs` function).
     The result is a DataFrame with each observation type, the count of possible observations, and the count of
     used observations.
 
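time_statistics (like grand_statistics and layer_statistics) leans on a helper named mean_then_sqrt that is defined elsewhere in the package. A plausible stand-in makes the aggregation concrete; the frame and values below are invented:

```python
import numpy as np
import pandas as pd

def mean_then_sqrt(x):
    # Stand-in for the package helper of the same name: with per-row squared
    # errors this yields RMSE; with total variance it yields total spread.
    return np.sqrt(np.mean(x))

df = pd.DataFrame(
    {
        "time_bin_midpoint": ["00:30", "00:30", "01:30"],
        "type": ["RADIOSONDE_TEMPERATURE"] * 3,
        "prior_sq_err": [1.0, 4.0, 9.0],
        "prior_bias": [-1.0, 2.0, 3.0],
        "prior_totalvar": [2.0, 2.0, 2.0],
    }
)

time_stats = (
    df.groupby(["time_bin_midpoint", "type"], observed=False)
    .agg(
        {
            "prior_sq_err": mean_then_sqrt,
            "prior_bias": "mean",
            "prior_totalvar": mean_then_sqrt,
        }
    )
    .reset_index()
    .rename(columns={"prior_sq_err": "prior_rmse", "prior_totalvar": "prior_totalspread"})
)
print(time_stats)  # RMSE for the first bin: sqrt((1 + 4) / 2)
```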
@@ -288,8 +436,8 @@ def possible_vs_used(df):
     possible = df.groupby("type")["observation"].count()
     possible.rename("possible", inplace=True)
 
-    failed_qcs = select_failed_qcs(df).groupby("type")["observation"].count()
-    used = possible - failed_qcs.reindex(possible.index, fill_value=0)
+    used_qcs = select_used_qcs(df).groupby("type")["observation"].count()
+    used = used_qcs.reindex(possible.index, fill_value=0)
     used.rename("used", inplace=True)
 
     return pd.concat([possible, used], axis=1).reset_index()
@@ -302,22 +450,61 @@ def possible_vs_used_by_layer(df):
     possible = df.groupby(["type", "midpoint"], observed=False)["type"].count()
     possible.rename("possible", inplace=True)
 
-    failed_qcs = (
-        select_failed_qcs(df)
+    used_qcs = (
+        select_used_qcs(df)
         .groupby(["type", "midpoint"], observed=False)["type"]
         .count()
     )
-    used = possible - failed_qcs.reindex(possible.index, fill_value=0)
+
+    used = used_qcs.reindex(possible.index, fill_value=0)
     used.rename("used", inplace=True)
 
     return pd.concat([possible, used], axis=1).reset_index()
 
 
-def select_failed_qcs(df):
+def select_used_qcs(df):
     """
-    Select rows from the DataFrame where the DART quality control flag is greater than 0.
+    Select rows from the DataFrame where the observation was used.
+    Includes observations for which the posterior forward observation operators failed.
 
     Returns:
-        pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag greater than 0.
+        pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag 0 or 2.
+    """
+    return df[(df["DART_quality_control"] == 0) | (df["DART_quality_control"] == 2)]
+
+
+def possible_vs_used_by_time(df):
     """
-    return df[df["DART_quality_control"] > 0]
+    Calculates the count of possible vs. used observations by type and time bin.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame containing observation data.
+            The DataFrame must include:
+
+            - 'type': The observation type.
+            - 'time_bin_midpoint': The midpoint of the time bin.
+            - 'observation': The observation values.
+            - 'DART_quality_control': The quality control flag.
+
+    Returns:
+        pd.DataFrame: A DataFrame with the following columns:
+            - 'time_bin_midpoint': The midpoint of the time bin.
+            - 'type': The observation type.
+            - 'possible': The count of all observations in the time bin.
+            - 'used': The count of observations in the time bin that passed quality control checks.
+    """
+    # Count all observations (possible) grouped by time_bin_midpoint and type
+    possible = df.groupby(["time_bin_midpoint", "type"], observed=False)["type"].count()
+    possible.rename("possible", inplace=True)
+
+    # Count used observations (QC=0 or QC=2) grouped by time_bin_midpoint and type
+    used_qcs = (
+        select_used_qcs(df)
+        .groupby(["time_bin_midpoint", "type"], observed=False)["type"]
+        .count()
+    )
+    used = used_qcs.reindex(possible.index, fill_value=0)
+    used.rename("used", inplace=True)
+
+    # Combine possible and used into a single DataFrame
+    return pd.concat([possible, used], axis=1).reset_index()
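The counting is now additive rather than subtractive: 'used' counts the rows select_used_qcs keeps (DART QC 0 or 2) instead of subtracting failures from 'possible'. A toy run of the same recipe:

```python
import pandas as pd

def select_used_qcs(df):
    # Mirrors the new selector: QC 0 = assimilated, QC 2 = assimilated but
    # the posterior forward operator failed.
    return df[(df["DART_quality_control"] == 0) | (df["DART_quality_control"] == 2)]

df = pd.DataFrame(
    {
        "type": ["RADIOSONDE_TEMPERATURE"] * 3 + ["ACARS_TEMPERATURE"],
        "observation": [1.0, 2.0, 3.0, 4.0],
        "DART_quality_control": [0, 2, 7, 0],
    }
)

possible = df.groupby("type")["observation"].count().rename("possible")
used = (
    select_used_qcs(df)
    .groupby("type")["observation"]
    .count()
    .reindex(possible.index, fill_value=0)
    .rename("used")
)
print(pd.concat([possible, used], axis=1).reset_index())
# ACARS_TEMPERATURE: 1 possible, 1 used; RADIOSONDE_TEMPERATURE: 3 possible, 2 used
```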
--- a/pydartdiags-0.5.0.dist-info/METADATA
+++ b/pydartdiags-0.6.0.dist-info/METADATA
@@ -1,15 +1,15 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: pydartdiags
-Version: 0.5.0
+Version: 0.6.0
 Summary: Observation Sequence Diagnostics for DART
 Home-page: https://github.com/NCAR/pyDARTdiags.git
 Author: Helen Kershaw
 Author-email: Helen Kershaw <hkershaw@ucar.edu>
+License-Expression: Apache-2.0
 Project-URL: Homepage, https://github.com/NCAR/pyDARTdiags.git
 Project-URL: Issues, https://github.com/NCAR/pyDARTdiags/issues
 Project-URL: Documentation, https://ncar.github.io/pyDARTdiags
 Classifier: Programming Language :: Python :: 3
-Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
@@ -21,6 +21,7 @@ Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: matplotlib>=3.9.4
 Dynamic: author
 Dynamic: home-page
+Dynamic: license-file
 Dynamic: requires-python
 
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
--- /dev/null
+++ b/pydartdiags-0.6.0.dist-info/RECORD
@@ -0,0 +1,15 @@
+pydartdiags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/matplots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/matplots/matplots.py,sha256=Bo0TTz1gvsHEvTfTfLfdTi_3hNRN1okmyY5a5yYgtzk,13455
+pydartdiags/obs_sequence/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/obs_sequence/composite_types.yaml,sha256=PVLMU6x6KcVMCwPB-U65C_e0YQUemfqUhYMpf1DhFOY,917
+pydartdiags/obs_sequence/obs_sequence.py,sha256=5HfqOPoF2DyZQrUiGrYEwLJ9Iewe5DIzq0pdxR3bsnk,48037
+pydartdiags/plots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/plots/plots.py,sha256=U7WQjE_qN-5a8-85D-PkkgILSFBzTJQ1mcGBa7l5DHI,6464
+pydartdiags/stats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/stats/stats.py,sha256=a88VuLoHOlhbjYjnrVPHVNnhiDx-4B3YA1jbc6FUSyU,20193
+pydartdiags-0.6.0.dist-info/licenses/LICENSE,sha256=ROglds_Eg_ylXp-1MHmEawDqMw_UsCB4r9sk7z9PU9M,11377
+pydartdiags-0.6.0.dist-info/METADATA,sha256=ZeVGK6hTX2tgIiedCVcavDPn195yCh8LO9-ziliePog,2381
+pydartdiags-0.6.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+pydartdiags-0.6.0.dist-info/top_level.txt,sha256=LfMoPLnSd0VhhlWev1eeX9t6AzvyASOloag0LO_ppWg,12
+pydartdiags-0.6.0.dist-info/RECORD,,
--- a/pydartdiags-0.5.0.dist-info/WHEEL
+++ b/pydartdiags-0.6.0.dist-info/WHEEL
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.8.0)
+Generator: setuptools (80.9.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
--- a/pydartdiags-0.5.0.dist-info/RECORD
+++ /dev/null
@@ -1,14 +0,0 @@
-pydartdiags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/matplots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/matplots/matplots.py,sha256=44MlD98gaQsrCT0mW6M9f0a2-clm3KEGrdYqkTUO0RI,7478
-pydartdiags/obs_sequence/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/obs_sequence/obs_sequence.py,sha256=kdPOWAqgiyuv6cTdhYx1u9Ru6zCKF0Wd--7-sM3m5F8,44527
-pydartdiags/plots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/plots/plots.py,sha256=U7WQjE_qN-5a8-85D-PkkgILSFBzTJQ1mcGBa7l5DHI,6464
-pydartdiags/stats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/stats/stats.py,sha256=tzjE6HBrw6s9Li0UlJ_sNMcGEU8loT_BA5SDZp-UTOc,12138
-pydartdiags-0.5.0.dist-info/LICENSE,sha256=ROglds_Eg_ylXp-1MHmEawDqMw_UsCB4r9sk7z9PU9M,11377
-pydartdiags-0.5.0.dist-info/METADATA,sha256=F6znTR7qrj2qoGBYNojmWiaOqa9EAETgphV7i0HW0xc,2391
-pydartdiags-0.5.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-pydartdiags-0.5.0.dist-info/top_level.txt,sha256=LfMoPLnSd0VhhlWev1eeX9t6AzvyASOloag0LO_ppWg,12
-pydartdiags-0.5.0.dist-info/RECORD,,