pydartdiags 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydartdiags/matplots/matplots.py +200 -20
- pydartdiags/obs_sequence/composite_types.yaml +35 -0
- pydartdiags/obs_sequence/obs_sequence.py +198 -83
- pydartdiags/stats/stats.py +141 -32
- {pydartdiags-0.5.0.dist-info → pydartdiags-0.5.1.dist-info}/METADATA +3 -2
- pydartdiags-0.5.1.dist-info/RECORD +15 -0
- {pydartdiags-0.5.0.dist-info → pydartdiags-0.5.1.dist-info}/WHEEL +1 -1
- pydartdiags-0.5.0.dist-info/RECORD +0 -14
- {pydartdiags-0.5.0.dist-info → pydartdiags-0.5.1.dist-info/licenses}/LICENSE +0 -0
- {pydartdiags-0.5.0.dist-info → pydartdiags-0.5.1.dist-info}/top_level.txt +0 -0
pydartdiags/matplots/matplots.py
CHANGED
@@ -28,35 +28,56 @@ def plot_profile(obs_seq, levels, type, bias=True, rmse=True, totalspread=True):
 
     # calculate stats and add to dataframe
     stats.diag_stats(obs_seq.df)
-    qc0 = obs_seq.
+    qc0 = stats.select_used_qcs(obs_seq.df)  # filter only qc=0, qc=2
 
     # filter by type
    qc0 = qc0[qc0["type"] == type]
-
+    if qc0.empty:
+        print(f"No rows found for type: {type}")
+        return None
+
+    all_df = obs_seq.df[obs_seq.df["type"] == type]  # for possible vs used
+
+    if all_df["vert_unit"].nunique() > 1:
+        print(
+            f"Multiple vertical units found in the data: {all_df['vert_unit'].unique()} for type: {type}"
+        )
+        return None
+
+    vert_unit = all_df.iloc[0]["vert_unit"]
+    if vert_unit == "pressure (Pa)":
+        conversion = 0.01  # from Pa to hPa
+    else:
+        conversion = 1.0  # no conversion needed
 
     # grand statistics
     grand = stats.grand_statistics(qc0)
 
     # add level bins to the dataframe
-    stats.bin_by_layer(all_df, levels)
-    stats.bin_by_layer(qc0, levels)
+    stats.bin_by_layer(all_df, levels, verticalUnit=vert_unit)
+    stats.bin_by_layer(qc0, levels, verticalUnit=vert_unit)
 
     # aggregate by layer
     df_pvu = stats.possible_vs_used_by_layer(all_df)  # possible vs used
     df = stats.layer_statistics(qc0)  # bias, rmse, totalspread for plotting
 
-
+    # using rmse because mean_sqrt vs mean for bias (get a column with 0 obs)
+    if "prior_rmse" not in df.columns:
+        print(f"All layers empty for type: {type}")
+        return None
+
+    fig, ax1 = plt.subplots(figsize=(8, 8))
 
     # convert to hPa HK @todo only for Pressure (Pa)
     df["midpoint"] = df["midpoint"].astype(float)
-    df["midpoint"] = df["midpoint"]
+    df["midpoint"] = df["midpoint"] * conversion
 
     df_pvu["midpoint"] = df_pvu["midpoint"].astype(float)
-    df_pvu["midpoint"] = df_pvu["midpoint"]
+    df_pvu["midpoint"] = df_pvu["midpoint"] * conversion
 
     # Add horizontal stripes alternating between gray and white to represent the vertical levels
-    left = df["vlevels"].apply(lambda x: x.left
-    right = df["vlevels"].apply(lambda x: x.right
+    left = df["vlevels"].apply(lambda x: x.left * conversion)  # todo convert to HPa
+    right = df["vlevels"].apply(lambda x: x.right * conversion)
     for i in range(len(left)):
         color = "gray" if i % 2 == 0 else "white"
         ax1.axhspan(left.iloc[i], right.iloc[i], color=color, alpha=0.3)
@@ -150,33 +171,41 @@ def plot_profile(obs_seq, levels, type, bias=True, rmse=True, totalspread=True):
     )
     ax3.set_xlim(left=0)
 
-
+    if vert_unit == "pressure (Pa)":
+        ax1.invert_yaxis()
     ax1.set_title(type)
-
-
+    # Build the datalabel string
+    datalabel = []
+    if bias:
+        datalabel.append("bias")
+    if rmse:
+        datalabel.append("rmse")
+    if totalspread:
+        datalabel.append("totalspread")
+    ax1.set_xlabel(", ".join(datalabel))
 
     lines1, labels1 = ax1.get_legend_handles_labels()
     ax1.legend(lines1, labels1, loc="upper left", bbox_to_anchor=(1.05, 1))
 
     ax1.text(
-        0.
+        0.6, -0.08, obs_seq.file, ha="center", va="center", transform=ax1.transAxes
     )
 
     # Add a text box with information below the legend
     textstr = "Grand statistics:\n"
     if bias:
-        textstr += f"
+        textstr += f"prior_bias: {bias_prior:.7f}\n"
     if rmse:
-        textstr += f"
+        textstr += f"rmse_prior: {rmse_prior:.7f}\n"
     if totalspread:
-        textstr += f"
+        textstr += f"totalspread_prior: {totalspread_prior:.7f}\n"
     if "posterior_bias" in df.columns:
         if bias:
-            textstr += f"
+            textstr += f"posterior_bias: {bias_posterior:.7f}\n"
         if rmse:
-            textstr += f"
+            textstr += f"rmse_posterior: {rmse_posterior:.7f}\n"
         if totalspread:
-            textstr += f"
+            textstr += f"totalspread_posterior: {totalspread_posterior:.7f}\n"
 
     props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)
     ax1.text(
@@ -189,6 +218,7 @@ def plot_profile(obs_seq, levels, type, bias=True, rmse=True, totalspread=True):
         bbox=props,
     )
 
+    plt.tight_layout()
     plt.show()
 
     return fig
@@ -196,7 +226,7 @@ def plot_profile(obs_seq, levels, type, bias=True, rmse=True, totalspread=True):
 
 def plot_rank_histogram(obs_seq, levels, type, ens_size):
 
-    qc0 = obs_seq.
+    qc0 = stats.select_used_qcs(obs_seq.df)  # filter only qc=0, qc=2
     qc0 = qc0[qc0["type"] == type]  # filter by type
     stats.bin_by_layer(qc0, levels)  # bin by level
 
@@ -241,3 +271,153 @@ def plot_rank_histogram(obs_seq, levels, type, ens_size):
     plt.show()
 
     return fig
+
+
+def plot_evolution(
+    obs_seq,
+    type,
+    time_bin_width,
+    stat,
+    levels=None,
+    tick_interval=2,
+    time_format="%m-%d",
+    plot_pvu=True,
+):
+    """
+    Plot the time evolution of the requested statistics and optionally used vs possible observations.
+
+    Args:
+        obs_seq: The observation sequence object.
+        type (str): The type of observation to filter by.
+        time_bin_width (str): The width of each time bin (e.g., '3600s' for 1 hour).
+        stat (str): The statistic to plot. Default is "prior_rmse".
+        levels (list, optional): The levels to bin by. If None, no binning by level.
+        tick_interval (int): Interval for x-axis ticks (default is 2).
+        time_format (str): Format string for time labels on the x-axis (default is '%m-%d').
+        plot_pvu (bool): Whether to plot possible vs used observations (default is True).
+
+    Returns:
+        fig: The matplotlib figure object.
+    """
+    # Calculate stats and add to dataframe
+    stats.diag_stats(obs_seq.df)
+    qc0 = stats.select_used_qcs(obs_seq.df)  # filter only qc=0, qc=2
+    qc0 = qc0[qc0["type"] == type]  # filter by type
+
+    if qc0.empty:
+        print(f"No data found for type: {type}")
+        return
+
+    all_df = obs_seq.df[obs_seq.df["type"] == type]  # for possible vs used
+
+    if levels:
+        stats.bin_by_layer(qc0, levels)  # bin by level
+        midpoints = qc0["midpoint"].unique()
+
+        for level in sorted(midpoints):
+            df = qc0[qc0["midpoint"] == level]
+
+            # Bin by time
+            stats.bin_by_time(df, time_bin_width)
+
+            # Aggregate by time bin
+            df = stats.time_statistics(df)
+
+            # Calculate possible vs used if enabled
+            df_pvu = None
+            if plot_pvu:
+                stats.bin_by_time(all_df, time_bin_width)
+                df_pvu = stats.possible_vs_used_by_time(all_df)
+
+            # Plot the time evolution of requested stats
+            plot_time_evolution(
+                df, df_pvu, stat, type, level, tick_interval, time_format, plot_pvu
+            )
+    else:
+        # Bin by time
+        stats.bin_by_time(qc0, time_bin_width)
+
+        # Aggregate by time bin
+        df = stats.time_statistics(qc0)
+
+        # Calculate possible vs used if enabled
+        df_pvu = None
+        if plot_pvu:
+            stats.bin_by_time(all_df, time_bin_width)
+            df_pvu = stats.possible_vs_used_by_time(all_df)
+
+        # Plot the time evolution of requested stats
+        return plot_time_evolution(
+            df, df_pvu, stat, type, None, tick_interval, time_format, plot_pvu
+        )
+
+
+def plot_time_evolution(
+    df, df_pvu, stat, type, level, tick_interval, time_format, plot_pvu
+):
+    """
+    Plot the time evolution of the requested statistics and optionally used vs possible observations.
+
+    Args:
+        df (pd.DataFrame): The aggregated DataFrame for statistics.
+        df_pvu (pd.DataFrame): The DataFrame for possible vs used observations (if plot_pvu is True).
+        stat (str): The statistic to plot.
+        type (str): The type of observation.
+        level (float or None): The vertical level (if applicable).
+        tick_interval (int): Interval for x-axis ticks (default is 2).
+        time_format (str): Format string for time labels on the x-axis.
+        plot_pvu (bool): Whether to plot possible vs used observations (default is True).
+
+    Returns:
+        fig: The matplotlib figure object.
+    """
+    fig, ax1 = plt.subplots()
+
+    # Plot prior and posterior statistics
+    if f"prior_{stat}" in df.columns:
+        ax1.plot(df["time_bin_midpoint"], df[f"prior_{stat}"], label=f"prior {stat}")
+    if f"posterior_{stat}" in df.columns:
+        ax1.plot(
+            df["time_bin_midpoint"], df[f"posterior_{stat}"], label=f"posterior {stat}"
+        )
+
+    # Set x-axis ticks every 'tick_interval' values
+    tick_positions = df["time_bin_midpoint"][::tick_interval]
+    ax1.set_xticks(tick_positions)
+    ax1.set_xticklabels(
+        tick_positions.dt.strftime(time_format), rotation=45, ha="right"
+    )
+
+    # Add a secondary y-axis for possible vs used observations if enabled
+    if plot_pvu and df_pvu is not None:
+        ax2 = ax1.twinx()
+        ax2.set_ylabel("# obs (o=possible; +=assimilated)", color="red")
+        ax2.tick_params(axis="y", colors="red")
+
+        # Plot possible and used observations
+        ax2.plot(
+            df_pvu["time_bin_midpoint"],
+            df_pvu["possible"],
+            color="red",
+            marker="o",
+            linestyle="",
+            markerfacecolor="none",
+        )
+        ax2.plot(
+            df_pvu["time_bin_midpoint"],
+            df_pvu["used"],
+            color="red",
+            marker="+",
+            linestyle="",
+        )
+        ax2.set_ylim(bottom=0)
+
+    ax1.legend(loc="upper right")
+    title = f"{type}" if level is None else f"{type} at level {level}"
+    ax1.set_title(title)
+    ax1.set_xlabel("Time")
+    ax1.set_ylabel(stat)
+
+    plt.tight_layout()
+
+    return fig
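A brief usage sketch of the two plotting entry points touched in this file. The imports follow the package layout listed in RECORD; the input file, observation types, levels, and bin width are hypothetical, not taken from the diff.

from pydartdiags.obs_sequence import obs_sequence as obsq
from pydartdiags.matplots import matplots

obs_seq = obsq.obs_sequence("obs_seq.final")            # hypothetical input file

# Profile plot: levels in Pa to match vert_unit "pressure (Pa)"; bin midpoints
# are now converted to hPa internally (conversion = 0.01).
levels = [0.0, 10000.0, 30000.0, 50000.0, 70000.0, 85000.0, 100000.0]
fig1 = matplots.plot_profile(obs_seq, levels, "ACARS_TEMPERATURE")

# Time-evolution plot added in 0.5.1: hourly bins, prior/posterior RMSE plus
# possible-vs-used counts on a secondary axis.
fig2 = matplots.plot_evolution(
    obs_seq,
    type="ACARS_TEMPERATURE",
    time_bin_width="3600s",
    stat="rmse",
)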
pydartdiags/obs_sequence/composite_types.yaml
ADDED

@@ -0,0 +1,35 @@
+acars_horizontal_wind:
+  description: ACARS-derived Horizontal wind speed
+  components:
+    - acars_u_wind_component
+    - acars_v_wind_component
+
+sat_horizontal_wind:
+  description: Satellite-derived horizontal wind speed
+  components:
+    - sat_u_wind_component
+    - sat_v_wind_component
+
+radiosonde_horizontal_wind:
+  description: Radiosonde-derived horizontal wind speed
+  components:
+    - radiosonde_u_wind_component
+    - radiosonde_v_wind_component
+
+aircraft_horizontal_wind:
+  description: Aircraft-derived horizontal wind speed
+  components:
+    - aircraft_u_wind_component
+    - aircraft_v_wind_component
+
+10_m_horizontal_wind:
+  description: 10 meter horizontal wind speed
+  components:
+    - 10m_u_wind_component
+    - 10m_v_wind_component
+
+marine_sfc_horizontal_wind:
+  description: Marine surface horizontal wind speed
+  components:
+    - marine_sfc_u_wind_component
+    - marine_sfc_v_wind_component
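This bundled YAML is what obs_sequence.py now points at by default. A minimal sketch, assuming the constructor accepts file=None (the documented early-exit path) and that default_composite_types is set before that early exit, as the diff below shows for 0.5.1; everything else here is illustrative.

import yaml
from pydartdiags.obs_sequence import obs_sequence as obsq

# Create an empty sequence just to resolve the path of the shipped YAML file.
empty_seq = obsq.obs_sequence(None)
with open(empty_seq.default_composite_types) as f:
    composites = yaml.safe_load(f)

print(composites["sat_horizontal_wind"]["components"])
# expected: ['sat_u_wind_component', 'sat_v_wind_component']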
pydartdiags/obs_sequence/obs_sequence.py
CHANGED

@@ -9,7 +9,7 @@ import struct
 
 def requires_assimilation_info(func):
     def wrapper(self, *args, **kwargs):
-        if self.has_assimilation_info:
+        if self.has_assimilation_info():
             return func(self, *args, **kwargs)
         else:
             raise ValueError(
@@ -19,16 +19,6 @@ def requires_assimilation_info(func):
     return wrapper
 
 
-def requires_posterior_info(func):
-    def wrapper(self, *args, **kwargs):
-        if self.has_posterior:
-            return func(self, *args, **kwargs)
-        else:
-            raise ValueError("Posterior information is required to call this function.")
-
-    return wrapper
-
-
 class obs_sequence:
     """
     Initialize an obs_sequence object from an ASCII or binary observation sequence file,
@@ -69,7 +59,7 @@ class obs_sequence:
         reverse_types (dict): Dictionary of types with keys and values reversed, e.g
             {'ACARS_TEMPERATURE': 23}
         synonyms_for_obs (list): List of synonyms for the observation column in the DataFrame.
-            The
+            The default list is
 
         .. code-block:: python
 
@@ -87,8 +77,6 @@ class obs_sequence:
 
            obs_sequence(file, synonyms=['synonym1', 'synonym2']).df
 
-        has_assimilation_info (bool): Indicates if assimilation information is present.
-        has_posterior (bool): Indicates if posterior information is present.
         seq (generator): Generator of observations from the observation sequence file.
         all_obs (list): List of all observations, each observation is a list.
             Valid when the obs_sequence is created from a file.
@@ -119,6 +107,8 @@ class obs_sequence:
 
     Returns:
         an obs_sequence object
+        1D observations are given a datetime of days, seconds since 2000-01-01 00:00:00
+        3D observations are given a datetime of days, seconds since 1601-01-01 00:00:00 (DART Gregorian calendar)
 
     Examples:
 
@@ -129,8 +119,6 @@ class obs_sequence:
         """
 
         self.loc_mod = "None"
-        self.has_assimilation_info = False
-        self.has_posterior = False
         self.file = file
         self.synonyms_for_obs = [
             "NCEP BUFR observation",
@@ -146,6 +134,9 @@ class obs_sequence:
         else:
             self.synonyms_for_obs.append(synonyms)
 
+        module_dir = os.path.dirname(__file__)
+        self.default_composite_types = os.path.join(module_dir, "composite_types.yaml")
+
         if file is None:
             # Early exit - for testing purposes or creating obs_seq objects from scratch
             self.df = pd.DataFrame()
@@ -161,9 +152,6 @@ class obs_sequence:
             self.all_obs = []
             return
 
-        module_dir = os.path.dirname(__file__)
-        self.default_composite_types = os.path.join(module_dir, "composite_types.yaml")
-
         if self.is_binary(file):
             self.header = self.read_binary_header(file)
         else:
@@ -204,12 +192,6 @@ class obs_sequence:
         }
         self.df = self.df.rename(columns=rename_dict)
 
-        # check if the assimilation info is present
-        if "prior_ensemble_mean".casefold() in map(str.casefold, self.columns):
-            self.has_assimilation_info = True
-        if "posterior_ensemble_mean".casefold() in map(str.casefold, self.columns):
-            self.has_posterior = True
-
     def create_all_obs(self):
         """steps through the generator to create a
         list of all observations in the sequence
@@ -261,9 +243,13 @@ class obs_sequence:
         time = obs[-2].split()
         data.append(int(time[0]))  # seconds
         data.append(int(time[1]))  # days
-
-        convert_dart_time(int(time[0]), int(time[1]))
-
+        if self.loc_mod == "loc3d":
+            data.append(convert_dart_time(int(time[0]), int(time[1])))
+        else:  # HK todo what is appropriate for 1d models?
+            data.append(
+                dt.datetime(2000, 1, 1)
+                + dt.timedelta(seconds=int(time[0]), days=int(time[1]))
+            )
         data.append(float(obs[-1]))  # obs error variance ?convert to sd?
 
         return data
@@ -355,20 +341,13 @@ class obs_sequence:
            obsq.write_obs_seq('obs_seq.new')
 
         """
-        with open(file, "w") as f:
 
-
-
-
-            new_header = [
-                replacement_string if "num_obs" in element else element
-                for element in self.header
-            ]
+        self.create_header_from_dataframe()
+
+        with open(file, "w") as f:
 
-            for line in
+            for line in self.header:
                 f.write(str(line) + "\n")
-            first = 1
-            f.write(f"first: {first:>12} last: {num_rows:>12}\n")
 
         # TODO HK is there something better than copying the whole thing here?
         df_copy = self.df.copy()  # copy since you want to change for writing.
@@ -376,14 +355,23 @@ class obs_sequence:
         if self.loc_mod == "loc3d":
             df_copy["longitude"] = np.deg2rad(self.df["longitude"]).round(16)
             df_copy["latitude"] = np.deg2rad(self.df["latitude"]).round(16)
-        if "
-            df_copy = df_copy.drop(
+        if "prior_bias" in df_copy.columns:
+            df_copy = df_copy.drop(
+                columns=["prior_bias", "prior_sq_err", "prior_totalvar"]
+            )
+        if "posterior_bias" in df_copy.columns:
+            df_copy = df_copy.drop(
+                columns=["posterior_bias", "posterior_sq_err", "posterior_totalvar"]
+            )
+        if "midpoint" in df_copy.columns:
+            df_copy = df_copy.drop(columns=["midpoint", "vlevels"])
 
         # linked list for reading by dart programs
         df_copy = df_copy.sort_values(
             by=["time"], kind="stable"
         )  # sort the DataFrame by time
-        df_copy
+        df_copy.reset_index(drop=True, inplace=True)
+        df_copy["obs_num"] = df_copy.index + 1  # obs_num in time order
         df_copy["linked_list"] = obs_sequence.generate_linked_list_pattern(
             len(df_copy)
         )  # linked list pattern
@@ -395,6 +383,97 @@ class obs_sequence:
 
         df_copy.apply(write_row, axis=1)
 
+    @staticmethod
+    def update_types_dicts(df, reverse_types):
+        """
+        Ensure all unique observation types are in the reverse_types dictionary and create
+        the types dictionary.
+
+        Args:
+            df (pd.DataFrame): The DataFrame containing the observation sequence data.
+            reverse_types (dict): The dictionary mapping observation types to their corresponding integer values.
+
+        Returns:
+            dict: The updated reverse_types dictionary.
+            dict: The types dictionary with keys sorted in numerical order.
+        """
+        # Create a dictionary of observation types from the dataframe
+        unique_types = df["type"].unique()
+
+        # Ensure all unique types are in reverse_types
+        for obs_type in unique_types:
+            if obs_type not in reverse_types:
+                new_id = int(max(reverse_types.values(), default=0)) + 1
+                reverse_types[obs_type] = str(new_id)
+
+        not_sorted_types = {
+            reverse_types[obs_type]: obs_type for obs_type in unique_types
+        }
+        types = {
+            k: not_sorted_types[k] for k in sorted(not_sorted_types)
+        }  # to get keys in numerical order
+
+        return reverse_types, types
+
+    def create_header_from_dataframe(self):
+        """
+        Create a header for the observation sequence based on the data in the DataFrame.
+
+        It creates a dictionary of unique observation types, counts the
+        number of observations, and constructs the header with necessary information.
+
+        Example:
+            self.create_header_from_dataframe()
+
+        """
+
+        self.reverse_types, self.types = self.update_types_dicts(
+            self.df, self.reverse_types
+        )
+
+        num_obs = len(self.df)
+
+        self.header = []
+        self.header.append("obs_sequence")
+        self.header.append("obs_type_definitions")
+        self.header.append(f"{len(self.types)}")
+        for key, value in self.types.items():
+            self.header.append(f"{key} {value}")
+        self.header.append(
+            f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}"
+        )  # @todo HK not keeping track if num_qc changes
+        self.header.append(f"num_obs: {num_obs:>10} max_num_obs: {num_obs:>10}")
+        stats_cols = [
+            "prior_bias",
+            "prior_sq_err",
+            "prior_totalvar",
+            "posterior_bias",
+            "posterior_sq_err",
+            "posterior_totalvar",
+        ]
+        level_cols = ["vlevels", "midpoint"]
+        non_copie_cols = [
+            "obs_num",
+            "linked_list",
+            "longitude",
+            "latitude",
+            "vertical",
+            "vert_unit",
+            "type",
+            "metadata",
+            "external_FO",
+            "seconds",
+            "days",
+            "time",
+            "obs_err_var",
+            "location",
+        ]
+        for copie in self.df.columns:
+            if copie not in stats_cols + non_copie_cols + level_cols:
+                self.header.append(copie.replace("_", " "))
+        first = 1
+        self.header.append(f"first: {first:>12} last: {num_obs:>12}")
+
     def column_headers(self):
         """define the columns for the dataframe"""
         heading = []
@@ -440,14 +519,18 @@ class obs_sequence:
         return self.df[self.df["DART_quality_control"] == dart_qc]
 
     @requires_assimilation_info
-    def
+    def select_used_qcs(self):
         """
-        Select rows from the DataFrame where the
+        Select rows from the DataFrame where the observation was used.
+        Includes observations for which the posterior forward observation operators failed.
 
         Returns:
-            pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag
+            pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag 0 or 2.
         """
-        return self.df[
+        return self.df[
+            (self.df["DART_quality_control"] == 0)
+            | (self.df["DART_quality_control"] == 2)
+        ]
 
     @requires_assimilation_info
     def possible_vs_used(self):
@@ -456,7 +539,7 @@ class obs_sequence:
 
         This function takes a DataFrame containing observation data, including a 'type' column for the observation
         type and an 'observation' column. The number of used observations ('used'), is the total number
-
+        of assimilated observations (as determined by the `select_used_qcs` function).
         The result is a DataFrame with each observation type, the count of possible observations, and the count of
         used observations.
 
@@ -468,8 +551,8 @@ class obs_sequence:
         possible = self.df.groupby("type")["observation"].count()
         possible.rename("possible", inplace=True)
 
-
-        used =
+        used_qcs = self.select_used_qcs().groupby("type")["observation"].count()
+        used = used_qcs.reindex(possible.index, fill_value=0)
         used.rename("used", inplace=True)
 
         return pd.concat([possible, used], axis=1).reset_index()
@@ -816,7 +899,8 @@ class obs_sequence:
         components and adds them to the DataFrame.
 
         Args:
-            composite_types (str, optional): The YAML configuration for composite types.
+            composite_types (str, optional): The YAML configuration for composite types.
+                If 'use_default', the default configuration is used. Otherwise, a custom YAML configuration can be provided.
 
         Returns:
             pd.DataFrame: The updated DataFrame with the new composite rows added.
@@ -838,24 +922,23 @@ class obs_sequence:
         if len(components) != len(set(components)):
             raise Exception("There are repeat values in components.")
 
+        # data frame for the composite types
         df_comp = self.df[
             self.df["type"]
             .str.upper()
            .isin([component.upper() for component in components])
         ]
-        df_no_comp = self.df[
-            ~self.df["type"]
-            .str.upper()
-            .isin([component.upper() for component in components])
-        ]
 
+        df = pd.DataFrame()
         for key in self.composite_types_dict:
             df_new = construct_composit(
                 df_comp, key, self.composite_types_dict[key]["components"]
             )
-
+            df = pd.concat([df, df_new], axis=0)
 
-
+        # add the composite types to the DataFrame
+        self.df = pd.concat([self.df, df], axis=0)
+        return
 
     @classmethod
     def join(cls, obs_sequences, copies=None):
@@ -889,18 +972,18 @@ class obs_sequence:
 
         # Check if all obs_sequences have compatible attributes
         first_loc_mod = obs_sequences[0].loc_mod
-        first_has_assimilation_info = obs_sequences[0].has_assimilation_info
-        first_has_posterior = obs_sequences[0].has_posterior
+        first_has_assimilation_info = obs_sequences[0].has_assimilation_info()
+        first_has_posterior = obs_sequences[0].has_posterior()
         for obs_seq in obs_sequences:
             if obs_seq.loc_mod != first_loc_mod:
                 raise ValueError(
                     "All observation sequences must have the same loc_mod."
                 )
-            if obs_seq.has_assimilation_info != first_has_assimilation_info:
+            if obs_seq.has_assimilation_info() != first_has_assimilation_info:
                 raise ValueError(
                     "All observation sequences must have assimilation info."
                 )
-            if obs_seq.has_posterior != first_has_posterior:
+            if obs_seq.has_posterior() != first_has_posterior:
                 raise ValueError(
                     "All observation sequences must have the posterior info."
                 )
@@ -908,7 +991,7 @@ class obs_sequence:
         combo.loc_mod = first_loc_mod
 
         # check the copies are compatible (list of copies to combine?)
-        # subset of copies if needed
+        # subset of copies if needed  # @todo HK 1d or 3d
         if copies:
             start_required_columns = ["obs_num", "observation"]
             end_required_columns = [
@@ -1015,22 +1098,32 @@ class obs_sequence:
         combo.df["obs_num"] = combined_df.index + 1
         combo.create_header(len(combo.df))
 
-        # set assimilation info (mean and spread) (prior and posterior)
-        combo.has_assimilation_info = "prior_ensemble_mean".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-        combo.has_assimilation_info = "prior_ensemble_spread".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-        combo.has_posterior = "posterior_ensemble_mean".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-        combo.has_posterior = "posterior_ensemble_spread".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-
         return combo
 
+    def has_assimilation_info(self):
+        """
+        Check if the DataFrame has prior information.
+
+        Returns:
+            bool: True if both 'prior_ensemble_mean' and 'prior_ensemble_spread' columns are present, False otherwise.
+        """
+        return "prior_ensemble_mean".casefold() in map(
+            str.casefold, self.df.columns
+        ) and "prior_ensemble_spread".casefold() in map(str.casefold, self.df.columns)
+
+    def has_posterior(self):
+        """
+        Check if the DataFrame has posterior information.
+
+        Returns:
+            bool: True if both 'posterior_ensemble_mean' and 'posterior_ensemble_spread' columns are present, False otherwise.
+        """
+        return "posterior_ensemble_mean".casefold() in map(
+            str.casefold, self.df.columns
+        ) and "posterior_ensemble_spread".casefold() in map(
+            str.casefold, self.df.columns
+        )
+
     def create_header(self, n):
         """Create a header for the obs_seq file from the obs_sequence object."""
         assert (
@@ -1065,7 +1158,7 @@ def load_yaml_to_dict(file_path):
             return yaml.safe_load(file)
     except Exception as e:
         print(f"Error loading YAML file: {e}")
-
+        raise
 
 
 def convert_dart_time(seconds, days):
@@ -1093,17 +1186,39 @@ def construct_composit(df_comp, composite, components):
         components (list of str): A list containing the type names of the two components to be combined.
 
     Returns:
-        merged_df (pd.DataFrame):
+        merged_df (pd.DataFrame): A DataFrame containing the new composite rows.
     """
     selected_rows = df_comp[df_comp["type"] == components[0].upper()]
     selected_rows_v = df_comp[df_comp["type"] == components[1].upper()]
 
-
-
+    prior_columns_to_combine = df_comp.filter(regex="prior_ensemble").columns.tolist()
+    posterior_columns_to_combine = df_comp.filter(
+        regex="posterior_ensemble"
+    ).columns.tolist()
+    columns_to_combine = (
+        prior_columns_to_combine
+        + posterior_columns_to_combine
+        + ["observation", "obs_err_var"]
+    )
     merge_columns = ["latitude", "longitude", "vertical", "time"]
-
-
-
+    same_obs_columns = merge_columns + [
+        "observation",
+        "obs_err_var",
+    ]  # same observation is duplicated
+
+    if (
+        selected_rows[same_obs_columns].duplicated().sum() > 0
+        or selected_rows_v[same_obs_columns].duplicated().sum() > 0
+    ):
+        print(
+            f"{selected_rows[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
+        )
+        print(f"{selected_rows[same_obs_columns]}")
+        print(
+            f"{selected_rows_v[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
+        )
+        print(f"{selected_rows_v[same_obs_columns]}")
+        raise Exception("There are duplicates in the components.")
 
     # Merge the two DataFrames on location and time columns
     merged_df = pd.merge(
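Since has_assimilation_info and has_posterior change from booleans set in __init__ to methods derived from the DataFrame columns, callers need a small adjustment. A hedged migration sketch; the input file name is hypothetical.

from pydartdiags.obs_sequence import obs_sequence as obsq

obs_seq = obsq.obs_sequence("obs_seq.final")   # hypothetical input file

# 0.5.0 style (attribute, no longer present): if obs_seq.has_assimilation_info: ...
# 0.5.1 style (methods, must be called):
if obs_seq.has_assimilation_info():            # prior mean and spread columns present
    used = obs_seq.select_used_qcs()           # rows with DART quality control 0 or 2
if obs_seq.has_posterior():                    # posterior mean and spread columns present
    print("posterior copies available")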
pydartdiags/stats/stats.py
CHANGED
@@ -2,6 +2,7 @@
 import pandas as pd
 import numpy as np
 from functools import wraps
+from datetime import datetime, timedelta
 
 # from pydartdiags.obs_sequence import obs_sequence as obsq
 
@@ -39,20 +40,30 @@ def apply_to_phases_by_type_return_df(func):
             result = func(df, phase, *args, **kwargs)
             results.append(result)
 
-        if
-
-
-
-
-
-
-
+        if not results:
+            return (
+                pd.DataFrame()
+            )  # Return an empty DataFrame if no results are generated
+
+        # Dynamically determine merge keys based on common columns
+        common_columns = set(results[0].columns)
+        for result in results[1:]:
+            common_columns &= set(result.columns)
+
+        # Exclude phase-specific columns from the merge keys
+        phase_specific_columns = {
+            f"{phase}_sq_err",
+            f"{phase}_bias",
+            f"{phase}_totalvar",
+            f"{phase}_rmse",
+            f"{phase}_totalspread",
+        }
+        merge_keys = list(common_columns - phase_specific_columns)
+
+        if len(results) == 2:
+            return pd.merge(results[0], results[1], on=merge_keys)
         else:
-
-            return pd.merge(results[0], results[1], on="type")
-        else:
-            return results[0]
+            return results[0]
 
     return wrapper
 
@@ -211,15 +222,41 @@ def bin_by_layer(df, levels, verticalUnit="pressure (Pa)"):
     df.loc[df["vert_unit"] == verticalUnit, "vlevels"] = pd.cut(
         df.loc[df["vert_unit"] == verticalUnit, "vertical"], levels
     )
-
-
-
-
-
-
-
-
-
+    df.loc[:, "midpoint"] = df["vlevels"].apply(lambda x: x.mid)
+
+
+def bin_by_time(df, time_value):
+    """
+    Bin observations by time and add 'time_bin' and 'time_bin_midpoint' columns to the DataFrame.
+    The first bin starts 1 second before the minimum time value, so the minimum time is included in the
+    first bin. The last bin is inclusive of the maximum time value.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame containing a 'time' column.
+        time_value (str): The width of each time bin (e.g., '3600S' for 1 hour).
+
+    Returns:
+        None: The function modifies the DataFrame in place by adding 'time_bin' and 'time_bin_midpoint' columns.
+    """
+    # Create time bins
+    start = df["time"].min() - timedelta(seconds=1)
+    end = df["time"].max()
+    # Determine if the end time aligns with the bin boundary
+    time_delta = pd.Timedelta(time_value)
+    aligned_end = (pd.Timestamp(end) + time_delta).floor(time_value)
+
+    time_bins = pd.date_range(
+        start=start,
+        end=aligned_end,
+        freq=time_value,
+    )
+
+    df["time_bin"] = pd.cut(df["time"], bins=time_bins)
+
+    # Calculate the midpoint of each time bin
+    df["time_bin_midpoint"] = df["time_bin"].apply(
+        lambda x: x.left + (x.right - x.left) / 2 if pd.notnull(x) else None
+    )
 
 
 @apply_to_phases_by_type_return_df
@@ -270,13 +307,48 @@ def layer_statistics(df, phase):
     return layer_stats
 
 
+@apply_to_phases_by_type_return_df
+def time_statistics(df, phase):
+    """
+    Calculate time-based statistics for a given phase and return a new DataFrame.
+
+    Args:
+        df (pandas.DataFrame): The input DataFrame containing observation data and ensemble statistics.
+        phase (str): The phase for which to calculate the statistics ('prior' or 'posterior').
+
+    Returns:
+        pandas.DataFrame: A DataFrame containing time-based statistics for the specified phase.
+    """
+    # Assuming diag_stats has been called
+    time_stats = (
+        df.groupby(["time_bin_midpoint", "type"], observed=False)
+        .agg(
+            {
+                f"{phase}_sq_err": mean_then_sqrt,
+                f"{phase}_bias": "mean",
+                f"{phase}_totalvar": mean_then_sqrt,
+                "time_bin": "first",
+                "time": "first",
+            }
+        )
+        .reset_index()
+    )
+
+    time_stats.rename(columns={f"{phase}_sq_err": f"{phase}_rmse"}, inplace=True)
+    time_stats.rename(
+        columns={f"{phase}_totalvar": f"{phase}_totalspread"}, inplace=True
+    )
+
+    return time_stats
+
+
 def possible_vs_used(df):
     """
     Calculates the count of possible vs. used observations by type.
 
     This function takes a DataFrame containing observation data, including a 'type' column for the observation
     type and an 'observation' column. The number of used observations ('used'), is the total number
-
+    of assimilated observations (as determined by the `select_used_qcs` function).
     The result is a DataFrame with each observation type, the count of possible observations, and the count of
     used observations.
 
@@ -288,8 +360,8 @@ def possible_vs_used(df):
     possible = df.groupby("type")["observation"].count()
     possible.rename("possible", inplace=True)
 
-
-    used =
+    used_qcs = select_used_qcs(df).groupby("type")["observation"].count()
+    used = used_qcs.reindex(possible.index, fill_value=0)
     used.rename("used", inplace=True)
 
     return pd.concat([possible, used], axis=1).reset_index()
@@ -302,22 +374,59 @@ def possible_vs_used_by_layer(df):
     possible = df.groupby(["type", "midpoint"], observed=False)["type"].count()
     possible.rename("possible", inplace=True)
 
-
-
+    used_qcs = (
+        select_used_qcs(df)
         .groupby(["type", "midpoint"], observed=False)["type"]
         .count()
     )
-
+
+    used = used_qcs.reindex(possible.index, fill_value=0)
     used.rename("used", inplace=True)
 
     return pd.concat([possible, used], axis=1).reset_index()
 
 
-def
+def select_used_qcs(df):
     """
-    Select rows from the DataFrame where the
+    Select rows from the DataFrame where the observation was used.
+    Includes observations for which the posterior forward observation operators failed.
 
     Returns:
-        pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag
+        pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag 0 or 2.
+    """
+    return df[(df["DART_quality_control"] == 0) | (df["DART_quality_control"] == 2)]
+
+
+def possible_vs_used_by_time(df):
     """
-
+    Calculates the count of possible vs. used observations by type and time bin.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame containing observation data. The DataFrame must include:
+            - 'type': The observation type.
+            - 'time_bin_midpoint': The midpoint of the time bin.
+            - 'observation': The observation values.
+            - 'DART_quality_control': The quality control flag.
+
+    Returns:
+        pd.DataFrame: A DataFrame with the following columns:
+            - 'time_bin_midpoint': The midpoint of the time bin.
+            - 'type': The observation type.
+            - 'possible': The count of all observations in the time bin.
+            - 'used': The count of observations in the time bin that passed quality control checks.
+    """
+    # Count all observations (possible) grouped by time_bin_midpoint and type
+    possible = df.groupby(["time_bin_midpoint", "type"], observed=False)["type"].count()
+    possible.rename("possible", inplace=True)
+
+    # Count used observations (QC=0 or QC=2) grouped by time_bin_midpoint and type
+    used_qcs = (
+        select_used_qcs(df)
+        .groupby(["time_bin_midpoint", "type"], observed=False)["type"]
+        .count()
+    )
+    used = used_qcs.reindex(possible.index, fill_value=0)
+    used.rename("used", inplace=True)
+
+    # Combine possible and used into a single DataFrame
+    return pd.concat([possible, used], axis=1).reset_index()
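The new time-binning helpers compose the same way plot_evolution uses them. A minimal sketch, assuming a sequence that carries assimilation info and a DART_quality_control copy; the input file name and bin width are hypothetical.

from pydartdiags.obs_sequence import obs_sequence as obsq
from pydartdiags.stats import stats

obs_seq = obsq.obs_sequence("obs_seq.final")       # hypothetical input file
stats.diag_stats(obs_seq.df)                       # adds prior/posterior bias, sq_err, totalvar columns
qc0 = stats.select_used_qcs(obs_seq.df)            # keep DART QC 0 and 2 only

stats.bin_by_time(qc0, "3600s")                    # adds time_bin and time_bin_midpoint columns
by_time = stats.time_statistics(qc0)               # per-bin prior_rmse, prior_bias, prior_totalspread

stats.bin_by_time(obs_seq.df, "3600s")             # bin the full frame for possible-vs-used counts
pvu = stats.possible_vs_used_by_time(obs_seq.df)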
{pydartdiags-0.5.0.dist-info → pydartdiags-0.5.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: pydartdiags
-Version: 0.5.
+Version: 0.5.1
 Summary: Observation Sequence Diagnostics for DART
 Home-page: https://github.com/NCAR/pyDARTdiags.git
 Author: Helen Kershaw
@@ -21,6 +21,7 @@ Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: matplotlib>=3.9.4
 Dynamic: author
 Dynamic: home-page
+Dynamic: license-file
 Dynamic: requires-python
 
 [](https://opensource.org/licenses/Apache-2.0)
pydartdiags-0.5.1.dist-info/RECORD
ADDED

@@ -0,0 +1,15 @@
+pydartdiags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/matplots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/matplots/matplots.py,sha256=Bo0TTz1gvsHEvTfTfLfdTi_3hNRN1okmyY5a5yYgtzk,13455
+pydartdiags/obs_sequence/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/obs_sequence/composite_types.yaml,sha256=PVLMU6x6KcVMCwPB-U65C_e0YQUemfqUhYMpf1DhFOY,917
+pydartdiags/obs_sequence/obs_sequence.py,sha256=8RGUzfWxSlGtPx_uz5lhLJaUaG8ju6qmiIU7da43nwk,48444
+pydartdiags/plots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/plots/plots.py,sha256=U7WQjE_qN-5a8-85D-PkkgILSFBzTJQ1mcGBa7l5DHI,6464
+pydartdiags/stats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/stats/stats.py,sha256=HbRj3toQRx63mX1a1FXHA5_7yGITz8JKHbhjMoAHChk,16163
+pydartdiags-0.5.1.dist-info/licenses/LICENSE,sha256=ROglds_Eg_ylXp-1MHmEawDqMw_UsCB4r9sk7z9PU9M,11377
+pydartdiags-0.5.1.dist-info/METADATA,sha256=Fn3KsjQZma-696rO-yGpAHrHqV2izTNpVmBnYPx9z6k,2413
+pydartdiags-0.5.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+pydartdiags-0.5.1.dist-info/top_level.txt,sha256=LfMoPLnSd0VhhlWev1eeX9t6AzvyASOloag0LO_ppWg,12
+pydartdiags-0.5.1.dist-info/RECORD,,
pydartdiags-0.5.0.dist-info/RECORD
REMOVED

@@ -1,14 +0,0 @@
-pydartdiags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/matplots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/matplots/matplots.py,sha256=44MlD98gaQsrCT0mW6M9f0a2-clm3KEGrdYqkTUO0RI,7478
-pydartdiags/obs_sequence/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/obs_sequence/obs_sequence.py,sha256=kdPOWAqgiyuv6cTdhYx1u9Ru6zCKF0Wd--7-sM3m5F8,44527
-pydartdiags/plots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/plots/plots.py,sha256=U7WQjE_qN-5a8-85D-PkkgILSFBzTJQ1mcGBa7l5DHI,6464
-pydartdiags/stats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/stats/stats.py,sha256=tzjE6HBrw6s9Li0UlJ_sNMcGEU8loT_BA5SDZp-UTOc,12138
-pydartdiags-0.5.0.dist-info/LICENSE,sha256=ROglds_Eg_ylXp-1MHmEawDqMw_UsCB4r9sk7z9PU9M,11377
-pydartdiags-0.5.0.dist-info/METADATA,sha256=F6znTR7qrj2qoGBYNojmWiaOqa9EAETgphV7i0HW0xc,2391
-pydartdiags-0.5.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-pydartdiags-0.5.0.dist-info/top_level.txt,sha256=LfMoPLnSd0VhhlWev1eeX9t6AzvyASOloag0LO_ppWg,12
-pydartdiags-0.5.0.dist-info/RECORD,,
{pydartdiags-0.5.0.dist-info → pydartdiags-0.5.1.dist-info/licenses}/LICENSE
File without changes

{pydartdiags-0.5.0.dist-info → pydartdiags-0.5.1.dist-info}/top_level.txt
File without changes