pydartdiags 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of pydartdiags might be problematic.

@@ -9,7 +9,7 @@ import struct
 
 def requires_assimilation_info(func):
     def wrapper(self, *args, **kwargs):
-        if self.has_assimilation_info:
+        if self.has_assimilation_info():
             return func(self, *args, **kwargs)
         else:
             raise ValueError(
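Note on this change: has_assimilation_info is now a method (added near the end of this diff), so the guard must call it. Without the parentheses the condition would test the truthiness of the bound method object itself, which is always True, and the decorator could never raise. A minimal sketch of the pitfall:

.. code-block:: python

    class Demo:
        def has_assimilation_info(self):
            return False

    d = Demo()
    bool(d.has_assimilation_info)    # True: a bound method object is always truthy
    bool(d.has_assimilation_info())  # False: the value the method actually returns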
@@ -19,27 +19,46 @@ def requires_assimilation_info(func):
     return wrapper
 
 
-def requires_posterior_info(func):
-    def wrapper(self, *args, **kwargs):
-        if self.has_posterior:
-            return func(self, *args, **kwargs)
-        else:
-            raise ValueError("Posterior information is required to call this function.")
-
-    return wrapper
+class ObsSequence:
+    """
+    Initialize an ObsSequence object from an ASCII or binary observation sequence file,
+    or create an empty ObsSequence object from scratch.
 
+    1D observations are given a datetime of days, seconds since 2000-01-01 00:00:00
 
-class obs_sequence:
-    """
-    Initialize an obs_sequence object from an ASCII or binary observation sequence file,
-    or create an empty obs_sequence object from scratch.
+    3D observations are given a datetime of days, seconds since 1601-01-01 00:00:00 (DART Gregorian calendar)
 
     Args:
         file (str): The input observation sequence ASCII or binary file.
-            If None, an empty obs_sequence object is created from scratch.
+            If None, an empty ObsSequence object is created from scratch.
+        synonyms (list, optional): List of additional synonyms for the observation column in the DataFrame.
+            The default list is
+
+            .. code-block:: python
+
+                ['NCEP BUFR observation',
+                'AIRS observation',
+                'GTSPP observation',
+                'SST observation',
+                'observations',
+                'WOD observation']
+
+            You can add more synonyms by providing a list of strings when
+            creating the ObsSequence object.
+
+            .. code-block:: python
+
+                ObsSequence(file, synonyms=['synonym1', 'synonym2'])
+
+    Raises:
+        ValueError: If neither 'loc3d' nor 'loc1d' could be found in the observation sequence.
+
+    Examples:
+
+        .. code-block:: python
+
+            obs_seq = ObsSequence(file='obs_seq.final')
 
-    Returns:
-        An obs_sequence object
 
     Attributes:
         df (pandas.DataFrame): The DataFrame containing the observation sequence data.
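The class rename from obs_sequence to ObsSequence (PEP 8 CapWords) means 0.5.0 call sites need a one-line change. A hedged migration sketch, assuming the import path shown in the package documentation; the extra synonym string is illustrative only:

.. code-block:: python

    from pydartdiags.obs_sequence import obs_sequence as obsq

    # 0.5.0: obs_seq = obsq.obs_sequence(file='obs_seq.final')
    obs_seq = obsq.ObsSequence(file='obs_seq.final')

    # extra synonyms for the observation column, per the docstring above
    obs_seq2 = obsq.ObsSequence('obs_seq.final', synonyms=['radiosonde observation'])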
@@ -64,36 +83,18 @@ class obs_sequence:
             - scale height: 'VERTISSCALEHEIGHT' (unitless)
         loc_mod (str): The location model, either 'loc3d' or 'loc1d'.
             For 3D sphere models: latitude and longitude are in degrees in the DataFrame.
-        types (dict): Dictionary of types of observations the observation sequence,
+        types (dict): Dictionary of types of observations in the observation sequence,
             e.g. {23: 'ACARS_TEMPERATURE'},
         reverse_types (dict): Dictionary of types with keys and values reversed, e.g
             {'ACARS_TEMPERATURE': 23}
         synonyms_for_obs (list): List of synonyms for the observation column in the DataFrame.
-            The defualt list is
 
-            .. code-block:: python
 
-            [ 'NCEP BUFR observation',
-            'AIRS observation',
-            'GTSPP observation',
-            'SST observation',
-            'observations',
-            'WOD observation']
-
-            You can add more synonyms by providing a list of strings when
-            creating the obs_sequence object.
-
-            .. code-block:: python
-
-            obs_sequence(file, synonyms=['synonym1', 'synonym2']).df
-
-        has_assimilation_info (bool): Indicates if assimilation information is present.
-        has_posterior (bool): Indicates if posterior information is present.
         seq (generator): Generator of observations from the observation sequence file.
         all_obs (list): List of all observations, each observation is a list.
-            Valid when the obs_sequence is created from a file.
-            Set to None when the obs_sequence is created from scratch or multiple
-            obs_sequences are joined.
+            Valid when the ObsSequence is created from a file.
+            Set to None when the ObsSequence is created from scratch or multiple
+            ObsSequences are joined.
     """
 
     vert = {
@@ -108,29 +109,8 @@ class obs_sequence:
     reversed_vert = {value: key for key, value in vert.items()}
 
     def __init__(self, file, synonyms=None):
-        """
-        Create an obs_sequence object from an ASCII or binary observation sequence file,
-        or create an empty obs_sequence object from scratch.
-
-        Args:
-            file (str): The input observation sequence ASCII or binary file.
-                If None, an empty obs_sequence object is created from scratch.
-            synonyms (list, optional): List of synonyms for the observation column in the DataFrame.
-
-        Returns:
-            an obs_sequence object
-
-        Examples:
-
-        .. code-block:: python
-
-            obs_seq = obs_sequence(file='obs_seq.final')
-
-        """
 
         self.loc_mod = "None"
-        self.has_assimilation_info = False
-        self.has_posterior = False
         self.file = file
         self.synonyms_for_obs = [
             "NCEP BUFR observation",
@@ -146,6 +126,9 @@ class obs_sequence:
         else:
             self.synonyms_for_obs.append(synonyms)
 
+        module_dir = os.path.dirname(__file__)
+        self.default_composite_types = os.path.join(module_dir, "composite_types.yaml")
+
         if file is None:
             # Early exit - for testing purposes or creating obs_seq objects from scratch
             self.df = pd.DataFrame()
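Moving the default_composite_types assignment ahead of this early exit (the next hunk removes it from its old position) means objects created from scratch also carry the bundled composite_types.yaml path. A sketch of the from-scratch path, assuming the import path used above; the column data is illustrative:

.. code-block:: python

    import pandas as pd
    from pydartdiags.obs_sequence import obs_sequence as obsq

    empty = obsq.ObsSequence(file=None)   # early exit: empty DataFrame, nothing parsed
    print(empty.default_composite_types)  # now set even when no file was read
    empty.df = pd.DataFrame({"observation": [271.3], "type": ["ACARS_TEMPERATURE"]})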
@@ -161,9 +144,6 @@ class obs_sequence:
             self.all_obs = []
             return
 
-        module_dir = os.path.dirname(__file__)
-        self.default_composite_types = os.path.join(module_dir, "composite_types.yaml")
-
         if self.is_binary(file):
             self.header = self.read_binary_header(file)
         else:
@@ -204,12 +184,6 @@ class obs_sequence:
         }
         self.df = self.df.rename(columns=rename_dict)
 
-        # check if the assimilation info is present
-        if "prior_ensemble_mean".casefold() in map(str.casefold, self.columns):
-            self.has_assimilation_info = True
-        if "posterior_ensemble_mean".casefold() in map(str.casefold, self.columns):
-            self.has_posterior = True
-
     def create_all_obs(self):
         """steps through the generator to create a
         list of all observations in the sequence
@@ -232,7 +206,7 @@ class obs_sequence:
             data.append(float(location[0]))  # location x
             data.append(float(location[1]))  # location y
             data.append(float(location[2]))  # location z
-            data.append(obs_sequence.vert[int(location[3])])
+            data.append(ObsSequence.vert[int(location[3])])
             self.loc_mod = "loc3d"
         except ValueError:
             try:
@@ -261,9 +235,13 @@ class obs_sequence:
         time = obs[-2].split()
         data.append(int(time[0]))  # seconds
         data.append(int(time[1]))  # days
-        data.append(
-            convert_dart_time(int(time[0]), int(time[1]))
-        )  # datetime # HK todo what is approprate for 1d models?
+        if self.loc_mod == "loc3d":
+            data.append(convert_dart_time(int(time[0]), int(time[1])))
+        else:  # HK todo what is appropriate for 1d models?
+            data.append(
+                dt.datetime(2000, 1, 1)
+                + dt.timedelta(seconds=int(time[0]), days=int(time[1]))
+            )
         data.append(float(obs[-1]))  # obs error variance ?convert to sd?
 
         return data
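The time handling now depends on the location model: loc3d observations go through convert_dart_time, which counts days and seconds from the DART Gregorian origin of 1601-01-01, while loc1d observations are anchored at 2000-01-01, as the new class docstring states. A sketch of the two conversions (the 1601 arithmetic is an assumption based on that docstring; convert_dart_time itself is defined at module level further down):

.. code-block:: python

    import datetime as dt

    seconds, days = 43200, 151240   # an illustrative DART (seconds, days) stamp

    # loc3d: DART Gregorian calendar, origin 1601-01-01
    t3d = dt.datetime(1601, 1, 1) + dt.timedelta(days=days, seconds=seconds)

    # loc1d: epoch 2000-01-01, matching the else-branch above
    t1d = dt.datetime(2000, 1, 1) + dt.timedelta(days=days, seconds=seconds)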
@@ -355,20 +333,13 @@ class obs_sequence:
             obsq.write_obs_seq('obs_seq.new')
 
         """
-        with open(file, "w") as f:
 
-            # If a DataFrame is provided, update the header with the number of observations
-            num_rows = len(self.df)
-            replacement_string = f"num_obs: {num_rows:>10} max_num_obs: {num_rows:>10}"
-            new_header = [
-                replacement_string if "num_obs" in element else element
-                for element in self.header
-            ]
+        self.create_header_from_dataframe()
 
-            for line in new_header[:-1]:
+        with open(file, "w") as f:
+
+            for line in self.header:
                 f.write(str(line) + "\n")
-            first = 1
-            f.write(f"first: {first:>12} last: {num_rows:>12}\n")
 
             # TODO HK is there something better than copying the whole thing here?
             df_copy = self.df.copy()  # copy since you want to change for writing.
@@ -376,15 +347,24 @@ class obs_sequence:
             if self.loc_mod == "loc3d":
                 df_copy["longitude"] = np.deg2rad(self.df["longitude"]).round(16)
                 df_copy["latitude"] = np.deg2rad(self.df["latitude"]).round(16)
-            if "bias" in df_copy.columns:
-                df_copy = df_copy.drop(columns=["bias", "sq_err"])
+            if "prior_bias" in df_copy.columns:
+                df_copy = df_copy.drop(
+                    columns=["prior_bias", "prior_sq_err", "prior_totalvar"]
+                )
+            if "posterior_bias" in df_copy.columns:
+                df_copy = df_copy.drop(
+                    columns=["posterior_bias", "posterior_sq_err", "posterior_totalvar"]
+                )
+            if "midpoint" in df_copy.columns:
+                df_copy = df_copy.drop(columns=["midpoint", "vlevels"])
 
             # linked list for reading by dart programs
             df_copy = df_copy.sort_values(
                 by=["time"], kind="stable"
             )  # sort the DataFrame by time
-            df_copy["obs_num"] = self.df.index + 1  # obs_num in time order
-            df_copy["linked_list"] = obs_sequence.generate_linked_list_pattern(
+            df_copy.reset_index(drop=True, inplace=True)
+            df_copy["obs_num"] = df_copy.index + 1  # obs_num in time order
+            df_copy["linked_list"] = ObsSequence.generate_linked_list_pattern(
                 len(df_copy)
             )  # linked list pattern
 
@@ -395,6 +375,97 @@ class obs_sequence:
 
             df_copy.apply(write_row, axis=1)
 
+    @staticmethod
+    def update_types_dicts(df, reverse_types):
+        """
+        Ensure all unique observation types are in the reverse_types dictionary and create
+        the types dictionary.
+
+        Args:
+            df (pd.DataFrame): The DataFrame containing the observation sequence data.
+            reverse_types (dict): The dictionary mapping observation types to their corresponding integer values.
+
+        Returns:
+            dict: The updated reverse_types dictionary.
+            dict: The types dictionary with keys sorted in numerical order.
+        """
+        # Create a dictionary of observation types from the dataframe
+        unique_types = df["type"].unique()
+
+        # Ensure all unique types are in reverse_types
+        for obs_type in unique_types:
+            if obs_type not in reverse_types:
+                new_id = int(max(reverse_types.values(), default=0)) + 1
+                reverse_types[obs_type] = str(new_id)
+
+        not_sorted_types = {
+            reverse_types[obs_type]: obs_type for obs_type in unique_types
+        }
+        types = {
+            k: not_sorted_types[k] for k in sorted(not_sorted_types)
+        }  # to get keys in numerical order
+
+        return reverse_types, types
+
+    def create_header_from_dataframe(self):
+        """
+        Create a header for the observation sequence based on the data in the DataFrame.
+
+        It creates a dictionary of unique observation types, counts the
+        number of observations, and constructs the header with necessary information.
+
+        Example:
+            self.create_header_from_dataframe()
+
+        """
+
+        self.reverse_types, self.types = self.update_types_dicts(
+            self.df, self.reverse_types
+        )
+
+        num_obs = len(self.df)
+
+        self.header = []
+        self.header.append("obs_sequence")
+        self.header.append("obs_type_definitions")
+        self.header.append(f"{len(self.types)}")
+        for key, value in self.types.items():
+            self.header.append(f"{key} {value}")
+        self.header.append(
+            f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}"
+        )  # @todo HK not keeping track if num_qc changes
+        self.header.append(f"num_obs: {num_obs:>10} max_num_obs: {num_obs:>10}")
+        stats_cols = [
+            "prior_bias",
+            "prior_sq_err",
+            "prior_totalvar",
+            "posterior_bias",
+            "posterior_sq_err",
+            "posterior_totalvar",
+        ]
+        level_cols = ["vlevels", "midpoint"]
+        non_copie_cols = [
+            "obs_num",
+            "linked_list",
+            "longitude",
+            "latitude",
+            "vertical",
+            "vert_unit",
+            "type",
+            "metadata",
+            "external_FO",
+            "seconds",
+            "days",
+            "time",
+            "obs_err_var",
+            "location",
+        ]
+        for copie in self.df.columns:
+            if copie not in stats_cols + non_copie_cols + level_cols:
+                self.header.append(copie.replace("_", " "))
+        first = 1
+        self.header.append(f"first: {first:>12} last: {num_obs:>12}")
+
     def column_headers(self):
         """define the columns for the dataframe"""
         heading = []
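The new update_types_dicts keeps the type tables consistent with the DataFrame: any observation type missing from reverse_types is assigned max(existing ids) + 1, and the id-to-name types dict is rebuilt with keys in numerical order. A small standalone sketch, assuming the same import path as above:

.. code-block:: python

    import pandas as pd
    from pydartdiags.obs_sequence import obs_sequence as obsq

    df = pd.DataFrame({"type": ["ACARS_TEMPERATURE", "NEW_OBS_TYPE"]})
    reverse_types = {"ACARS_TEMPERATURE": "23"}

    reverse_types, types = obsq.ObsSequence.update_types_dicts(df, reverse_types)
    # reverse_types -> {'ACARS_TEMPERATURE': '23', 'NEW_OBS_TYPE': '24'}
    # types         -> {'23': 'ACARS_TEMPERATURE', '24': 'NEW_OBS_TYPE'}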
@@ -440,14 +511,18 @@ class obs_sequence:
         return self.df[self.df["DART_quality_control"] == dart_qc]
 
     @requires_assimilation_info
-    def select_failed_qcs(self):
+    def select_used_qcs(self):
         """
-        Select rows from the DataFrame where the DART quality control flag is greater than 0.
+        Select rows from the DataFrame where the observation was used.
+        Includes observations for which the posterior forward observation operators failed.
 
         Returns:
-            pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag greater than 0.
+            pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag 0 or 2.
         """
-        return self.df[self.df["DART_quality_control"] > 0]
+        return self.df[
+            (self.df["DART_quality_control"] == 0)
+            | (self.df["DART_quality_control"] == 2)
+        ]
 
     @requires_assimilation_info
     def possible_vs_used(self):
@@ -456,7 +531,7 @@ class obs_sequence:
 
         This function takes a DataFrame containing observation data, including a 'type' column for the observation
         type and an 'observation' column. The number of used observations ('used'), is the total number
-        minus the observations that failed quality control checks (as determined by the `select_failed_qcs` function).
+        of assimilated observations (as determined by the `select_used_qcs` function).
         The result is a DataFrame with each observation type, the count of possible observations, and the count of
         used observations.
 
@@ -468,8 +543,8 @@ class obs_sequence:
         possible = self.df.groupby("type")["observation"].count()
         possible.rename("possible", inplace=True)
 
-        failed_qcs = self.select_failed_qcs().groupby("type")["observation"].count()
-        used = possible - failed_qcs.reindex(possible.index, fill_value=0)
+        used_qcs = self.select_used_qcs().groupby("type")["observation"].count()
+        used = used_qcs.reindex(possible.index, fill_value=0)
         used.rename("used", inplace=True)
 
         return pd.concat([possible, used], axis=1).reset_index()
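This changes the counting, not just the names: 0.5.0 computed used as possible minus every observation with DART QC > 0, which excluded QC 2 (posterior forward operator failed); 0.6.0 counts QC 0 and 2 directly through select_used_qcs. A toy comparison:

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({
        "type": ["ACARS_TEMPERATURE"] * 4,
        "observation": [1.0, 2.0, 3.0, 4.0],
        "DART_quality_control": [0, 2, 4, 7],
    })

    used_old = len(df) - (df["DART_quality_control"] > 0).sum()  # 1
    used_new = df["DART_quality_control"].isin([0, 2]).sum()     # 2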
@@ -511,7 +586,7 @@ class obs_sequence:
         with open(file, "rb") as f:
             while True:
                 # Read the record length
-                record_length = obs_sequence.read_record_length(f)
+                record_length = ObsSequence.read_record_length(f)
                 if record_length is None:
                     break
                 record = f.read(record_length)
@@ -519,7 +594,7 @@ class obs_sequence:
                     break
 
                 # Read the trailing record length (should match the leading one)
-                obs_sequence.check_trailing_record_length(f, record_length)
+                ObsSequence.check_trailing_record_length(f, record_length)
 
                 linecount += 1
 
@@ -537,7 +612,7 @@ class obs_sequence:
         f.seek(0)
 
         for _ in range(2):
-            record_length = obs_sequence.read_record_length(f)
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:
                 break
 
@@ -545,7 +620,7 @@ class obs_sequence:
             if not record:  # end of file
                 break
 
-            obs_sequence.check_trailing_record_length(f, record_length)
+            ObsSequence.check_trailing_record_length(f, record_length)
             header.append(record.decode("utf-8").strip())
 
         header.append(str(obs_types_definitions))
@@ -553,7 +628,7 @@ class obs_sequence:
         # obs_types_definitions
         for _ in range(3, 4 + obs_types_definitions):
             # Read the record length
-            record_length = obs_sequence.read_record_length(f)
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:
                 break
 
@@ -562,7 +637,7 @@ class obs_sequence:
             if not record:  # end of file
                 break
 
-            obs_sequence.check_trailing_record_length(f, record_length)
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             if _ == 3:
                 continue  # num obs_types_definitions
@@ -580,7 +655,7 @@ class obs_sequence:
             5 + obs_types_definitions + num_copies + num_qcs + 1,
         ):
             # Read the record length
-            record_length = obs_sequence.read_record_length(f)
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:
                 break
 
@@ -589,7 +664,7 @@ class obs_sequence:
             if not record:
                 break
 
-            obs_sequence.check_trailing_record_length(f, record_length)
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             if _ == 5 + obs_types_definitions:
                 continue
@@ -600,12 +675,12 @@ class obs_sequence:
 
         # first and last obs
         # Read the record length
-        record_length = obs_sequence.read_record_length(f)
+        record_length = ObsSequence.read_record_length(f)
 
         # Read the actual record
         record = f.read(record_length)
 
-        obs_sequence.check_trailing_record_length(f, record_length)
+        ObsSequence.check_trailing_record_length(f, record_length)
 
         # Read the whole record as a two integers
         first, last = struct.unpack("ii", record)[:8]
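All of these read_record_length / check_trailing_record_length call sites walk Fortran unformatted sequential records: a 4-byte length, the payload, then the same 4-byte length repeated. A plausible reconstruction of that framing, not the package's exact helpers:

.. code-block:: python

    import struct

    def read_fortran_record(f):
        """Read one Fortran unformatted record: length, payload, length."""
        head = f.read(4)
        if len(head) < 4:
            return None                    # end of file
        (n,) = struct.unpack("i", head)
        payload = f.read(n)
        (tail,) = struct.unpack("i", f.read(4))
        assert tail == n, "trailing record length must match the leading one"
        return payload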
@@ -730,7 +805,7 @@ class obs_sequence:
         # Skip the first len(obs_seq.header) lines
         for _ in range(header_length - 1):
             # Read the record length
-            record_length = obs_sequence.read_record_length(f)
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:  # End of file
                 break
 
@@ -747,7 +822,7 @@ class obs_sequence:
             obs.append(f"OBS {obs_num}")
             for _ in range(n):  # number of copies
                 # Read the record length
-                record_length = obs_sequence.read_record_length(f)
+                record_length = ObsSequence.read_record_length(f)
                 if record_length is None:
                     break
                 # Read the actual record (copie)
@@ -755,10 +830,10 @@ class obs_sequence:
                 obs.append(struct.unpack("d", record)[0])
 
                 # Read the trailing record length (should match the leading one)
-                obs_sequence.check_trailing_record_length(f, record_length)
+                ObsSequence.check_trailing_record_length(f, record_length)
 
             # linked list info
-            record_length = obs_sequence.read_record_length(f)
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:
                 break
 
@@ -767,17 +842,17 @@ class obs_sequence:
             linked_list_string = f"{int1:<12} {int2:<10} {int3:<12}"
             obs.append(linked_list_string)
 
-            obs_sequence.check_trailing_record_length(f, record_length)
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             # location (note no location header "loc3d" or "loc1d" for binary files)
             obs.append("loc3d")
-            record_length = obs_sequence.read_record_length(f)
+            record_length = ObsSequence.read_record_length(f)
             record = f.read(record_length)
             x, y, z, vert = struct.unpack("dddi", record[:28])
             location_string = f"{x} {y} {z} {vert}"
             obs.append(location_string)
 
-            obs_sequence.check_trailing_record_length(f, record_length)
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             # kind (type of observation) value
             obs.append("kind")
@@ -787,23 +862,23 @@ class obs_sequence:
             kind = f"{struct.unpack('i', record)[0]}"
             obs.append(kind)
 
-            obs_sequence.check_trailing_record_length(f, record_length)
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             # time (seconds, days)
-            record_length = obs_sequence.read_record_length(f)
+            record_length = ObsSequence.read_record_length(f)
             record = f.read(record_length)
             seconds, days = struct.unpack("ii", record)[:8]
             time_string = f"{seconds} {days}"
             obs.append(time_string)
 
-            obs_sequence.check_trailing_record_length(f, record_length)
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             # obs error variance
-            record_length = obs_sequence.read_record_length(f)
+            record_length = ObsSequence.read_record_length(f)
             record = f.read(record_length)
             obs.append(struct.unpack("d", record)[0])
 
-            obs_sequence.check_trailing_record_length(f, record_length)
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             yield obs
 
@@ -816,7 +891,8 @@ class obs_sequence:
         components and adds them to the DataFrame.
 
         Args:
-            composite_types (str, optional): The YAML configuration for composite types. If 'use_default', the default configuration is used. Otherwise, a custom YAML configuration can be provided.
+            composite_types (str, optional): The YAML configuration for composite types.
+                If 'use_default', the default configuration is used. Otherwise, a custom YAML configuration can be provided.
 
         Returns:
             pd.DataFrame: The updated DataFrame with the new composite rows added.
@@ -838,69 +914,68 @@ class obs_sequence:
         if len(components) != len(set(components)):
             raise Exception("There are repeat values in components.")
 
+        # data frame for the composite types
         df_comp = self.df[
             self.df["type"]
             .str.upper()
             .isin([component.upper() for component in components])
         ]
-        df_no_comp = self.df[
-            ~self.df["type"]
-            .str.upper()
-            .isin([component.upper() for component in components])
-        ]
 
+        df = pd.DataFrame()
         for key in self.composite_types_dict:
             df_new = construct_composit(
                 df_comp, key, self.composite_types_dict[key]["components"]
             )
-            df_no_comp = pd.concat([df_no_comp, df_new], axis=0)
+            df = pd.concat([df, df_new], axis=0)
 
-        return df_no_comp
+        # add the composite types to the DataFrame
+        self.df = pd.concat([self.df, df], axis=0)
+        return
 
     @classmethod
     def join(cls, obs_sequences, copies=None):
         """
         Join a list of observation sequences together.
 
-        This method combines the headers and observations from a list of obs_sequence objects
-        into a single obs_sequence object.
+        This method combines the headers and observations from a list of ObsSequence objects
+        into a single ObsSequence object.
 
         Args:
-            obs_sequences (list of obs_sequences): The list of observation sequences objects to join.
+            obs_sequences (list of ObsSequences): The list of observation sequences objects to join.
             copies (list of str, optional): A list of copy names to include in the combined data.
                 If not provided, all copies are included.
 
         Returns:
-            A new obs_sequence object containing the combined data.
+            A new ObsSequence object containing the combined data.
 
         Example:
             .. code-block:: python
 
-                obs_seq1 = obs_sequence(file='obs_seq1.final')
-                obs_seq2 = obs_sequence(file='obs_seq2.final')
-                obs_seq3 = obs_sequence(file='obs_seq3.final')
-                combined = obs_sequence.join([obs_seq1, obs_seq2, obs_seq3])
+                obs_seq1 = ObsSequence(file='obs_seq1.final')
+                obs_seq2 = ObsSequence(file='obs_seq2.final')
+                obs_seq3 = ObsSequence(file='obs_seq3.final')
+                combined = ObsSequence.join([obs_seq1, obs_seq2, obs_seq3])
         """
         if not obs_sequences:
             raise ValueError("The list of observation sequences is empty.")
 
-        # Create a new obs_sequnece object with the combined data
+        # Create a new ObsSequence object with the combined data
         combo = cls(file=None)
 
         # Check if all obs_sequences have compatible attributes
         first_loc_mod = obs_sequences[0].loc_mod
-        first_has_assimilation_info = obs_sequences[0].has_assimilation_info
-        first_has_posterior = obs_sequences[0].has_posterior
+        first_has_assimilation_info = obs_sequences[0].has_assimilation_info()
+        first_has_posterior = obs_sequences[0].has_posterior()
         for obs_seq in obs_sequences:
             if obs_seq.loc_mod != first_loc_mod:
                 raise ValueError(
                     "All observation sequences must have the same loc_mod."
                 )
-            if obs_seq.has_assimilation_info != first_has_assimilation_info:
+            if obs_seq.has_assimilation_info() != first_has_assimilation_info:
                 raise ValueError(
                     "All observation sequences must have assimilation info."
                 )
-            if obs_seq.has_posterior != first_has_posterior:
+            if obs_seq.has_posterior() != first_has_posterior:
                 raise ValueError(
                     "All observation sequences must have the posterior info."
                 )
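Two caller-visible changes land in this hunk. The composite routine no longer returns a filtered copy: in 0.5.0 it returned a DataFrame with the component rows replaced by composites, while in 0.6.0 it appends the derived composite rows to self.df in place and returns None, keeping the original component rows. And join now probes assimilation and posterior information through the new methods. A hedged sketch of the 0.6.0 composite flow; the public method name composite_types is an assumption based on the argument shown in the docstring hunk above:

.. code-block:: python

    n_before = len(obs_seq.df)
    obs_seq.composite_types()              # 0.6.0: appends composite rows to obs_seq.df, returns None
    n_added = len(obs_seq.df) - n_before   # number of derived composite observations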
@@ -908,7 +983,7 @@ class obs_sequence:
         combo.loc_mod = first_loc_mod
 
         # check the copies are compatible (list of copies to combine?)
-        # subset of copies if needed
+        # subset of copies if needed  # @todo HK 1d or 3d
         if copies:
             start_required_columns = ["obs_num", "observation"]
             end_required_columns = [
@@ -1009,30 +1084,40 @@ class obs_sequence:
 
         # create linked list for obs
         combo.df = combined_df.sort_values(by="time").reset_index(drop=True)
-        combo.df["linked_list"] = obs_sequence.generate_linked_list_pattern(
+        combo.df["linked_list"] = ObsSequence.generate_linked_list_pattern(
            len(combo.df)
        )
        combo.df["obs_num"] = combined_df.index + 1
        combo.create_header(len(combo.df))
 
-        # set assimilation info (mean and spread) (prior and posterior)
-        combo.has_assimilation_info = "prior_ensemble_mean".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-        combo.has_assimilation_info = "prior_ensemble_spread".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-        combo.has_posterior = "posterior_ensemble_mean".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-        combo.has_posterior = "posterior_ensemble_spread".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-
        return combo
 
+    def has_assimilation_info(self):
+        """
+        Check if the DataFrame has prior information.
+
+        Returns:
+            bool: True if both 'prior_ensemble_mean' and 'prior_ensemble_spread' columns are present, False otherwise.
+        """
+        return "prior_ensemble_mean".casefold() in map(
+            str.casefold, self.df.columns
+        ) and "prior_ensemble_spread".casefold() in map(str.casefold, self.df.columns)
+
+    def has_posterior(self):
+        """
+        Check if the DataFrame has posterior information.
+
+        Returns:
+            bool: True if both 'posterior_ensemble_mean' and 'posterior_ensemble_spread' columns are present, False otherwise.
+        """
+        return "posterior_ensemble_mean".casefold() in map(
+            str.casefold, self.df.columns
+        ) and "posterior_ensemble_spread".casefold() in map(
+            str.casefold, self.df.columns
+        )
+
     def create_header(self, n):
-        """Create a header for the obs_seq file from the obs_sequence object."""
+        """Create a header for the obs_seq file from the ObsSequence object."""
         assert (
             self.n_copies == self.n_non_qc + self.n_qc
         ), "n_copies must be equal to n_non_qc + n_qc"
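The former has_assimilation_info / has_posterior boolean attributes are now predicates derived from the DataFrame on every call, so they stay correct when copies are added or dropped after construction. The check is case-insensitive via casefold and requires both the mean and spread columns:

.. code-block:: python

    # The predicate pattern used above: case-insensitive column membership.
    cols = ["observation", "Prior_ensemble_mean", "prior_ensemble_spread"]
    has_prior = (
        "prior_ensemble_mean".casefold() in map(str.casefold, cols)
        and "prior_ensemble_spread".casefold() in map(str.casefold, cols)
    )  # True: both columns present, regardless of case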
@@ -1065,7 +1150,7 @@ def load_yaml_to_dict(file_path):
             return yaml.safe_load(file)
     except Exception as e:
         print(f"Error loading YAML file: {e}")
-        return None
+        raise
 
 
 def convert_dart_time(seconds, days):
@@ -1093,17 +1178,39 @@ def construct_composit(df_comp, composite, components):
         components (list of str): A list containing the type names of the two components to be combined.
 
     Returns:
-        merged_df (pd.DataFrame): The updated DataFrame with the new composite rows added.
+        merged_df (pd.DataFrame): A DataFrame containing the new composite rows.
     """
     selected_rows = df_comp[df_comp["type"] == components[0].upper()]
     selected_rows_v = df_comp[df_comp["type"] == components[1].upper()]
 
-    columns_to_combine = df_comp.filter(regex="ensemble").columns.tolist()
-    columns_to_combine.append("observation")  # TODO HK: bias, sq_err, obs_err_var
+    prior_columns_to_combine = df_comp.filter(regex="prior_ensemble").columns.tolist()
+    posterior_columns_to_combine = df_comp.filter(
+        regex="posterior_ensemble"
+    ).columns.tolist()
+    columns_to_combine = (
+        prior_columns_to_combine
+        + posterior_columns_to_combine
+        + ["observation", "obs_err_var"]
+    )
     merge_columns = ["latitude", "longitude", "vertical", "time"]
-
-    print("duplicates in u: ", selected_rows[merge_columns].duplicated().sum())
-    print("duplicates in v: ", selected_rows_v[merge_columns].duplicated().sum())
+    same_obs_columns = merge_columns + [
+        "observation",
+        "obs_err_var",
+    ]  # same observation is duplicated
+
+    if (
+        selected_rows[same_obs_columns].duplicated().sum() > 0
+        or selected_rows_v[same_obs_columns].duplicated().sum() > 0
+    ):
+        print(
+            f"{selected_rows[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
+        )
+        print(f"{selected_rows[same_obs_columns]}")
+        print(
+            f"{selected_rows_v[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
+        )
+        print(f"{selected_rows_v[same_obs_columns]}")
+        raise Exception("There are duplicates in the components.")
 
     # Merge the two DataFrames on location and time columns
     merged_df = pd.merge(