PyPI - pydartdiags - Versions diffs - 0.6.0__tar.gz → 0.6.1__tar.gz - Mend

pydartdiags 0.6.0.tar.gz → 0.6.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pydartdiags might be problematic. Click here for more details.

Files changed (24) hide show

{pydartdiags-0.6.0/src/pydartdiags.egg-info → pydartdiags-0.6.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydartdiags
-Version: 0.6.0
+Version: 0.6.1
 Summary: Observation Sequence Diagnostics for DART
 Home-page: https://github.com/NCAR/pyDARTdiags.git
 Author: Helen Kershaw

{pydartdiags-0.6.0 → pydartdiags-0.6.1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "pydartdiags"
-version = "0.6.0"
+version = "0.6.1"
 authors = [
   { name="Helen Kershaw", email="hkershaw@ucar.edu" },
 ]

{pydartdiags-0.6.0 → pydartdiags-0.6.1}/setup.py RENAMED Viewed

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 setup(
     name="pydartdiags",
-    version="0.6.0",
+    version="0.6.1",
     packages=find_packages(where="src"),
     package_dir={"": "src"},
     include_package_data=True,

{pydartdiags-0.6.0 → pydartdiags-0.6.1}/src/pydartdiags/obs_sequence/obs_sequence.py RENAMED Viewed

@@ -184,6 +184,14 @@ class ObsSequence:
         }
         self.df = self.df.rename(columns=rename_dict)
+        if self.is_binary(file):
+            # binary files do not contain "OBS      X" records, so set linked list from df.
+            self.update_attributes_from_df()
+        # Replace MISSING_R8s with NaNs in posterior stats where DART_quality_control = 2
+        if self.has_posterior():
+            ObsSequence.replace_qc2_nan(self.df)
     def create_all_obs(self):
         """steps through the generator to create a
         list of all observations in the sequence
@@ -197,7 +205,7 @@ class ObsSequence:
     def obs_to_list(self, obs):
         """put single observation into a list"""
         data = []
-        data.append(obs[0].split()[1])  # obs_num
+        data.append(int(obs[0].split()[1]))  # obs_num
         data.extend(list(map(float, obs[1 : self.n_copies + 1])))  # all the copies
         data.append(obs[self.n_copies + 1])  # linked list info
         try:  # HK todo only have to check loc3d or loc1d for the first observation, the whole file is the same
@@ -219,9 +227,9 @@ class ObsSequence:
                     "Neither 'loc3d' nor 'loc1d' could be found in the observation sequence."
                 )
         typeI = obs.index("kind")  # type of observation
-        type_value = obs[typeI + 1]
-        if not self.types:
-            data.append("Identity")
+        type_value = int(obs[typeI + 1])
+        if type_value < 0:
+            data.append(type_value)
         else:
             data.append(self.types[type_value])  # observation type
@@ -283,14 +291,22 @@ class ObsSequence:
                 + str(self.reversed_vert[data[self.n_copies + 5]])
             )  # location x, y, z, vert
             obs.append("kind")  # this is type of observation
-            obs.append(self.reverse_types[data[self.n_copies + 6]])  # observation type
+            obs_type = data[self.n_copies + 6]
+            if isinstance(obs_type, str):
+                obs.append(self.reverse_types[obs_type])  # observation type
+            else:
+                obs.append(obs_type)  # Identity obs negative integer
             # Convert metadata to a string and append !HK @todo you are not converting to string
             obs.extend(data[self.n_copies + 7])  # metadata
             obs.extend(data[self.n_copies + 8])  # external forward operator
         elif self.loc_mod == "loc1d":
             obs.append(data[self.n_copies + 2])  # 1d location
             obs.append("kind")  # this is type of observation
-            obs.append(self.reverse_types[data[self.n_copies + 3]])  # observation type
+            obs_type = data[self.n_copies + 3]
+            if isinstance(obs_type, str):
+                obs.append(self.reverse_types[obs_type])  # observation type
+            else:
+                obs.append(obs_type)  # Identity obs negative integer
             obs.extend(data[self.n_copies + 4])  # metadata
             obs.extend(data[self.n_copies + 5])  # external forward operator
         obs.append(" ".join(map(str, data[-4:-2])))  # seconds, days
@@ -316,14 +332,17 @@ class ObsSequence:
         This function writes the observation sequence stored in the obs_seq.DataFrame to a specified file.
         It updates the header with the number of observations, converts coordinates back to radians
-        if necessary, drops unnecessary columns, sorts the DataFrame by time, and generates a linked
-        list pattern for reading by DART programs.
+        if necessary, reverts NaNs back to MISSING_R8 for observations with QC=2, drops unnecessary
+        columns, sorts the DataFrame by time, and generates a linked list pattern for reading by DART
+        programs.
         Args:
             file (str): The path to the file where the observation sequence will be written.
         Notes:
             - Longitude and latitude are converted back to radians if the location model is 'loc3d'.
+            - The replacement of MISSING_R8 values with NaNs for any obs that failed the posterior
+              forward observation operators (QC2) is reverted.
             - The 'bias' and 'sq_err' columns are dropped if they exist in the DataFrame.
             - The DataFrame is sorted by the 'time' column.
             - An 'obs_num' column is added to the DataFrame to number the observations in time order.
@@ -334,7 +353,8 @@ class ObsSequence:
         """
-        self.create_header_from_dataframe()
+        # Update attributes, header, and linked list from dataframe
+        self.update_attributes_from_df()
         with open(file, "w") as f:
@@ -358,15 +378,9 @@ class ObsSequence:
             if "midpoint" in df_copy.columns:
                 df_copy = df_copy.drop(columns=["midpoint", "vlevels"])
-            # linked list for reading by dart programs
-            df_copy = df_copy.sort_values(
-                by=["time"], kind="stable"
-            )  # sort the DataFrame by time
-            df_copy.reset_index(drop=True, inplace=True)
-            df_copy["obs_num"] = df_copy.index + 1  # obs_num in time order
-            df_copy["linked_list"] = ObsSequence.generate_linked_list_pattern(
-                len(df_copy)
-            )  # linked list pattern
+            # Revert NaNs back to MISSING_R8s
+            if self.has_posterior():
+                ObsSequence.revert_qc2_nan(df_copy)
             def write_row(row):
                 ob_write = self.list_to_obs(row.tolist())
@@ -390,13 +404,16 @@ class ObsSequence:
             dict: The types dictionary with keys sorted in numerical order.
         """
         # Create a dictionary of observation types from the dataframe
-        unique_types = df["type"].unique()
+        # Ignore Identity obs (negative integers)
+        unique_types = df.loc[
+            df["type"].apply(lambda x: isinstance(x, str)), "type"
+        ].unique()
         # Ensure all unique types are in reverse_types
         for obs_type in unique_types:
             if obs_type not in reverse_types:
-                new_id = int(max(reverse_types.values(), default=0)) + 1
-                reverse_types[obs_type] = str(new_id)
+                new_id = max(reverse_types.values(), default=0) + 1
+                reverse_types[obs_type] = new_id
         not_sorted_types = {
             reverse_types[obs_type]: obs_type for obs_type in unique_types
@@ -431,9 +448,7 @@ class ObsSequence:
         self.header.append(f"{len(self.types)}")
         for key, value in self.types.items():
             self.header.append(f"{key} {value}")
-        self.header.append(
-            f"num_copies: {self.n_non_qc}  num_qc: {self.n_qc}"
-        )  # @todo HK not keeping track if num_qc changes
+        self.header.append(f"num_copies: {self.n_non_qc}  num_qc: {self.n_qc}")
         self.header.append(f"num_obs: {num_obs:>10} max_num_obs: {num_obs:>10}")
         stats_cols = [
             "prior_bias",
@@ -692,7 +707,8 @@ class ObsSequence:
     def collect_obs_types(header):
         """Create a dictionary for the observation types in the obs_seq header"""
         num_obs_types = int(header[2])
-        types = dict([x.split() for x in header[3 : num_obs_types + 3]])
+        # The first line containing obs types is the 4th line in an obs_seq file.
+        types = {int(x.split()[0]): x.split()[1] for x in header[3 : num_obs_types + 3]}
         return types
     @staticmethod
@@ -856,18 +872,45 @@ class ObsSequence:
                 #   kind (type of observation) value
                 obs.append("kind")
-                record_length_bytes = f.read(4)
-                record_length = struct.unpack("i", record_length_bytes)[0]
+                record_length = ObsSequence.read_record_length(f)
                 record = f.read(record_length)
                 kind = f"{struct.unpack('i', record)[0]}"
                 obs.append(kind)
                 ObsSequence.check_trailing_record_length(f, record_length)
+                # Skip metadata (obs_def) and go directly to the time record
+                while True:
+                    pos = f.tell()
+                    record_length = ObsSequence.read_record_length(f)
+                    if record_length is None:
+                        break  # End of file
+                    record = f.read(record_length)
+                    # Check if this record is likely the "time" record (8 bytes, can be unpacked as two ints)
+                    if record_length == 8:
+                        try:
+                            seconds, days = struct.unpack("ii", record)
+                            # If unpack succeeds, this is the time record
+                            f.seek(pos)  # Seek back so the main loop can process it
+                            break
+                        except struct.error:
+                            pass  # Not the time record, keep skipping
+                    ObsSequence.check_trailing_record_length(f, record_length)
                 # time (seconds, days)
                 record_length = ObsSequence.read_record_length(f)
                 record = f.read(record_length)
-                seconds, days = struct.unpack("ii", record)[:8]
+                try:  # This is in case the record is not the time record because of metadata funkiness
+                    seconds, days = struct.unpack("ii", record)
+                except struct.error as e:
+                    print(
+                        f"Reading observation {obs_num}... record length: {record_length} kind {kind}"
+                    )
+                    print(f"")
+                    print(f"Error unpacking seconds and days: {e}")
+                    raise
                 time_string = f"{seconds} {days}"
                 obs.append(time_string)
@@ -882,23 +925,27 @@ class ObsSequence:
                 yield obs
-    def composite_types(self, composite_types="use_default"):
+    def composite_types(self, composite_types="use_default", raise_on_duplicate=False):
         """
-        Set up and construct composite types for the DataFrame.
+        Set up and construct composite observation types for the DataFrame.
-        This function sets up composite types based on a provided YAML configuration or
+        This function sets up composite observation types based on a provided YAML configuration or
         a default configuration. It constructs new composite rows by combining specified
-        components and adds them to the DataFrame.
+        components and adds them to the DataFrame in place.
         Args:
             composite_types (str, optional): The YAML configuration for composite types.
-            If 'use_default', the default configuration is used. Otherwise, a custom YAML configuration can be provided.
+                If 'use_default', the default configuration is used. Otherwise, a custom YAML
+                configuration can be provided.
+            raise_on_duplicate (bool, optional): If True, raises an exception if there are
+                duplicates in the components. Otherwise (the default, False), duplicates are
+                treated as though they are distinct observations.
         Returns:
             pd.DataFrame: The updated DataFrame with the new composite rows added.
         Raises:
-            Exception: If there are repeat values in the components.
+            Exception: If there are repeat values in the components and raise_on_duplicate = True
         """
         if composite_types == "use_default":
@@ -924,7 +971,10 @@ class ObsSequence:
         df = pd.DataFrame()
         for key in self.composite_types_dict:
             df_new = construct_composit(
-                df_comp, key, self.composite_types_dict[key]["components"]
+                df_comp,
+                key,
+                self.composite_types_dict[key]["components"],
+                raise_on_duplicate,
             )
             df = pd.concat([df, df_new], axis=0)
@@ -1045,53 +1095,49 @@ class ObsSequence:
                 if item in obs_sequences[0].qc_copie_names
             ]
-            combo.n_copies = len(combo.copie_names)
-            combo.n_qc = len(combo.qc_copie_names)
-            combo.n_non_qc = len(combo.non_qc_copie_names)
         else:
             for obs_seq in obs_sequences:
                 if not obs_sequences[0].df.columns.isin(obs_seq.df.columns).all():
                     raise ValueError(
                         "All observation sequences must have the same copies."
                     )
-            combo.n_copies = obs_sequences[0].n_copies
-            combo.n_qc = obs_sequences[0].n_qc
-            combo.n_non_qc = obs_sequences[0].n_non_qc
             combo.copie_names = obs_sequences[0].copie_names
+            combo.non_qc_copie_names = obs_sequences[0].non_qc_copie_names
+            combo.qc_copie_names = obs_sequences[0].qc_copie_names
+            combo.n_copies = len(combo.copie_names)
         # todo HK @todo combine synonyms for obs?
         # Initialize combined data
-        combined_types = []
-        combined_df = pd.DataFrame()
-        combo.all_obs = None  # set to none to force writing from the dataframe if write_obs_seq is called
+        combo.df = pd.DataFrame()
         # Iterate over the list of observation sequences and combine their data
         for obs_seq in obs_sequences:
             if copies:
-                combined_df = pd.concat(
-                    [combined_df, obs_seq.df[requested_columns]], ignore_index=True
+                combo.df = pd.concat(
+                    [combo.df, obs_seq.df[requested_columns]], ignore_index=True
                 )
             else:
-                combined_df = pd.concat([combined_df, obs_seq.df], ignore_index=True)
-            combined_types.extend(list(obs_seq.reverse_types.keys()))
-        # create dictionary of types
-        keys = set(combined_types)
-        combo.reverse_types = {item: i + 1 for i, item in enumerate(keys)}
-        combo.types = {v: k for k, v in combo.reverse_types.items()}
-        # create linked list for obs
-        combo.df = combined_df.sort_values(by="time").reset_index(drop=True)
-        combo.df["linked_list"] = ObsSequence.generate_linked_list_pattern(
-            len(combo.df)
-        )
-        combo.df["obs_num"] = combined_df.index + 1
-        combo.create_header(len(combo.df))
+                combo.df = pd.concat([combo.df, obs_seq.df], ignore_index=True)
+        # update ObsSequence attributes from the combined DataFrame
+        combo.update_attributes_from_df()
         return combo
+    @staticmethod
+    def update_linked_list(df):
+        """
+        Sorts the DataFrame by 'time', resets the index, and adds/updates 'linked_list'
+        and 'obs_num' columns in place.
+        Modifies the input DataFrame directly.
+        """
+        df.sort_values(by="time", inplace=True, kind="stable")
+        df.reset_index(drop=True, inplace=True)
+        df["linked_list"] = ObsSequence.generate_linked_list_pattern(len(df))
+        df["obs_num"] = df.index + 1
+        return None
     def has_assimilation_info(self):
         """
         Check if the DataFrame has prior information.
@@ -1134,6 +1180,100 @@ class ObsSequence:
             self.header.append(copie)
         self.header.append(f"first: 1 last: {n}")
+    @staticmethod
+    def replace_qc2_nan(df):
+        """
+        Replace MISSING_R8 values with NaNs in posterior columns for observations where
+        DART_quality_control = 2 (posterior forward observation operators failed)
+        This causes these observations to be ignored in the calculations of posterior statistics
+        """
+        df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_mean"] = np.nan
+        df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_spread"] = np.nan
+        num_post_members = len(
+            df.columns[df.columns.str.startswith("posterior_ensemble_member_")]
+        )
+        for i in range(1, num_post_members + 1):
+            df.loc[
+                df["DART_quality_control"] == 2.0,
+                "posterior_ensemble_member_" + str(i),
+            ] = np.nan
+    @staticmethod
+    def revert_qc2_nan(df):
+        """
+        Revert NaNs back to MISSING_R8s for observations where DART_quality_control = 2
+        (posterior forward observation operators failed)
+        """
+        df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_mean"] = (
+            -888888.000000
+        )
+        df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_spread"] = (
+            -888888.000000
+        )
+        num_post_members = len(
+            df.columns[df.columns.str.startswith("posterior_ensemble_member_")]
+        )
+        for i in range(1, num_post_members + 1):
+            df.loc[
+                df["DART_quality_control"] == 2.0, "posterior_ensemble_member_" + str(i)
+            ] = -888888.000000
+    def update_attributes_from_df(self):
+        """
+        Update all internal data (fields/properties) of the ObsSequence object that
+        depend on the DataFrame (self.df).
+        Call this after self.df is replaced or its structure changes.
+        Important:
+         Assumes copies are all columns between 'obs_num' and 'linked_list' (if present)
+        """
+        # Update columns
+        self.columns = list(self.df.columns)
+        # Update all_obs (list of lists, each row) @todo HK do we need this?
+        self.all_obs = None
+        # Update copie_names, non_qc_copie_names, qc_copie_names, n_copies, n_non_qc, n_qc
+        # Try to infer from columns if possible, else leave as is
+        # Assume copies are all columns between 'obs_num' and 'linked_list' (if present)
+        if "obs_num" in self.df.columns and "linked_list" in self.df.columns:
+            obs_num_idx = self.df.columns.get_loc("obs_num")
+            linked_list_idx = self.df.columns.get_loc("linked_list")
+            self.copie_names = list(self.df.columns[obs_num_idx + 1 : linked_list_idx])
+        else:
+            # Fallback: use previous value or empty
+            self.copie_names = getattr(self, "copie_names", [])
+        self.n_copies = len(self.copie_names)
+        # Try to infer non_qc and qc copies from previous names if possible
+        # Find qc copies first
+        self.qc_copie_names = [c for c in self.copie_names if c in self.qc_copie_names]
+        if self.qc_copie_names == []:  # If no qc copies found, assume all are non-qc
+            self.non_qc_copie_names = self.copie_names
+        else:  # pull out non-qc copies from the copie_names
+            self.non_qc_copie_names = [
+                c for c in self.copie_names if c not in self.qc_copie_names
+            ]
+        self.n_qc = len(self.qc_copie_names)
+        self.n_non_qc = len(self.non_qc_copie_names)
+        # Update header and types and reverse_types
+        self.create_header_from_dataframe()
+        # Update seq (generator should be empty or None if not from file)
+        self.seq = []
+        # Update loc_mod
+        if "vertical" in self.df.columns:
+            self.loc_mod = "loc3d"
+        else:
+            self.loc_mod = "loc1d"
+        # update linked list for obs and obs_nums
+        ObsSequence.update_linked_list(self.df)
 def load_yaml_to_dict(file_path):
     """
@@ -1164,24 +1304,31 @@ def convert_dart_time(seconds, days):
     return time
-def construct_composit(df_comp, composite, components):
+def construct_composit(df_comp, composite, components, raise_on_duplicate):
     """
-    Construct a composite DataFrame by combining rows from two components.
-    This function takes two DataFrames and combines rows from them based on matching
-    location and time. It creates a new row with a composite type by combining
-    specified columns using the square root of the sum of squares method.
+    Creates a new DataFrame by combining pairs of rows from two specified component
+    types in an observation DataFrame. It matches rows based on location and time,
+    and then combines certain columns using the square root of the sum of squares
+    of the components.
     Args:
         df_comp (pd.DataFrame): The DataFrame containing the component rows to be combined.
         composite (str): The type name for the new composite rows.
         components (list of str): A list containing the type names of the two components to be combined.
+        raise_on_duplicate (bool): If True, raises an exception if there are duplicates in the components.
+            Otherwise deals with duplicates as though they are distinct observations.
     Returns:
         merged_df (pd.DataFrame): A DataFrame containing the new composite rows.
     """
+    # select rows for the two components
+    if len(components) != 2:
+        raise ValueError("components must be a list of two component types.")
     selected_rows = df_comp[df_comp["type"] == components[0].upper()]
     selected_rows_v = df_comp[df_comp["type"] == components[1].upper()]
+    selected_rows = selected_rows.copy()
+    selected_rows_v = selected_rows_v.copy()
     prior_columns_to_combine = df_comp.filter(regex="prior_ensemble").columns.tolist()
     posterior_columns_to_combine = df_comp.filter(
@@ -1192,7 +1339,7 @@ def construct_composit(df_comp, composite, components):
         + posterior_columns_to_combine
         + ["observation", "obs_err_var"]
     )
-    merge_columns = ["latitude", "longitude", "vertical", "time"]
+    merge_columns = ["latitude", "longitude", "vertical", "time"]  # @todo HK 1d or 3d
     same_obs_columns = merge_columns + [
         "observation",
         "obs_err_var",
@@ -1202,15 +1349,25 @@ def construct_composit(df_comp, composite, components):
         selected_rows[same_obs_columns].duplicated().sum() > 0
         or selected_rows_v[same_obs_columns].duplicated().sum() > 0
     ):
-        print(
-            f"{selected_rows[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
-        )
-        print(f"{selected_rows[same_obs_columns]}")
-        print(
-            f"{selected_rows_v[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
-        )
-        print(f"{selected_rows_v[same_obs_columns]}")
-        raise Exception("There are duplicates in the components.")
+        if raise_on_duplicate:
+            print(
+                f"{selected_rows[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
+            )
+            print(f"{selected_rows[same_obs_columns]}")
+            print(
+                f"{selected_rows_v[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
+            )
+            print(f"{selected_rows_v[same_obs_columns]}")
+            raise Exception("There are duplicates in the components.")
+        else:
+            selected_rows["dup_num"] = selected_rows.groupby(
+                same_obs_columns
+            ).cumcount()
+            selected_rows_v["dup_num"] = selected_rows_v.groupby(
+                same_obs_columns
+            ).cumcount()
     # Merge the two DataFrames on location and time columns
     merged_df = pd.merge(
@@ -1227,4 +1384,7 @@ def construct_composit(df_comp, composite, components):
         columns=[col for col in merged_df.columns if col.endswith("_v")]
     )
+    if "dup_num" in merged_df.columns:
+        merged_df = merged_df.drop(columns=["dup_num"])
     return merged_df

{pydartdiags-0.6.0 → pydartdiags-0.6.1/src/pydartdiags.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydartdiags
-Version: 0.6.0
+Version: 0.6.1
 Summary: Observation Sequence Diagnostics for DART
 Home-page: https://github.com/NCAR/pyDARTdiags.git
 Author: Helen Kershaw

{pydartdiags-0.6.0 → pydartdiags-0.6.1}/tests/test_obs_sequence.py RENAMED Viewed

@@ -165,6 +165,10 @@ class TestWriteAscii:
             ),
             os.path.join(os.path.dirname(__file__), "data", "obs_seq.1d.final"),
             os.path.join(os.path.dirname(__file__), "data", "obs_seq.out.GSI.small"),
+            os.path.join(os.path.dirname(__file__), "data", "obs_seq.final.qc2_2obs"),
+            os.path.join(os.path.dirname(__file__), "data", "obs_seq.in.all-id"),
+            os.path.join(os.path.dirname(__file__), "data", "obs_seq.in.mix"),
+            os.path.join(os.path.dirname(__file__), "data", "obs_seq.final.wrfhydro"),
         ],
     )
     def test_write_ascii(self, ascii_obs_seq_file_path, temp_dir):
@@ -420,32 +424,14 @@ class TestJoin:
         assert obs_seq_mega.loc_mod == "loc3d"
         assert obs_seq_mega.has_assimilation_info() == True
         assert obs_seq_mega.has_posterior() == False
-        assert list(obs_seq_mega.types.keys()) == list(range(1, 26))  # 25 obs types
+        assert list(obs_seq_mega.types.keys()) == list(range(1, 8))  # 7 obs types
         obs_types = [
-            "AIRCRAFT_TEMPERATURE",
-            "BLUE_LAND_SFC_ALTIMETER",
-            "MARINE_SFC_SPECIFIC_HUMIDITY",
-            "SAT_V_WIND_COMPONENT",
-            "RADIOSONDE_SPECIFIC_HUMIDITY",
-            "MARINE_SFC_TEMPERATURE",
-            "RADIOSONDE_U_WIND_COMPONENT",
-            "MARINE_SFC_ALTIMETER",
-            "AIRCRAFT_V_WIND_COMPONENT",
-            "RADIOSONDE_SURFACE_ALTIMETER",
             "ACARS_TEMPERATURE",
-            "LAND_SFC_ALTIMETER",
-            "MARINE_SFC_V_WIND_COMPONENT",
-            "AIRS_TEMPERATURE",
-            "GPSRO_REFRACTIVITY",
-            "MARINE_SFC_U_WIND_COMPONENT",
             "ACARS_U_WIND_COMPONENT",
-            "RADIOSONDE_V_WIND_COMPONENT",
-            "SAT_U_WIND_COMPONENT",
-            "GREEN_LAND_SFC_ALTIMETER",
             "ACARS_V_WIND_COMPONENT",
-            "RADIOSONDE_TEMPERATURE",
+            "AIRCRAFT_TEMPERATURE",
             "AIRCRAFT_U_WIND_COMPONENT",
-            "AIRS_SPECIFIC_HUMIDITY",
+            "AIRCRAFT_V_WIND_COMPONENT",
             "PINK_LAND_SFC_ALTIMETER",
         ]
         all_obs_present = all(
@@ -720,16 +706,16 @@ class TestUpdateTypesDicts:
         return pd.DataFrame(data)
     def test_update_types_dicts(self, sample_df):
-        reverse_types = {"ACARS_TEMPERATURE": "32", "RADIOSONDE_U_WIND_COMPONENT": "51"}
+        reverse_types = {"ACARS_TEMPERATURE": 32, "RADIOSONDE_U_WIND_COMPONENT": 51}
         expected_reverse_types = {
-            "ACARS_TEMPERATURE": "32",
-            "RADIOSONDE_U_WIND_COMPONENT": "51",
-            "PINEAPPLE_COUNT": "52",
+            "ACARS_TEMPERATURE": 32,
+            "RADIOSONDE_U_WIND_COMPONENT": 51,
+            "PINEAPPLE_COUNT": 52,
         }
         expected_types = {
-            "32": "ACARS_TEMPERATURE",
-            "51": "RADIOSONDE_U_WIND_COMPONENT",
-            "52": "PINEAPPLE_COUNT",
+            32: "ACARS_TEMPERATURE",
+            51: "RADIOSONDE_U_WIND_COMPONENT",
+            52: "PINEAPPLE_COUNT",
         }
         updated_reverse_types, types = obsq.ObsSequence.update_types_dicts(
@@ -846,14 +832,98 @@ class TestCompositeTypes:
                 == orig_df.loc[orig_df["type"] == "ACARS_TEMPERATURE", col].values[0]
             )
-    def test_composite_types_dups(self):
+    def test_composite_types_dups_catch(self):
         test_dir = os.path.dirname(__file__)
         file_path = os.path.join(test_dir, "data", "dups-obs.final")
         dup = obsq.ObsSequence(file_path)
         # Test that composite_types raises an error
         with pytest.raises(Exception, match="There are duplicates in the components."):
-            dup.composite_types()
+            dup.composite_types(raise_on_duplicate=True)
+    def test_composite_types_dups(self):
+        test_dir = os.path.dirname(__file__)
+        file_path = os.path.join(test_dir, "data", "dups-obs.final")
+        obs_seq = obsq.ObsSequence(file_path)
+        # Save the original DataFrame for comparison
+        orig_df = obs_seq.df.copy()
+        # Test that composite_types does not raise an error
+        obs_seq.composite_types(raise_on_duplicate=False)
+        # Verify that the DataFrame has the expected types
+        types = obs_seq.df["type"].unique()
+        expected_composite_types = [
+            "ACARS_TEMPERATURE",
+            "ACARS_U_WIND_COMPONENT",
+            "ACARS_V_WIND_COMPONENT",
+            "ACARS_HORIZONTAL_WIND",
+        ]
+        assert len(types) == len(expected_composite_types)
+        for type in expected_composite_types:
+            assert type in types
+        # Verify composite types are correctly calculated
+        prior_columns = obs_seq.df.filter(regex="prior_ensemble").columns.tolist()
+        posterior_columns = obs_seq.df.filter(
+            regex="posterior_ensemble"
+        ).columns.tolist()
+        combo_cols = ["observation", "obs_err_var"] + prior_columns + posterior_columns
+        for col in combo_cols:
+            u_wind = obs_seq.df.loc[
+                obs_seq.df["type"] == "ACARS_U_WIND_COMPONENT", col
+            ].values[0]
+            v_wind = obs_seq.df.loc[
+                obs_seq.df["type"] == "ACARS_V_WIND_COMPONENT", col
+            ].values[0]
+            wind = obs_seq.df.loc[
+                obs_seq.df["type"] == "ACARS_HORIZONTAL_WIND", col
+            ].values[0]
+            assert np.isclose(
+                np.sqrt(u_wind**2 + v_wind**2), wind
+            ), f"Mismatch in column {col}: {wind} != sqrt({u_wind}^2 + {v_wind}^2)"
+        # Verify that the non-composite columns are unchanged
+        for col in obs_seq.df.columns:
+            if col not in combo_cols:
+                assert (
+                    obs_seq.df.loc[
+                        obs_seq.df["type"] == "ACARS_U_WIND_COMPONENT", col
+                    ].values[0]
+                    == orig_df.loc[
+                        orig_df["type"] == "ACARS_U_WIND_COMPONENT", col
+                    ].values[0]
+                )
+                assert (
+                    obs_seq.df.loc[
+                        obs_seq.df["type"] == "ACARS_V_WIND_COMPONENT", col
+                    ].values[0]
+                    == orig_df.loc[
+                        orig_df["type"] == "ACARS_V_WIND_COMPONENT", col
+                    ].values[0]
+                )
+        # Horizontal wind not in original, should be the same as the component
+        for col in obs_seq.df.columns:
+            if col not in combo_cols and col != "type":
+                assert (
+                    obs_seq.df.loc[
+                        obs_seq.df["type"] == "ACARS_HORIZONTAL_WIND", col
+                    ].values[0]
+                    == obs_seq.df.loc[
+                        obs_seq.df["type"] == "ACARS_U_WIND_COMPONENT", col
+                    ].values[0]
+                )
+        # Verify that the non-composite types are unchanged for all columns
+        for col in obs_seq.df.columns:
+            assert (
+                obs_seq.df.loc[obs_seq.df["type"] == "ACARS_TEMPERATURE", col].values[0]
+                == orig_df.loc[orig_df["type"] == "ACARS_TEMPERATURE", col].values[0]
+            )
     def test_no_yaml_file(self):
         with pytest.raises(Exception):
@@ -874,6 +944,384 @@ class TestCompositeTypes:
         with pytest.raises(yaml.YAMLError):
             obsq.load_yaml_to_dict(broken_file)
+    def test_composite_types_more_than_two_components(self, tmpdir):
+        """A composite definition listing three components must be rejected."""
+        # Composite definition that names three component types instead of two.
+        yaml_text = """
+        acars_super_wind:
+            components: [ACARS_U_WIND_COMPONENT, ACARS_V_WIND_COMPONENT, ACARS_TEMPERATURE]
+        """
+        yaml_path = tmpdir.join("composite_more_than_two.yaml")
+        with open(yaml_path, "w") as handle:
+            handle.write(yaml_text)
+        # The sample sequence lives in the "data" directory beside this file.
+        obs_path = os.path.join(os.path.dirname(__file__), "data", "three-obs.final")
+        sequence = obsq.ObsSequence(obs_path)
+        expected_message = "components must be a list of two component types."
+        # composite_types is expected to refuse the three-component definition.
+        with pytest.raises(Exception, match=expected_message):
+            sequence.composite_types(composite_types=str(yaml_path))
+class TestUpdateAttributesFromDf:
+    def test_update_attributes_from_df(self):
+        """Attributes must follow the DataFrame each time it is swapped out."""
+        obj = obsq.ObsSequence(file=None)
+        df1 = pd.DataFrame(
+            {
+                "obs_num": [1, 2],
+                "observation": [10.0, 20.0],
+                "linked_list": ["-1 2 -1", "1 -1 -1"],
+                "type": ["A", "B"],
+                "time": [dt.datetime(2020, 1, 1), dt.datetime(2020, 1, 2)],
+            }
+        )
+        obj.df = df1
+        obj.update_attributes_from_df()
+        # Check initial state
+        assert obj.columns == ["obs_num", "observation", "linked_list", "type", "time"]
+        # Identity check: "== None" can be redirected by a custom or elementwise __eq__.
+        assert obj.all_obs is None
+        assert obj.copie_names == ["observation"]
+        assert obj.n_copies == 1
+        # Check linked_list and obs_num updated
+        assert list(obj.df["obs_num"]) == [1, 2]
+        assert list(
+            obj.df["linked_list"]
+        ) == obsq.ObsSequence.generate_linked_list_pattern(2)
+        # Change the DataFrame: one row, plus an extra prior_ensemble_mean copy
+        df2 = pd.DataFrame(
+            {
+                "obs_num": [3],
+                "observation": [30.0],
+                "prior_ensemble_mean": [15.0],
+                "linked_list": ["-1 -1 -1"],
+                "type": ["C"],
+                "time": [dt.datetime(2020, 1, 3)],
+            }
+        )
+        obj.df = df2
+        obj.update_attributes_from_df()
+        # Check updated state
+        assert obj.columns == [
+            "obs_num",
+            "observation",
+            "prior_ensemble_mean",
+            "linked_list",
+            "type",
+            "time",
+        ]
+        assert obj.all_obs is None
+        assert "prior_ensemble_mean" in obj.copie_names
+        assert obj.n_copies == 2  # observation and prior_ensemble_mean
+        # The single row went in as obs_num 3; the test expects it renumbered to 1.
+        assert list(obj.df["obs_num"]) == [1]
+        assert list(
+            obj.df["linked_list"]
+        ) == obsq.ObsSequence.generate_linked_list_pattern(1)
+    def test_update_attributes_from_df_drop_column(self):
+        """Dropping a copy column must remove it from the copy bookkeeping."""
+        seq = obsq.ObsSequence(file=None)
+        seq.df = pd.DataFrame(
+            {
+                "obs_num": [1, 2],
+                "observation": [10.0, 20.0],
+                "prior_ensemble_mean": [1.5, 2.5],
+                "linked_list": ["-1 2 -1", "1 -1 -1"],
+                "type": ["A", "B"],
+                "time": [dt.datetime(2020, 1, 1), dt.datetime(2020, 1, 2)],
+            }
+        )
+        seq.update_attributes_from_df()
+        # Before the drop: two copies, observation and prior_ensemble_mean.
+        assert "prior_ensemble_mean" in seq.copie_names
+        assert seq.n_copies == 2
+        assert seq.df["obs_num"].tolist() == [1, 2]
+        pattern_before = obsq.ObsSequence.generate_linked_list_pattern(2)
+        assert seq.df["linked_list"].tolist() == pattern_before
+        # Remove the extra copy and refresh the attributes.
+        trimmed = seq.df.drop(columns=["prior_ensemble_mean"])
+        seq.df = trimmed
+        seq.update_attributes_from_df()
+        # After the drop: only the observation copy is left, rows are untouched.
+        assert "prior_ensemble_mean" not in seq.copie_names
+        assert seq.n_copies == 1
+        assert seq.df["obs_num"].tolist() == [1, 2]
+        pattern_after = obsq.ObsSequence.generate_linked_list_pattern(2)
+        assert seq.df["linked_list"].tolist() == pattern_after
+    def test_update_attributes_from_df_qc_counts(self):
+        """QC bookkeeping must drop to zero once the only QC column is removed."""
+        seq = obsq.ObsSequence(file=None)
+        seq.df = pd.DataFrame(
+            {
+                "obs_num": [1, 2],
+                "observation": [10.0, 20.0],
+                "DART_QC": [0, 1],
+                "linked_list": ["-1 2 -1", "1 -1 -1"],
+                "type": ["A", "B"],
+                "time": [dt.datetime(2020, 1, 1), dt.datetime(2020, 1, 2)],
+            }
+        )
+        # Seed the copy bookkeeping by hand: one data copy and one QC copy.
+        seq.copie_names = ["observation", "DART_QC"]
+        seq.non_qc_copie_names = ["observation"]
+        seq.qc_copie_names = ["DART_QC"]
+        seq.n_non_qc = 1
+        seq.n_qc = 1
+        seq.update_attributes_from_df()
+        # With both columns still present the counts must be unchanged.
+        assert (seq.n_non_qc, seq.n_qc) == (1, 1)
+        assert seq.non_qc_copie_names == ["observation"]
+        assert seq.qc_copie_names == ["DART_QC"]
+        assert seq.df["obs_num"].tolist() == [1, 2]
+        pattern_before = obsq.ObsSequence.generate_linked_list_pattern(2)
+        assert seq.df["linked_list"].tolist() == pattern_before
+        # Remove the QC column and refresh.
+        without_qc = seq.df.drop(columns=["DART_QC"])
+        seq.df = without_qc
+        seq.update_attributes_from_df()
+        # One non-QC copy remains and no QC copies.
+        assert (seq.n_non_qc, seq.n_qc) == (1, 0)
+        assert seq.non_qc_copie_names == ["observation"]
+        assert seq.qc_copie_names == []
+        assert seq.df["obs_num"].tolist() == [1, 2]
+        pattern_after = obsq.ObsSequence.generate_linked_list_pattern(2)
+        assert seq.df["linked_list"].tolist() == pattern_after
+    def test_update_attributes_from_df_drop_multiple_qc_copies(self):
+        """Dropping two of three QC columns must leave exactly one QC copy."""
+        seq = obsq.ObsSequence(file=None)
+        # Start with one non-QC copy (observation) and three QC copies.
+        seq.df = pd.DataFrame(
+            {
+                "obs_num": [1, 2],
+                "observation": [10.0, 20.0],
+                "QC1": [0, 1],
+                "QC2": [1, 0],
+                "QC3": [2, 2],
+                "linked_list": ["-1 2 -1", "1 -1 -1"],
+                "type": ["A", "B"],
+                "time": [dt.datetime(2020, 1, 1), dt.datetime(2020, 1, 2)],
+            }
+        )
+        # Seed the copy bookkeeping by hand so it matches the frame above.
+        seq.copie_names = ["observation", "QC1", "QC2", "QC3"]
+        seq.non_qc_copie_names = ["observation"]
+        seq.qc_copie_names = ["QC1", "QC2", "QC3"]
+        seq.n_non_qc = 1
+        seq.n_qc = 3
+        seq.update_attributes_from_df()
+        # With every column still present the counts must be unchanged.
+        assert (seq.n_non_qc, seq.n_qc) == (1, 3)
+        assert seq.non_qc_copie_names == ["observation"]
+        assert seq.qc_copie_names == ["QC1", "QC2", "QC3"]
+        assert seq.df["obs_num"].tolist() == [1, 2]
+        pattern_before = obsq.ObsSequence.generate_linked_list_pattern(2)
+        assert seq.df["linked_list"].tolist() == pattern_before
+        # Remove two of the QC columns and refresh.
+        reduced = seq.df.drop(columns=["QC2", "QC3"])
+        seq.df = reduced
+        seq.update_attributes_from_df()
+        # Only QC1 should survive next to the observation copy.
+        assert (seq.n_non_qc, seq.n_qc) == (1, 1)
+        assert seq.non_qc_copie_names == ["observation"]
+        assert seq.qc_copie_names == ["QC1"]
+        assert seq.copie_names == ["observation", "QC1"]
+        assert seq.df["obs_num"].tolist() == [1, 2]
+        pattern_after = obsq.ObsSequence.generate_linked_list_pattern(2)
+        assert seq.df["linked_list"].tolist() == pattern_after
+    def test_update_attributes_from_df_drop_row(self):
+        """Removing a row must renumber obs_num and rebuild the linked list."""
+        seq = obsq.ObsSequence(file=None)
+        timestamps = [
+            dt.datetime(2020, 1, 1),
+            dt.datetime(2020, 1, 2),
+            dt.datetime(2020, 1, 3),
+        ]
+        seq.df = pd.DataFrame(
+            {
+                "obs_num": [1, 2, 3],
+                "observation": [10.0, 20.0, 30.0],
+                "linked_list": ["-1 2 -1", "1 3 -1", "2 -1 -1"],
+                "type": ["A", "B", "C"],
+                "time": timestamps,
+            }
+        )
+        seq.update_attributes_from_df()
+        # Take out the middle row (index label 1) and renumber the index.
+        without_middle = seq.df.drop(index=1)
+        seq.df = without_middle.reset_index(drop=True)
+        seq.update_attributes_from_df()
+        # Two rows are left; the test expects obs_num renumbered to 1, 2.
+        assert seq.df["obs_num"].tolist() == [1, 2]
+        expected_links = obsq.ObsSequence.generate_linked_list_pattern(2)
+        assert seq.df["linked_list"].tolist() == expected_links
+        assert seq.n_copies == 1
+        assert seq.n_qc == 0
+        assert seq.n_non_qc == 1
+        assert seq.copie_names == ["observation"]
+        assert seq.columns == ["obs_num", "observation", "linked_list", "type", "time"]
+    def test_update_attributes_from_df_add_column(self):
+        """A column inserted before linked_list must be counted as a new copy."""
+        seq = obsq.ObsSequence(file=None)
+        seq.df = pd.DataFrame(
+            {
+                "obs_num": [1, 2],
+                "observation": [10.0, 20.0],
+                "linked_list": ["-1 2 -1", "1 -1 -1"],
+                "type": ["A", "B"],
+                "time": [dt.datetime(2020, 1, 1), dt.datetime(2020, 1, 2)],
+            }
+        )
+        seq.update_attributes_from_df()
+        # Place the new column directly in front of 'linked_list'.
+        position = seq.df.columns.get_loc("linked_list")
+        seq.df.insert(loc=position, column="prior_ensemble_mean", value=[1.5, 2.5])
+        seq.update_attributes_from_df()
+        # The new column must sit between 'observation' and 'linked_list'.
+        expected_order = [
+            "obs_num",
+            "observation",
+            "prior_ensemble_mean",
+            "linked_list",
+            "type",
+            "time",
+        ]
+        assert list(seq.df.columns) == expected_order
+        assert "prior_ensemble_mean" in seq.copie_names
+        # Two copies (observation and prior_ensemble_mean), neither of them QC.
+        assert seq.n_copies == 2
+        assert seq.n_qc == 0
+        assert seq.n_non_qc == 2
+        assert seq.df["obs_num"].tolist() == [1, 2]
+        expected_links = obsq.ObsSequence.generate_linked_list_pattern(2)
+        assert seq.df["linked_list"].tolist() == expected_links
+class TestQC2Replacement:
+    @pytest.fixture
+    def obs_seq(self):
+        """Provide an ObsSequence whose QC-2 rows hold -888888.0 posterior values."""
+        missing = -888888.0
+        # Rows 1 and 2 carry DART_quality_control == 2 and the missing value.
+        frame = pd.DataFrame(
+            {
+                "DART_quality_control": [0, 2, 2, 0],
+                "posterior_ensemble_mean": [1.1, missing, missing, 2.2],
+                "posterior_ensemble_spread": [0.1, missing, missing, 0.2],
+                "posterior_ensemble_member_1": [1.0, missing, missing, 2.0],
+                "posterior_ensemble_member_2": [1.2, missing, missing, 2.3],
+            }
+        )
+        # Build an empty sequence (no file) and attach the frame to it.
+        sequence = obsq.ObsSequence(file=None)
+        sequence.df = frame
+        return sequence
+    @pytest.fixture
+    def obs_seq_nan(self):
+        """Provide an ObsSequence whose QC-2 rows hold NaN posterior values."""
+        nan = np.nan
+        # Rows 1 and 2 carry DART_quality_control == 2 and NaN posteriors.
+        frame = pd.DataFrame(
+            {
+                "DART_quality_control": [0, 2, 2, 0],
+                "posterior_ensemble_mean": [1.1, nan, nan, 2.2],
+                "posterior_ensemble_spread": [0.1, nan, nan, 0.2],
+                "posterior_ensemble_member_1": [1.0, nan, nan, 2.0],
+                "posterior_ensemble_member_2": [1.2, nan, nan, 2.3],
+            }
+        )
+        # Build an empty sequence (no file) and attach the frame to it.
+        sequence = obsq.ObsSequence(file=None)
+        sequence.df = frame
+        return sequence
+    def test_replace_qc2_nan(self, obs_seq):
+        """Posterior values on rows with DART_quality_control == 2 must become NaN."""
+        frame = obs_seq.df
+        # Run the static replacement directly on the fixture's DataFrame.
+        obsq.ObsSequence.replace_qc2_nan(frame)
+        qc2_rows = frame["DART_quality_control"] == 2.0
+        posterior_columns = (
+            "posterior_ensemble_mean",
+            "posterior_ensemble_spread",
+            "posterior_ensemble_member_1",
+            "posterior_ensemble_member_2",
+        )
+        # Every posterior column must be null on every QC-2 row.
+        for column in posterior_columns:
+            assert frame.loc[qc2_rows, column].isnull().all()
+    def test_revert_qc2_nan(self, obs_seq_nan):
+        """NaN posteriors on QC-2 rows must be turned back into -888888.0."""
+        frame = obs_seq_nan.df
+        # Run the static revert directly on the fixture's DataFrame.
+        obsq.ObsSequence.revert_qc2_nan(frame)
+        qc2_rows = frame["DART_quality_control"] == 2.0
+        posterior_columns = (
+            "posterior_ensemble_mean",
+            "posterior_ensemble_spread",
+            "posterior_ensemble_member_1",
+            "posterior_ensemble_member_2",
+        )
+        # Every posterior column must hold -888888.0 on every QC-2 row.
+        for column in posterior_columns:
+            assert (frame.loc[qc2_rows, column] == -888888.0).all()
 # When this file is executed directly, hand control to pytest's entry point.
 if __name__ == "__main__":
     pytest.main()