pydartdiags 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydartdiags/obs_sequence/obs_sequence.py +310 -158
- pydartdiags/stats/stats.py +93 -15
- {pydartdiags-0.5.1.dist-info → pydartdiags-0.6.1.dist-info}/METADATA +2 -2
- {pydartdiags-0.5.1.dist-info → pydartdiags-0.6.1.dist-info}/RECORD +7 -7
- {pydartdiags-0.5.1.dist-info → pydartdiags-0.6.1.dist-info}/WHEEL +1 -1
- {pydartdiags-0.5.1.dist-info → pydartdiags-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {pydartdiags-0.5.1.dist-info → pydartdiags-0.6.1.dist-info}/top_level.txt +0 -0
pydartdiags/obs_sequence/obs_sequence.py CHANGED

@@ -19,17 +19,46 @@ def requires_assimilation_info(func):
         return wrapper
 
 
-class obs_sequence:
+class ObsSequence:
     """
-    Initialize an obs_sequence object from an ASCII or binary observation sequence file,
-    or create an empty obs_sequence object from scratch.
+    Initialize an ObsSequence object from an ASCII or binary observation sequence file,
+    or create an empty ObsSequence object from scratch.
+
+    1D observations are given a datetime of days, seconds since 2000-01-01 00:00:00
+
+    3D observations are given a datetime of days, seconds since 1601-01-01 00:00:00 (DART Gregorian calendar)
 
     Args:
         file (str): The input observation sequence ASCII or binary file.
-
+            If None, an empty ObsSequence object is created from scratch.
+        synonyms (list, optional): List of additional synonyms for the observation column in the DataFrame.
+            The default list is
+
+            .. code-block:: python
+
+                ['NCEP BUFR observation',
+                 'AIRS observation',
+                 'GTSPP observation',
+                 'SST observation',
+                 'observations',
+                 'WOD observation']
+
+            You can add more synonyms by providing a list of strings when
+            creating the ObsSequence object.
+
+            .. code-block:: python
+
+                ObsSequence(file, synonyms=['synonym1', 'synonym2'])
+
+    Raises:
+        ValueError: If neither 'loc3d' nor 'loc1d' could be found in the observation sequence.
+
+    Examples:
+
+        .. code-block:: python
+
+            obs_seq = ObsSequence(file='obs_seq.final')
 
-    Returns:
-        An obs_sequence object
 
     Attributes:
         df (pandas.DataFrame): The DataFrame containing the observation sequence data.
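Reviewer note: the renamed class keeps the same construction API. A minimal usage sketch based on the docstring above; the file names are placeholders, and the import path follows the module layout in this wheel's RECORD.

.. code-block:: python

    from pydartdiags.obs_sequence.obs_sequence import ObsSequence

    obs_seq = ObsSequence(file='obs_seq.final')   # parse an ASCII or binary obs_seq file
    empty = ObsSequence(file=None)                # build an empty sequence from scratch
    custom = ObsSequence('obs_seq.final',
                         synonyms=['MY observation'])  # extra names for the observation column
    print(obs_seq.df.head())                      # observations land in a pandas DataFrame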
@@ -54,34 +83,18 @@ class obs_sequence:
         - scale height: 'VERTISSCALEHEIGHT' (unitless)
         loc_mod (str): The location model, either 'loc3d' or 'loc1d'.
             For 3D sphere models: latitude and longitude are in degrees in the DataFrame.
-        types (dict): Dictionary of types of observations the observation sequence,
+        types (dict): Dictionary of types of observations in the observation sequence,
             e.g. {23: 'ACARS_TEMPERATURE'},
         reverse_types (dict): Dictionary of types with keys and values reversed, e.g
             {'ACARS_TEMPERATURE': 23}
         synonyms_for_obs (list): List of synonyms for the observation column in the DataFrame.
-            The default list is
-
-            .. code-block:: python
-
-                [ 'NCEP BUFR observation',
-                  'AIRS observation',
-                  'GTSPP observation',
-                  'SST observation',
-                  'observations',
-                  'WOD observation']
 
-            You can add more synonyms by providing a list of strings when
-            creating the obs_sequence object.
-
-            .. code-block:: python
-
-                obs_sequence(file, synonyms=['synonym1', 'synonym2']).df
 
         seq (generator): Generator of observations from the observation sequence file.
         all_obs (list): List of all observations, each observation is a list.
-            Valid when the obs_sequence is created from a file.
-            Set to None when the obs_sequence is created from scratch or multiple
-            obs_sequences are joined.
+            Valid when the ObsSequence is created from a file.
+            Set to None when the ObsSequence is created from scratch or multiple
+            ObsSequences are joined.
     """
 
     vert = {
@@ -96,27 +109,6 @@ class obs_sequence:
     reversed_vert = {value: key for key, value in vert.items()}
 
     def __init__(self, file, synonyms=None):
-        """
-        Create an obs_sequence object from an ASCII or binary observation sequence file,
-        or create an empty obs_sequence object from scratch.
-
-        Args:
-            file (str): The input observation sequence ASCII or binary file.
-                If None, an empty obs_sequence object is created from scratch.
-            synonyms (list, optional): List of synonyms for the observation column in the DataFrame.
-
-        Returns:
-            an obs_sequence object
-            1D observations are given a datetime of days, seconds since 2000-01-01 00:00:00
-            3D observations are given a datetime of days, seconds since 1601-01-01 00:00:00 (DART Gregorian calendar)
-
-        Examples:
-
-            .. code-block:: python
-
-                obs_seq = obs_sequence(file='obs_seq.final')
-
-        """
 
         self.loc_mod = "None"
         self.file = file
@@ -192,6 +184,14 @@ class obs_sequence:
             }
             self.df = self.df.rename(columns=rename_dict)
 
+            if self.is_binary(file):
+                # binary files do not have "OBS X" in, so set linked list from df.
+                self.update_attributes_from_df()
+
+            # Replace MISSING_R8s with NaNs in posterior stats where DART_quality_control = 2
+            if self.has_posterior():
+                ObsSequence.replace_qc2_nan(self.df)
+
     def create_all_obs(self):
         """steps through the generator to create a
         list of all observations in the sequence
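Reviewer note: the motivation for the new replace_qc2_nan step is that DART's MISSING_R8 sentinel (-888888.0, per the revert_qc2_nan hunk later in this file) silently skews pandas aggregations, while NaN is skipped. A self-contained demonstration:

.. code-block:: python

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "DART_quality_control": [0.0, 2.0, 0.0],
        "posterior_ensemble_mean": [1.2, -888888.0, 0.8],
    })
    print(df["posterior_ensemble_mean"].mean())  # -296295.33..., skewed by the sentinel

    # the same masking replace_qc2_nan applies to QC=2 rows
    df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_mean"] = np.nan
    print(df["posterior_ensemble_mean"].mean())  # 1.0, pandas skips NaN by default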
@@ -205,7 +205,7 @@ class obs_sequence:
     def obs_to_list(self, obs):
         """put single observation into a list"""
         data = []
-        data.append(obs[0].split()[1])  # obs_num
+        data.append(int(obs[0].split()[1]))  # obs_num
         data.extend(list(map(float, obs[1 : self.n_copies + 1])))  # all the copies
         data.append(obs[self.n_copies + 1])  # linked list info
         try:  # HK todo only have to check loc3d or loc1d for the first observation, the whole file is the same
@@ -214,7 +214,7 @@ class obs_sequence:
             data.append(float(location[0]))  # location x
             data.append(float(location[1]))  # location y
             data.append(float(location[2]))  # location z
-            data.append(obs_sequence.vert[int(location[3])])
+            data.append(ObsSequence.vert[int(location[3])])
             self.loc_mod = "loc3d"
         except ValueError:
             try:
@@ -227,9 +227,9 @@ class obs_sequence:
                     "Neither 'loc3d' nor 'loc1d' could be found in the observation sequence."
                 )
         typeI = obs.index("kind")  # type of observation
-        type_value = obs[typeI + 1]
-        if int(type_value) < 0:
-            data.append(int(type_value))
+        type_value = int(obs[typeI + 1])
+        if type_value < 0:
+            data.append(type_value)
         else:
             data.append(self.types[type_value])  # observation type
@@ -291,14 +291,22 @@ class obs_sequence:
                 + str(self.reversed_vert[data[self.n_copies + 5]])
             )  # location x, y, z, vert
             obs.append("kind")  # this is type of observation
-            obs.append(self.reverse_types[data[self.n_copies + 6]])  # observation type
+            obs_type = data[self.n_copies + 6]
+            if isinstance(obs_type, str):
+                obs.append(self.reverse_types[obs_type])  # observation type
+            else:
+                obs.append(obs_type)  # Identity obs negative integer
             # Convert metadata to a string and append !HK @todo you are not converting to string
             obs.extend(data[self.n_copies + 7])  # metadata
             obs.extend(data[self.n_copies + 8])  # external forward operator
         elif self.loc_mod == "loc1d":
             obs.append(data[self.n_copies + 2])  # 1d location
             obs.append("kind")  # this is type of observation
-            obs.append(self.reverse_types[data[self.n_copies + 3]])  # observation type
+            obs_type = data[self.n_copies + 3]
+            if isinstance(obs_type, str):
+                obs.append(self.reverse_types[obs_type])  # observation type
+            else:
+                obs.append(obs_type)  # Identity obs negative integer
             obs.extend(data[self.n_copies + 4])  # metadata
             obs.extend(data[self.n_copies + 5])  # external forward operator
             obs.append(" ".join(map(str, data[-4:-2])))  # seconds, days
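Reviewer note: DART identity observations are encoded as negative integers rather than named types, which is why the round-trip now branches on isinstance. A toy illustration of the lookup added above:

.. code-block:: python

    reverse_types = {"ACARS_TEMPERATURE": 23}

    for obs_type in ["ACARS_TEMPERATURE", -5]:
        if isinstance(obs_type, str):
            print(reverse_types[obs_type])  # named type -> numeric id (prints 23)
        else:
            print(obs_type)                 # identity obs passes through unchanged (prints -5)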
@@ -324,14 +332,17 @@ class obs_sequence:
 
         This function writes the observation sequence stored in the obs_seq.DataFrame to a specified file.
         It updates the header with the number of observations, converts coordinates back to radians
-        if necessary, drops unnecessary columns, sorts the DataFrame by time, and generates a linked
-        list pattern for reading by DART programs.
+        if necessary, reverts NaNs back to MISSING_R8 for observations with QC=2, drops unnecessary
+        columns, sorts the DataFrame by time, and generates a linked list pattern for reading by DART
+        programs.
 
         Args:
             file (str): The path to the file where the observation sequence will be written.
 
         Notes:
             - Longitude and latitude are converted back to radians if the location model is 'loc3d'.
+            - The replacement of MISSING_R8 values with NaNs for any obs that failed the posterior
+              forward observation operators (QC2) is reverted.
             - The 'bias' and 'sq_err' columns are dropped if they exist in the DataFrame.
             - The DataFrame is sorted by the 'time' column.
             - An 'obs_num' column is added to the DataFrame to number the observations in time order.
@@ -342,7 +353,8 @@ class obs_sequence:
 
         """
 
-
+        # Update attributes, header, and linked list from dataframe
+        self.update_attributes_from_df()
 
         with open(file, "w") as f:
 
@@ -366,15 +378,9 @@ class obs_sequence:
         if "midpoint" in df_copy.columns:
             df_copy = df_copy.drop(columns=["midpoint", "vlevels"])
 
-        #
-
-
-        )  # sort the DataFrame by time
-        df_copy.reset_index(drop=True, inplace=True)
-        df_copy["obs_num"] = df_copy.index + 1  # obs_num in time order
-        df_copy["linked_list"] = obs_sequence.generate_linked_list_pattern(
-            len(df_copy)
-        )  # linked list pattern
+        # Revert NaNs back to MISSING_R8s
+        if self.has_posterior():
+            ObsSequence.revert_qc2_nan(df_copy)
 
         def write_row(row):
             ob_write = self.list_to_obs(row.tolist())
|
|
|
398
404
|
dict: The types dictionary with keys sorted in numerical order.
|
|
399
405
|
"""
|
|
400
406
|
# Create a dictionary of observation types from the dataframe
|
|
401
|
-
|
|
407
|
+
# Ignore Identity obs (negative integers)
|
|
408
|
+
unique_types = df.loc[
|
|
409
|
+
df["type"].apply(lambda x: isinstance(x, str)), "type"
|
|
410
|
+
].unique()
|
|
402
411
|
|
|
403
412
|
# Ensure all unique types are in reverse_types
|
|
404
413
|
for obs_type in unique_types:
|
|
405
414
|
if obs_type not in reverse_types:
|
|
406
|
-
new_id =
|
|
407
|
-
reverse_types[obs_type] =
|
|
415
|
+
new_id = max(reverse_types.values(), default=0) + 1
|
|
416
|
+
reverse_types[obs_type] = new_id
|
|
408
417
|
|
|
409
418
|
not_sorted_types = {
|
|
410
419
|
reverse_types[obs_type]: obs_type for obs_type in unique_types
|
|
@@ -439,9 +448,7 @@ class obs_sequence:
|
|
|
439
448
|
self.header.append(f"{len(self.types)}")
|
|
440
449
|
for key, value in self.types.items():
|
|
441
450
|
self.header.append(f"{key} {value}")
|
|
442
|
-
self.header.append(
|
|
443
|
-
f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}"
|
|
444
|
-
) # @todo HK not keeping track if num_qc changes
|
|
451
|
+
self.header.append(f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}")
|
|
445
452
|
self.header.append(f"num_obs: {num_obs:>10} max_num_obs: {num_obs:>10}")
|
|
446
453
|
stats_cols = [
|
|
447
454
|
"prior_bias",
|
|
@@ -594,7 +601,7 @@ class obs_sequence:
|
|
|
594
601
|
with open(file, "rb") as f:
|
|
595
602
|
while True:
|
|
596
603
|
# Read the record length
|
|
597
|
-
record_length =
|
|
604
|
+
record_length = ObsSequence.read_record_length(f)
|
|
598
605
|
if record_length is None:
|
|
599
606
|
break
|
|
600
607
|
record = f.read(record_length)
|
|
@@ -602,7 +609,7 @@ class obs_sequence:
                     break
 
                 # Read the trailing record length (should match the leading one)
-
+                ObsSequence.check_trailing_record_length(f, record_length)
 
                 linecount += 1
 
@@ -620,7 +627,7 @@ class obs_sequence:
         f.seek(0)
 
         for _ in range(2):
-            record_length =
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:
                 break
 
@@ -628,7 +635,7 @@ class obs_sequence:
             if not record:  # end of file
                 break
 
-
+            ObsSequence.check_trailing_record_length(f, record_length)
             header.append(record.decode("utf-8").strip())
 
         header.append(str(obs_types_definitions))
@@ -636,7 +643,7 @@ class obs_sequence:
         # obs_types_definitions
         for _ in range(3, 4 + obs_types_definitions):
             # Read the record length
-            record_length =
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:
                 break
 
@@ -645,7 +652,7 @@ class obs_sequence:
             if not record:  # end of file
                 break
 
-
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             if _ == 3:
                 continue  # num obs_types_definitions
@@ -663,7 +670,7 @@ class obs_sequence:
             5 + obs_types_definitions + num_copies + num_qcs + 1,
         ):
             # Read the record length
-            record_length =
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:
                 break
 
@@ -672,7 +679,7 @@ class obs_sequence:
             if not record:
                 break
 
-
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             if _ == 5 + obs_types_definitions:
                 continue
@@ -683,12 +690,12 @@ class obs_sequence:
 
         # first and last obs
         # Read the record length
-        record_length =
+        record_length = ObsSequence.read_record_length(f)
 
         # Read the actual record
         record = f.read(record_length)
 
-
+        ObsSequence.check_trailing_record_length(f, record_length)
 
         # Read the whole record as a two integers
         first, last = struct.unpack("ii", record)[:8]
@@ -700,7 +707,8 @@ class obs_sequence:
     def collect_obs_types(header):
         """Create a dictionary for the observation types in the obs_seq header"""
         num_obs_types = int(header[2])
-        types
+        # The first line containing obs types is the 4th line in an obs_seq file.
+        types = {int(x.split()[0]): x.split()[1] for x in header[3 : num_obs_types + 3]}
         return types
 
     @staticmethod
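Reviewer note: the new dict comprehension slices the id/name pairs straight out of the header. A standalone run with an illustrative header (only the indexing matters; the surrounding lines are placeholders):

.. code-block:: python

    header = [
        "obs_sequence",          # placeholder line 1
        "obs_kind_definitions",  # placeholder line 2
        "2",                     # number of obs type definitions
        "5 RADIOSONDE_TEMPERATURE",
        "23 ACARS_TEMPERATURE",
        "num_copies: 1 num_qc: 1",
    ]
    num_obs_types = int(header[2])
    types = {int(x.split()[0]): x.split()[1] for x in header[3 : num_obs_types + 3]}
    print(types)  # {5: 'RADIOSONDE_TEMPERATURE', 23: 'ACARS_TEMPERATURE'}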
@@ -813,7 +821,7 @@ class obs_sequence:
         # Skip the first len(obs_seq.header) lines
         for _ in range(header_length - 1):
             # Read the record length
-            record_length =
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:  # End of file
                 break
 
@@ -830,7 +838,7 @@ class obs_sequence:
             obs.append(f"OBS {obs_num}")
             for _ in range(n):  # number of copies
                 # Read the record length
-                record_length =
+                record_length = ObsSequence.read_record_length(f)
                 if record_length is None:
                     break
                 # Read the actual record (copie)
@@ -838,10 +846,10 @@ class obs_sequence:
                 obs.append(struct.unpack("d", record)[0])
 
                 # Read the trailing record length (should match the leading one)
-
+                ObsSequence.check_trailing_record_length(f, record_length)
 
             # linked list info
-            record_length =
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:
                 break
 
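Reviewer note: these reads follow Fortran unformatted sequential framing, where every record is sandwiched between two identical 4-byte length markers. The helpers' bodies are not part of this diff; the sketch below is an assumption consistent with how they are called (the inline struct.unpack removed in the next hunk confirms the mechanism):

.. code-block:: python

    import struct

    def read_record_length(f):
        """Read the leading 4-byte marker of a Fortran unformatted record; None at EOF."""
        record_length_bytes = f.read(4)
        if len(record_length_bytes) < 4:
            return None
        return struct.unpack("i", record_length_bytes)[0]

    def check_trailing_record_length(f, record_length):
        """Consume the trailing marker and verify it matches the leading one."""
        trailing = struct.unpack("i", f.read(4))[0]
        if trailing != record_length:
            raise ValueError("corrupt record: trailing length does not match leading length")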
@@ -850,63 +858,94 @@ class obs_sequence:
             linked_list_string = f"{int1:<12} {int2:<10} {int3:<12}"
             obs.append(linked_list_string)
 
-
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             # location (note no location header "loc3d" or "loc1d" for binary files)
             obs.append("loc3d")
-            record_length =
+            record_length = ObsSequence.read_record_length(f)
             record = f.read(record_length)
             x, y, z, vert = struct.unpack("dddi", record[:28])
             location_string = f"{x} {y} {z} {vert}"
             obs.append(location_string)
 
-
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             # kind (type of observation) value
             obs.append("kind")
-
-            record_length = struct.unpack("i", record_length_bytes)[0]
+            record_length = ObsSequence.read_record_length(f)
             record = f.read(record_length)
             kind = f"{struct.unpack('i', record)[0]}"
             obs.append(kind)
 
-
+            ObsSequence.check_trailing_record_length(f, record_length)
+
+            # Skip metadata (obs_def) and go directly to the time record
+            while True:
+                pos = f.tell()
+                record_length = ObsSequence.read_record_length(f)
+                if record_length is None:
+                    break  # End of file
+
+                record = f.read(record_length)
+                # Check if this record is likely the "time" record (8 bytes, can be unpacked as two ints)
+                if record_length == 8:
+                    try:
+                        seconds, days = struct.unpack("ii", record)
+                        # If unpack succeeds, this is the time record
+                        f.seek(pos)  # Seek back so the main loop can process it
+                        break
+                    except struct.error:
+                        pass  # Not the time record, keep skipping
+
+                ObsSequence.check_trailing_record_length(f, record_length)
 
             # time (seconds, days)
-            record_length =
+            record_length = ObsSequence.read_record_length(f)
             record = f.read(record_length)
-
+            try:  # This is incase the record is not the time record because of metadata funkyness
+                seconds, days = struct.unpack("ii", record)
+            except struct.error as e:
+                print(
+                    f"Reading observation {obs_num}... record length: {record_length} kind {kind}"
+                )
+                print(f"")
+                print(f"Error unpacking seconds and days: {e}")
+                raise
             time_string = f"{seconds} {days}"
             obs.append(time_string)
 
-
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             # obs error variance
-            record_length =
+            record_length = ObsSequence.read_record_length(f)
             record = f.read(record_length)
             obs.append(struct.unpack("d", record)[0])
 
-
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             yield obs
 
-    def composite_types(self, composite_types="use_default"):
+    def composite_types(self, composite_types="use_default", raise_on_duplicate=False):
         """
-        Set up and construct composite types for the DataFrame.
+        Set up and construct composite observation types for the DataFrame.
 
-        This function sets up composite types based on a provided YAML configuration or
+        This function sets up composite observation types based on a provided YAML configuration or
         a default configuration. It constructs new composite rows by combining specified
-        components and adds them to the DataFrame.
+        components and adds them to the DataFrame in place.
 
         Args:
             composite_types (str, optional): The YAML configuration for composite types.
-
+                If 'use_default', the default configuration is used. Otherwise, a custom YAML
+                configuration can be provided.
+            raise_on_duplicate (bool, optional): If True, raises an exception if there are
+                duplicates in the components. otherwise default False, deals with duplicates as though
+                they are distinct observations.
 
         Returns:
             pd.DataFrame: The updated DataFrame with the new composite rows added.
 
         Raises:
-            Exception: If there are repeat values in the components
+            Exception: If there are repeat values in the components and raise_on_duplicate = True
         """
 
         if composite_types == "use_default":
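Reviewer note: a usage sketch for the extended signature, following the docstring above (obs_seq is an ObsSequence as constructed earlier; the default definitions ship in composite_types.yaml):

.. code-block:: python

    df_with_composites = obs_seq.composite_types()    # default YAML composite definitions
    obs_seq.composite_types(raise_on_duplicate=True)  # raise instead of numbering duplicates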
@@ -932,7 +971,10 @@ class obs_sequence:
         df = pd.DataFrame()
         for key in self.composite_types_dict:
             df_new = construct_composit(
-                df_comp, key, self.composite_types_dict[key]["components"]
+                df_comp,
+                key,
+                self.composite_types_dict[key]["components"],
+                raise_on_duplicate,
             )
             df = pd.concat([df, df_new], axis=0)
 
@@ -945,29 +987,29 @@ class obs_sequence:
         """
         Join a list of observation sequences together.
 
-        This method combines the headers and observations from a list of obs_sequence objects
-        into a single obs_sequence object.
+        This method combines the headers and observations from a list of ObsSequence objects
+        into a single ObsSequence object.
 
         Args:
-            obs_sequences (list of
+            obs_sequences (list of ObsSequences): The list of observation sequences objects to join.
             copies (list of str, optional): A list of copy names to include in the combined data.
                 If not provided, all copies are included.
 
         Returns:
-            A new obs_sequence object containing the combined data.
+            A new ObsSequence object containing the combined data.
 
         Example:
             .. code-block:: python
 
-                obs_seq1 = obs_sequence(file='obs_seq1.final')
-                obs_seq2 = obs_sequence(file='obs_seq2.final')
-                obs_seq3 = obs_sequence(file='obs_seq3.final')
-                combined = obs_sequence.join([obs_seq1, obs_seq2, obs_seq3])
+                obs_seq1 = ObsSequence(file='obs_seq1.final')
+                obs_seq2 = ObsSequence(file='obs_seq2.final')
+                obs_seq3 = ObsSequence(file='obs_seq3.final')
+                combined = ObsSequence.join([obs_seq1, obs_seq2, obs_seq3])
         """
         if not obs_sequences:
             raise ValueError("The list of observation sequences is empty.")
 
-        # Create a new obs_sequence object with the combined data
+        # Create a new ObsSequence object with the combined data
         combo = cls(file=None)
 
         # Check if all obs_sequences have compatible attributes
@@ -1053,53 +1095,49 @@ class obs_sequence:
                 if item in obs_sequences[0].qc_copie_names
             ]
 
-            combo.n_copies = len(combo.copie_names)
-            combo.n_qc = len(combo.qc_copie_names)
-            combo.n_non_qc = len(combo.non_qc_copie_names)
-
         else:
             for obs_seq in obs_sequences:
                 if not obs_sequences[0].df.columns.isin(obs_seq.df.columns).all():
                     raise ValueError(
                         "All observation sequences must have the same copies."
                     )
-            combo.n_copies = obs_sequences[0].n_copies
-            combo.n_qc = obs_sequences[0].n_qc
-            combo.n_non_qc = obs_sequences[0].n_non_qc
             combo.copie_names = obs_sequences[0].copie_names
+            combo.non_qc_copie_names = obs_sequences[0].non_qc_copie_names
+            combo.qc_copie_names = obs_sequences[0].qc_copie_names
+            combo.n_copies = len(combo.copie_names)
 
         # todo HK @todo combine synonyms for obs?
 
         # Initialize combined data
-
-        combined_df = pd.DataFrame()
-        combo.all_obs = None  # set to none to force writing from the dataframe if write_obs_seq is called
+        combo.df = pd.DataFrame()
 
         # Iterate over the list of observation sequences and combine their data
         for obs_seq in obs_sequences:
             if copies:
-                combined_df = pd.concat(
-                    [combined_df, obs_seq.df[requested_columns]], ignore_index=True
+                combo.df = pd.concat(
+                    [combo.df, obs_seq.df[requested_columns]], ignore_index=True
                 )
             else:
-                combined_df = pd.concat([combined_df, obs_seq.df], ignore_index=True)
-
-
-
-        keys = set(combined_types)
-        combo.reverse_types = {item: i + 1 for i, item in enumerate(keys)}
-        combo.types = {v: k for k, v in combo.reverse_types.items()}
-
-        # create linked list for obs
-        combo.df = combined_df.sort_values(by="time").reset_index(drop=True)
-        combo.df["linked_list"] = obs_sequence.generate_linked_list_pattern(
-            len(combo.df)
-        )
-        combo.df["obs_num"] = combined_df.index + 1
-        combo.create_header(len(combo.df))
+                combo.df = pd.concat([combo.df, obs_seq.df], ignore_index=True)
+
+        # update ObsSequence attributes from the combined DataFrame
+        combo.update_attributes_from_df()
 
         return combo
 
+    @staticmethod
+    def update_linked_list(df):
+        """
+        Sorts the DataFrame by 'time', resets the index, and adds/updates 'linked_list'
+        and 'obs_num' columns in place.
+        Modifies the input DataFrame directly.
+        """
+        df.sort_values(by="time", inplace=True, kind="stable")
+        df.reset_index(drop=True, inplace=True)
+        df["linked_list"] = ObsSequence.generate_linked_list_pattern(len(df))
+        df["obs_num"] = df.index + 1
+        return None
+
     def has_assimilation_info(self):
         """
         Check if the DataFrame has prior information.
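Reviewer note: a sketch of the new join flow — attributes are now re-derived from the combined DataFrame instead of being stitched together by hand. File names are placeholders:

.. code-block:: python

    from pydartdiags.obs_sequence.obs_sequence import ObsSequence

    seqs = [ObsSequence(f) for f in ("obs_seq1.final", "obs_seq2.final")]
    combined = ObsSequence.join(seqs)                        # every copy from both files
    subset = ObsSequence.join(seqs, copies=["observation"])  # restrict to named copies
    # combined.all_obs is None, so any later write is driven by combined.df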
@@ -1125,7 +1163,7 @@ class obs_sequence:
         )
 
     def create_header(self, n):
-        """Create a header for the obs_seq file from the obs_sequence object."""
+        """Create a header for the obs_seq file from the ObsSequence object."""
         assert (
             self.n_copies == self.n_non_qc + self.n_qc
         ), "n_copies must be equal to n_non_qc + n_qc"
@@ -1142,6 +1180,100 @@ class obs_sequence:
             self.header.append(copie)
         self.header.append(f"first: 1 last: {n}")
 
+    @staticmethod
+    def replace_qc2_nan(df):
+        """
+        Replace MISSING_R8 values with NaNs in posterior columns for observations where
+        DART_quality_control = 2 (posterior forward observation operators failed)
+
+        This causes these observations to be ignored in the calculations of posterior statistics
+        """
+        df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_mean"] = np.nan
+        df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_spread"] = np.nan
+        num_post_members = len(
+            df.columns[df.columns.str.startswith("posterior_ensemble_member_")]
+        )
+        for i in range(1, num_post_members + 1):
+            df.loc[
+                df["DART_quality_control"] == 2.0,
+                "posterior_ensemble_member_" + str(i),
+            ] = np.nan
+
+    @staticmethod
+    def revert_qc2_nan(df):
+        """
+        Revert NaNs back to MISSING_R8s for observations where DART_quality_control = 2
+        (posterior forward observation operators failed)
+        """
+        df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_mean"] = (
+            -888888.000000
+        )
+        df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_spread"] = (
+            -888888.000000
+        )
+        num_post_members = len(
+            df.columns[df.columns.str.startswith("posterior_ensemble_member_")]
+        )
+        for i in range(1, num_post_members + 1):
+            df.loc[
+                df["DART_quality_control"] == 2.0, "posterior_ensemble_member_" + str(i)
+            ] = -888888.000000
+
+    def update_attributes_from_df(self):
+        """
+        Update all internal data (fields/properties) of the ObsSequence object that
+        depend on the DataFrame (self.df).
+        Call this after self.df is replaced or its structure changes.
+
+        Important:
+
+        Assumes copies are all columns between 'obs_num' and 'linked_list' (if present)
+
+        """
+        # Update columns
+        self.columns = list(self.df.columns)
+
+        # Update all_obs (list of lists, each row) @todo HK do we need this?
+        self.all_obs = None
+
+        # Update copie_names, non_qc_copie_names, qc_copie_names, n_copies, n_non_qc, n_qc
+        # Try to infer from columns if possible, else leave as is
+        # Assume copies are all columns between 'obs_num' and 'linked_list' (if present)
+        if "obs_num" in self.df.columns and "linked_list" in self.df.columns:
+            obs_num_idx = self.df.columns.get_loc("obs_num")
+            linked_list_idx = self.df.columns.get_loc("linked_list")
+            self.copie_names = list(self.df.columns[obs_num_idx + 1 : linked_list_idx])
+        else:
+            # Fallback: use previous value or empty
+            self.copie_names = getattr(self, "copie_names", [])
+        self.n_copies = len(self.copie_names)
+
+        # Try to infer non_qc and qc copies from previous names if possible
+        # Find qc copies first
+        self.qc_copie_names = [c for c in self.copie_names if c in self.qc_copie_names]
+        if self.qc_copie_names == []:  # If no qc copies found, assume all are non-qc
+            self.non_qc_copie_names = self.copie_names
+        else:  # pull out non-qc copies from the copie_names
+            self.non_qc_copie_names = [
+                c for c in self.copie_names if c not in self.qc_copie_names
+            ]
+        self.n_qc = len(self.qc_copie_names)
+        self.n_non_qc = len(self.non_qc_copie_names)
+
+        # Update header and types and reverse_types
+        self.create_header_from_dataframe()
+
+        # Update seq (generator should be empty or None if not from file)
+        self.seq = []
+        # Update loc_mod
+        if "vertical" in self.df.columns:
+            self.loc_mod = "loc3d"
+        else:
+            self.loc_mod = "loc1d"
+
+        # update linked list for obs and obs_nums
+        ObsSequence.update_linked_list(self.df)
+
 
 def load_yaml_to_dict(file_path):
     """
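Reviewer note: update_attributes_from_df becomes the single re-sync point after any structural change to df. A hypothetical workflow that relies on it:

.. code-block:: python

    # keep one observation type, then re-derive copies, header, types, obs_num and linked_list
    obs_seq.df = obs_seq.df[obs_seq.df["type"] == "ACARS_TEMPERATURE"].copy()
    obs_seq.update_attributes_from_df()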
@@ -1172,24 +1304,31 @@ def convert_dart_time(seconds, days):
     return time
 
 
-def construct_composit(df_comp, composite, components):
+def construct_composit(df_comp, composite, components, raise_on_duplicate):
     """
-
-
-
-
-    specified columns using the square root of the sum of squares method.
+    Creates a new DataFrame by combining pairs of rows from two specified component
+    types in an observation DataFrame. It matches rows based on location and time,
+    and then combines certain columns using the square root of the sum of squares
+    of the components.
 
     Args:
         df_comp (pd.DataFrame): The DataFrame containing the component rows to be combined.
         composite (str): The type name for the new composite rows.
         components (list of str): A list containing the type names of the two components to be combined.
+        raise_on_duplicate (bool): If False, raises an exception if there are duplicates in the components.
+            otherwise deals with duplicates as though they are distinct observations.
+
 
     Returns:
         merged_df (pd.DataFrame): A DataFrame containing the new composite rows.
     """
+    # select rows for the two components
+    if len(components) != 2:
+        raise ValueError("components must be a list of two component types.")
     selected_rows = df_comp[df_comp["type"] == components[0].upper()]
     selected_rows_v = df_comp[df_comp["type"] == components[1].upper()]
+    selected_rows = selected_rows.copy()
+    selected_rows_v = selected_rows_v.copy()
 
     prior_columns_to_combine = df_comp.filter(regex="prior_ensemble").columns.tolist()
     posterior_columns_to_combine = df_comp.filter(
@@ -1200,7 +1339,7 @@ def construct_composit(df_comp, composite, components):
         + posterior_columns_to_combine
         + ["observation", "obs_err_var"]
     )
-    merge_columns = ["latitude", "longitude", "vertical", "time"]
+    merge_columns = ["latitude", "longitude", "vertical", "time"]  # @todo HK 1d or 3d
     same_obs_columns = merge_columns + [
         "observation",
         "obs_err_var",
@@ -1210,15 +1349,25 @@ def construct_composit(df_comp, composite, components):
         selected_rows[same_obs_columns].duplicated().sum() > 0
         or selected_rows_v[same_obs_columns].duplicated().sum() > 0
     ):
-
-
-
-
-
-        f"{
-
-
-
+
+        if raise_on_duplicate:
+            print(
+                f"{selected_rows[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
+            )
+            print(f"{selected_rows[same_obs_columns]}")
+            print(
+                f"{selected_rows_v[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
+            )
+            print(f"{selected_rows_v[same_obs_columns]}")
+            raise Exception("There are duplicates in the components.")
+
+        else:
+            selected_rows["dup_num"] = selected_rows.groupby(
+                same_obs_columns
+            ).cumcount()
+            selected_rows_v["dup_num"] = selected_rows_v.groupby(
+                same_obs_columns
+            ).cumcount()
 
     # Merge the two DataFrames on location and time columns
     merged_df = pd.merge(
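Reviewer note: cumcount gives each duplicate within a group a running index, so merging on the key columns plus dup_num pairs duplicates one-to-one instead of producing a cross product. Minimal demonstration:

.. code-block:: python

    import pandas as pd

    u = pd.DataFrame({"latitude": [10.0, 10.0], "longitude": [0.0, 0.0]})
    u["dup_num"] = u.groupby(["latitude", "longitude"]).cumcount()
    print(u["dup_num"].tolist())  # [0, 1] -- duplicate rows become distinct merge keys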
@@ -1235,4 +1384,7 @@ def construct_composit(df_comp, composite, components):
         columns=[col for col in merged_df.columns if col.endswith("_v")]
     )
 
+    if "dup_num" in merged_df.columns:
+        merged_df = merged_df.drop(columns=["dup_num"])
+
     return merged_df
pydartdiags/stats/stats.py CHANGED

@@ -4,8 +4,6 @@ import numpy as np
 from functools import wraps
 from datetime import datetime, timedelta
 
-# from pydartdiags.obs_sequence import obs_sequence as obsq
-
 
 def apply_to_phases_in_place(func):
     """
@@ -93,6 +91,12 @@ def calculate_rank(df, phase):
     """
     Calculate the rank of observations within an ensemble.
 
+    Note:
+
+        This function is decorated with @apply_to_phases_by_obs, which modifies its usage.
+        You should call it as calculate_rank(df), and the decorator will automatically apply the
+        function to all relevant phases (‘prior’ and ‘posterior’).
+
     This function takes a DataFrame containing ensemble predictions and observed values,
     adds sampling noise to the ensemble predictions, and calculates the rank of the observed
     value within the perturbed ensemble for each observation. The rank indicates the position
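Reviewer note: the phase decorators are the organizing idea of this module — wrapped functions keep a phase parameter that callers never pass. Their bodies are not shown in this diff; the sketch below is an assumption about the in-place variant, dispatching on which phase columns exist:

.. code-block:: python

    from functools import wraps

    def apply_to_phases_in_place(func):
        """Sketch only: call func(df, phase) for each phase present in the DataFrame."""
        @wraps(func)
        def wrapper(df, *args, **kwargs):
            for phase in ("prior", "posterior"):
                if any(col.startswith(phase) for col in df.columns):
                    func(df, phase, *args, **kwargs)
        return wrapper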
@@ -103,8 +107,6 @@ def calculate_rank(df, phase):
     Parameters:
         df (pd.DataFrame): A DataFrame with columns for rank, and observation type.
 
-        phase (str): The phase for which to calculate the statistics ('prior' or 'posterior')
-
     Returns:
         DataFrame containing columns for 'rank' and observation 'type'.
     """
@@ -158,15 +160,20 @@ def diag_stats(df, phase):
     """
     Calculate diagnostic statistics for a given phase and add them to the DataFrame.
 
+    Note:
+        This function is decorated with @apply_to_phases_in_place, which modifies its usage.
+        You should call it as diag_stats(df), and the decorator will automatically apply the
+        function to all relevant phases (‘prior’ and ‘posterior’) modifying the DataFrame
+        in place.
+
     Args:
         df (pandas.DataFrame): The input DataFrame containing observation data and ensemble statistics.
-
-            - 'observation': The actual observation values.
-            - 'obs_err_var': The variance of the observation error.
-            - 'prior_ensemble_mean' and/or 'posterior_ensemble_mean': The mean of the ensemble.
-            - 'prior_ensemble_spread' and/or 'posterior_ensemble_spread': The spread of the ensemble.
+            The DataFrame must include the following columns:
 
-
+            - 'observation': The actual observation values.
+            - 'obs_err_var': The variance of the observation error.
+            - 'prior_ensemble_mean' and/or 'posterior_ensemble_mean': The mean of the ensemble.
+            - 'prior_ensemble_spread' and/or 'posterior_ensemble_spread': The spread of the ensemble.
 
     Returns:
         None: The function modifies the DataFrame in place by adding the following columns:
@@ -203,9 +210,12 @@ def bin_by_layer(df, levels, verticalUnit="pressure (Pa)"):
     vertical level bin. Only observations (row) with the specified vertical unit are binned.
 
     Args:
-        df (pandas.DataFrame): The input DataFrame containing observation data.
+        df (pandas.DataFrame): The input DataFrame containing observation data.
+            The DataFrame must include the following columns:
+
             - 'vertical': The vertical coordinate values of the observations.
             - 'vert_unit': The unit of the vertical coordinate values.
+
         levels (list): A list of bin edges for the vertical levels.
         verticalUnit (str, optional): The unit of the vertical axis (e.g., 'pressure (Pa)'). Default is 'pressure (Pa)'.
 
@@ -261,6 +271,28 @@ def bin_by_time(df, time_value):
 
 @apply_to_phases_by_type_return_df
 def grand_statistics(df, phase):
+    """
+    Calculate grand statistics (RMSE, bias, total spread) for each observation type and phase.
+
+    This function assumes that diagnostic statistics (such as squared error, bias, and total variance)
+    have already been computed by :func:`diag_stats` and are present in the DataFrame. It groups the data by observation
+    type and computes the root mean square error (RMSE), mean bias, and total spread for the specified phase.
+
+    Note:
+        This function is decorated with @apply_to_phases_by_type_return_df, which modifies its usage
+        You should call it as grand_statistics(df), and the decorator will automatically apply the function
+        to all relevant phases ('prior' and 'posterior') and return a merged DataFrame.
+
+    Args:
+        df (pandas.DataFrame): The input DataFrame containing diagnostic statistics for observations.
+
+    Returns:
+        pandas.DataFrame: A DataFrame with columns:
+            - 'type': The observation type.
+            - '{phase}_rmse': The root mean square error for the phase.
+            - '{phase}_bias': The mean bias for the phase.
+            - '{phase}_totalspread': The total spread for the phase.
+    """
 
     # assuming diag_stats has been called
     grand = (
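Reviewer note: the intended pipeline, per the docstrings above — compute per-observation diagnostics in place, then aggregate (obs_seq is an ObsSequence from the obs_sequence module):

.. code-block:: python

    from pydartdiags.stats import stats

    stats.diag_stats(obs_seq.df)                # adds the diagnostic columns in place
    grand = stats.grand_statistics(obs_seq.df)  # per-type RMSE, bias and total spread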
@@ -283,6 +315,33 @@ def grand_statistics(df, phase):
 
 @apply_to_phases_by_type_return_df
 def layer_statistics(df, phase):
+    """
+    Calculate statistics (RMSE, bias, total spread) for each observation type and vertical layer.
+
+    This function assumes that diagnostic statistics (such as squared error, bias, and total variance)
+    have already been computed with :func:`diag_stats` and are present in the DataFrame. It groups the data by
+    vertical layer midpoint and observation type, and computes the root mean square error (RMSE),
+    mean bias, and total spread for the specified phase for each vertical layer.
+
+    Note:
+        This function is decorated with @apply_to_phases_by_type_return_df, which modifies its usage
+        You should call it as layer_statistics(df), and the decorator will automatically apply the function
+        to all relevant phases ('prior' and 'posterior') and return a merged DataFrame.
+
+    Args:
+        df (pandas.DataFrame): The input DataFrame containing diagnostic statistics for observations.
+        phase (str): The phase for which to calculate the statistics ('prior' or 'posterior').
+
+    Returns:
+        pandas.DataFrame: A DataFrame with columns:
+            - 'midpoint': The midpoint of the vertical layer.
+            - 'type': The observation type.
+            - '{phase}_rmse': The root mean square error for the phase.
+            - '{phase}_bias': The mean bias for the phase.
+            - '{phase}_totalspread': The total spread for the phase.
+            - 'vert_unit': The vertical unit.
+            - 'vlevels': The categorized vertical level.
+    """
 
     # assuming diag_stats has been called
     layer_stats = (
@@ -310,14 +369,31 @@ def layer_statistics(df, phase):
 @apply_to_phases_by_type_return_df
 def time_statistics(df, phase):
     """
-    Calculate time-based statistics for
+    Calculate time-based statistics (RMSE, bias, total spread) for each observation type and time bin.
+
+    This function assumes that diagnostic statistics (such as squared error, bias, and total variance)
+    have already been computed by :func:`diag_stats` and are present in the DataFrame. It groups the data
+    by time bin midpoint and observation type, and computes the root mean square error (RMSE), mean bias,
+    and total spread for the specified phase for each time bin.
+
+    Note:
+        This function is decorated with @apply_to_phases_by_type_return_df.
+        You should call it as time_statistics(df), and the decorator will automatically apply the function
+        to all relevant phases ('prior' and 'posterior') and return a merged DataFrame.
 
     Args:
-        df (pandas.DataFrame): The input DataFrame containing
+        df (pandas.DataFrame): The input DataFrame containing diagnostic statistics for observations.
         phase (str): The phase for which to calculate the statistics ('prior' or 'posterior').
 
     Returns:
-        pandas.DataFrame: A DataFrame
+        pandas.DataFrame: A DataFrame with columns:
+            - 'time_bin_midpoint': The midpoint of the time bin.
+            - 'type': The observation type.
+            - '{phase}_rmse': The root mean square error for the phase.
+            - '{phase}_bias': The mean bias for the phase.
+            - '{phase}_totalspread': The total spread for the phase.
+            - 'time_bin': The time bin interval.
+            - 'time': The first time value in the bin.
     """
     # Assuming diag_stats has been called
     time_stats = (
@@ -402,7 +478,9 @@ def possible_vs_used_by_time(df):
     Calculates the count of possible vs. used observations by type and time bin.
 
     Args:
-        df (pd.DataFrame): The input DataFrame containing observation data.
+        df (pd.DataFrame): The input DataFrame containing observation data.
+            The DataFrame must include:
+
         - 'type': The observation type.
         - 'time_bin_midpoint': The midpoint of the time bin.
         - 'observation': The observation values.
{pydartdiags-0.5.1.dist-info → pydartdiags-0.6.1.dist-info}/METADATA CHANGED

@@ -1,15 +1,15 @@
 Metadata-Version: 2.4
 Name: pydartdiags
-Version: 0.5.1
+Version: 0.6.1
 Summary: Observation Sequence Diagnostics for DART
 Home-page: https://github.com/NCAR/pyDARTdiags.git
 Author: Helen Kershaw
 Author-email: Helen Kershaw <hkershaw@ucar.edu>
+License-Expression: Apache-2.0
 Project-URL: Homepage, https://github.com/NCAR/pyDARTdiags.git
 Project-URL: Issues, https://github.com/NCAR/pyDARTdiags/issues
 Project-URL: Documentation, https://ncar.github.io/pyDARTdiags
 Classifier: Programming Language :: Python :: 3
-Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
{pydartdiags-0.5.1.dist-info → pydartdiags-0.6.1.dist-info}/RECORD CHANGED

@@ -3,13 +3,13 @@ pydartdiags/matplots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
 pydartdiags/matplots/matplots.py,sha256=Bo0TTz1gvsHEvTfTfLfdTi_3hNRN1okmyY5a5yYgtzk,13455
 pydartdiags/obs_sequence/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pydartdiags/obs_sequence/composite_types.yaml,sha256=PVLMU6x6KcVMCwPB-U65C_e0YQUemfqUhYMpf1DhFOY,917
-pydartdiags/obs_sequence/obs_sequence.py,sha256=
+pydartdiags/obs_sequence/obs_sequence.py,sha256=szxASzecTcJzP2rEqssRo9VHw26nwpZ7W9Yi6sTbbHI,55112
 pydartdiags/plots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pydartdiags/plots/plots.py,sha256=U7WQjE_qN-5a8-85D-PkkgILSFBzTJQ1mcGBa7l5DHI,6464
 pydartdiags/stats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/stats/stats.py,sha256=
-pydartdiags-0.5.1.dist-info/licenses/LICENSE,sha256=ROglds_Eg_ylXp-1MHmEawDqMw_UsCB4r9sk7z9PU9M,11377
-pydartdiags-0.5.1.dist-info/METADATA,sha256=
-pydartdiags-0.5.1.dist-info/WHEEL,sha256=
-pydartdiags-0.5.1.dist-info/top_level.txt,sha256=LfMoPLnSd0VhhlWev1eeX9t6AzvyASOloag0LO_ppWg,12
-pydartdiags-0.5.1.dist-info/RECORD,,
+pydartdiags/stats/stats.py,sha256=a88VuLoHOlhbjYjnrVPHVNnhiDx-4B3YA1jbc6FUSyU,20193
+pydartdiags-0.6.1.dist-info/licenses/LICENSE,sha256=ROglds_Eg_ylXp-1MHmEawDqMw_UsCB4r9sk7z9PU9M,11377
+pydartdiags-0.6.1.dist-info/METADATA,sha256=AeuLMziCQas1vggEwAKD6CEfdadxwoSDWEu-Fgwaix0,2381
+pydartdiags-0.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+pydartdiags-0.6.1.dist-info/top_level.txt,sha256=LfMoPLnSd0VhhlWev1eeX9t6AzvyASOloag0LO_ppWg,12
+pydartdiags-0.6.1.dist-info/RECORD,,

{pydartdiags-0.5.1.dist-info → pydartdiags-0.6.1.dist-info}/licenses/LICENSE
File without changes

{pydartdiags-0.5.1.dist-info → pydartdiags-0.6.1.dist-info}/top_level.txt
File without changes