pydartdiags 0.0.42__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pydartdiags might be problematic.

@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 import pandas as pd
 import datetime as dt
 import numpy as np
@@ -5,75 +6,163 @@ import os
 import yaml
 import struct
 
+
+def requires_assimilation_info(func):
+    def wrapper(self, *args, **kwargs):
+        if self.has_assimilation_info:
+            return func(self, *args, **kwargs)
+        else:
+            raise ValueError(
+                "Assimilation information is required to call this function."
+            )
+
+    return wrapper
+
+
+def requires_posterior_info(func):
+    def wrapper(self, *args, **kwargs):
+        if self.has_posterior:
+            return func(self, *args, **kwargs)
+        else:
+            raise ValueError("Posterior information is required to call this function.")
+
+    return wrapper
+
+
 class obs_sequence:
-    """Create an obs_sequence object from an ascii observation sequence file.
+    """
+    Initialize an obs_sequence object from an ASCII or binary observation sequence file,
+    or create an empty obs_sequence object from scratch.
+
+    Args:
+        file (str): The input observation sequence ASCII or binary file.
+            If None, an empty obs_sequence object is created from scratch.
+
+    Returns:
+        An obs_sequence object
 
     Attributes:
-        df (pandas.DataFrame): DataFrame containing all the observations.
-        all_obs (list): List of all observations, each observation is a list.
-        header (str): Header from the ascii file.
-        vert (dict): Dictionary of dart vertical units.
-        types (dict): Dictionary of types in the observation sequence file.
-        copie_names (list): Names of copies in the observation sequence file.
-            Spelled 'copie' to avoid conflict with the Python built-in copy function.
+        df (pandas.DataFrame): The DataFrame containing the observation sequence data.
+        header (list): The header of the observation sequence.
+        copie_names (list): The names of the copies in the observation sequence.
+            Spelled 'copie' to avoid conflict with the Python built-in 'copy'.
            Spaces are replaced with underscores in copie_names.
-
-    Parameters:
-        file : the input observation sequence ascii file
-
-    Example:
-        Read the observation sequence from file:
-        ``obs_seq = obs_sequence('/home/data/obs_seq.final.ascii.small')``
-        Access the resulting pandas DataFrame:
-        ``obs_seq.df``
-
-    For 3D sphere models: latitude and longitude are in degrees in the DataFrame
-
-    Calculations:
-
-    - sq_err = (mean-obs)**2
-    - bias = (mean-obs)
-    - rmse = sqrt( sum((mean-obs)**2)/n )
-    - bias = sum((mean-obs)/n)
-    - spread = sum(sd)
-    - totalspread = sqrt(sum(sd+obs_err_var))
-
+        non_qc_copie_names (list): The names of the copies not including quality control,
+            e.g. observation, mean, ensemble_members
+        qc_copie_names (list): The names of the quality control copies, e.g. DART_QC
+        n_copies (int): The total number of copies in the observation sequence.
+        n_non_qc (int): The number of copies not including quality control.
+        n_qc (int): The number of quality control copies.
+        vert (dict): A dictionary mapping DART vertical coordinate types to their
+            corresponding integer values.
+
+            - undefined: 'VERTISUNDEF'
+            - surface: 'VERTISSURFACE' (value is surface elevation in meters)
+            - model level: 'VERTISLEVEL'
+            - pressure: 'VERTISPRESSURE' (in Pascals)
+            - height: 'VERTISHEIGHT' (in meters)
+            - scale height: 'VERTISSCALEHEIGHT' (unitless)
+        loc_mod (str): The location model, either 'loc3d' or 'loc1d'.
+            For 3D sphere models: latitude and longitude are in degrees in the DataFrame.
+        types (dict): Dictionary of types of observations in the observation sequence,
+            e.g. {23: 'ACARS_TEMPERATURE'}
+        reverse_types (dict): Dictionary of types with keys and values reversed, e.g.
+            {'ACARS_TEMPERATURE': 23}
+        synonyms_for_obs (list): List of synonyms for the observation column in the DataFrame.
+            The default list is
+
+            .. code-block:: python
+
+                ['NCEP BUFR observation',
+                 'AIRS observation',
+                 'GTSPP observation',
+                 'SST observation',
+                 'observations',
+                 'WOD observation']
+
+            You can add more synonyms by providing a list of strings when
+            creating the obs_sequence object.
+
+            .. code-block:: python
+
+                obs_sequence(file, synonyms=['synonym1', 'synonym2']).df
+
+        has_assimilation_info (bool): Indicates if assimilation information is present.
+        has_posterior (bool): Indicates if posterior information is present.
+        seq (generator): Generator of observations from the observation sequence file.
+        all_obs (list): List of all observations, each observation is a list.
+            Valid when the obs_sequence is created from a file.
+            Set to None when the obs_sequence is created from scratch or multiple
+            obs_sequences are joined.
     """
-    ## static variables
-    # vertrical coordinate:
-    # undefined 'VERTISUNDEF'
-    # surface 'VERTISSURFACE' (value is surface elevation in m)
-    # model level 'VERTISLEVEL'
-    # pressure 'VERTISPRESSURE' (in pascals)
-    # height 'VERTISHEIGHT' (in meters)
-    # scale height 'VERTISSCALEHEIGHT' (unitless)
-    vert = {-2: 'undefined',
-            -1: 'surface (m)',
-            1: 'model level',
-            2: 'pressure (Pa)',
-            3: 'height (m)',
-            4: 'scale height' }
+
+    vert = {
+        -2: "undefined",
+        -1: "surface (m)",
+        1: "model level",
+        2: "pressure (Pa)",
+        3: "height (m)",
+        4: "scale height",
+    }
 
     reversed_vert = {value: key for key, value in vert.items()}
 
-
     def __init__(self, file, synonyms=None):
-        self.loc_mod = 'None'
+        """
+        Create an obs_sequence object from an ASCII or binary observation sequence file,
+        or create an empty obs_sequence object from scratch.
+
+        Args:
+            file (str): The input observation sequence ASCII or binary file.
+                If None, an empty obs_sequence object is created from scratch.
+            synonyms (list, optional): List of synonyms for the observation column in the DataFrame.
+
+        Returns:
+            an obs_sequence object
+
+        Examples:
+
+        .. code-block:: python
+
+            obs_seq = obs_sequence(file='obs_seq.final')
+
+        """
+
+        self.loc_mod = "None"
+        self.has_assimilation_info = False
+        self.has_posterior = False
         self.file = file
-        self.synonyms_for_obs = ['NCEP BUFR observation',
-                                 'AIRS observation',
-                                 'GTSPP observation',
-                                 'SST observation',
-                                 'observations',
-                                 'WOD observation']
+        self.synonyms_for_obs = [
+            "NCEP BUFR observation",
+            "AIRS observation",
+            "GTSPP observation",
+            "SST observation",
+            "observations",
+            "WOD observation",
+        ]
         if synonyms:
             if isinstance(synonyms, list):
                 self.synonyms_for_obs.extend(synonyms)
             else:
                 self.synonyms_for_obs.append(synonyms)
 
+        if file is None:
+            # Early exit - for testing purposes or creating obs_seq objects from scratch
+            self.df = pd.DataFrame()
+            self.types = {}
+            self.reverse_types = {}
+            self.copie_names = []
+            self.non_qc_copie_names = []
+            self.qc_copie_names = []
+            self.n_copies = 0  # copies including qc
+            self.n_non_qc = 0  # copies not including qc
+            self.n_qc = 0  # number of qc copies
+            self.seq = []
+            self.all_obs = []
+            return
+
         module_dir = os.path.dirname(__file__)
-        self.default_composite_types = os.path.join(module_dir,"composite_types.yaml")
+        self.default_composite_types = os.path.join(module_dir, "composite_types.yaml")
 
         if self.is_binary(file):
             self.header = self.read_binary_header(file)
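For orientation, a minimal usage sketch of the new constructor behavior in this release; the file name and the extra synonym are hypothetical:

    # Read an existing sequence, adding a custom synonym for the observation column
    obs_seq = obs_sequence(file='obs_seq.final', synonyms=['radar observation'])
    print(obs_seq.df.head())

    # New in 0.5.0: file=None returns an empty object to be populated from scratch
    empty_seq = obs_sequence(file=None)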
@@ -83,35 +172,47 @@ class obs_sequence:
         self.types = self.collect_obs_types(self.header)
         self.reverse_types = {v: k for k, v in self.types.items()}
         self.copie_names, self.n_copies = self.collect_copie_names(self.header)
+        self.n_non_qc, self.n_qc = self.num_qc_non_qc(self.header)
+        self.non_qc_copie_names = self.copie_names[: self.n_non_qc]
+        self.qc_copie_names = self.copie_names[self.n_non_qc :]
 
         if self.is_binary(file):
             self.seq = self.obs_binary_reader(file, self.n_copies)
-            self.loc_mod = 'loc3d' # only loc3d supported for binary, & no way to check
+            self.loc_mod = "loc3d"  # only loc3d supported for binary, & no way to check
         else:
             self.seq = self.obs_reader(file, self.n_copies)
 
-        self.all_obs = self.create_all_obs() # uses up the generator
+        self.all_obs = self.create_all_obs()  # uses up the generator
         # at this point you know if the seq is loc3d or loc1d
-        if self.loc_mod == 'None':
-            raise ValueError("Neither 'loc3d' nor 'loc1d' could be found in the observation sequence.")
+        if self.loc_mod == "None":
+            raise ValueError(
+                "Neither 'loc3d' nor 'loc1d' could be found in the observation sequence."
+            )
         self.columns = self.column_headers()
-        self.df = pd.DataFrame(self.all_obs, columns = self.columns)
-        if self.loc_mod == 'loc3d':
-            self.df['longitude'] = np.rad2deg(self.df['longitude'])
-            self.df['latitude'] = np.rad2deg(self.df['latitude'])
+        self.df = pd.DataFrame(self.all_obs, columns=self.columns)
+        if self.loc_mod == "loc3d":
+            self.df["longitude"] = np.rad2deg(self.df["longitude"])
+            self.df["latitude"] = np.rad2deg(self.df["latitude"])
         # rename 'X observation' to observation
-        self.synonyms_for_obs = [synonym.replace(' ', '_') for synonym in self.synonyms_for_obs]
-        rename_dict = {old: 'observation' for old in self.synonyms_for_obs if old in self.df.columns}
+        self.synonyms_for_obs = [
+            synonym.replace(" ", "_") for synonym in self.synonyms_for_obs
+        ]
+        rename_dict = {
+            old: "observation"
+            for old in self.synonyms_for_obs
+            if old in self.df.columns
+        }
         self.df = self.df.rename(columns=rename_dict)
-        # calculate bias and sq_err is the obs_seq is an obs_seq.final
-        if 'prior_ensemble_mean'.casefold() in map(str.casefold, self.columns):
-            self.df['bias'] = (self.df['prior_ensemble_mean'] - self.df['observation'])
-            self.df['sq_err'] = self.df['bias']**2 # squared error
-
+
+        # check if the assimilation info is present
+        if "prior_ensemble_mean".casefold() in map(str.casefold, self.columns):
+            self.has_assimilation_info = True
+        if "posterior_ensemble_mean".casefold() in map(str.casefold, self.columns):
+            self.has_posterior = True
 
     def create_all_obs(self):
-        """ steps through the generator to create a
-        list of all observations in the sequence
+        """steps through the generator to create a
+        list of all observations in the sequence
         """
         all_obs = []
         for obs in self.seq:
@@ -120,71 +221,110 @@ class obs_sequence:
         return all_obs
 
     def obs_to_list(self, obs):
-        """put single observation into a list
-
-        discards obs_def
-        """
+        """put single observation into a list"""
         data = []
-        data.append(obs[0].split()[1]) # obs_num
-        data.extend(list(map(float,obs[1:self.n_copies+1]))) # all the copies
-        data.append(obs[self.n_copies+1]) # linked list info
+        data.append(obs[0].split()[1])  # obs_num
+        data.extend(list(map(float, obs[1 : self.n_copies + 1])))  # all the copies
+        data.append(obs[self.n_copies + 1])  # linked list info
         try: # HK todo only have to check loc3d or loc1d for the first observation, the whole file is the same
-            locI = obs.index('loc3d')
-            location = obs[locI+1].split()
+            locI = obs.index("loc3d")
+            location = obs[locI + 1].split()
             data.append(float(location[0])) # location x
             data.append(float(location[1])) # location y
             data.append(float(location[2])) # location z
             data.append(obs_sequence.vert[int(location[3])])
-            self.loc_mod = 'loc3d'
+            self.loc_mod = "loc3d"
         except ValueError:
             try:
-                locI = obs.index('loc1d')
-                location = obs[locI+1]
-                data.append(float(location)) # 1d location
-                self.loc_mod = 'loc1d'
+                locI = obs.index("loc1d")
+                location = obs[locI + 1]
+                data.append(float(location))  # 1d location
+                self.loc_mod = "loc1d"
             except ValueError:
-                raise ValueError("Neither 'loc3d' nor 'loc1d' could be found in the observation sequence.")
-        typeI = obs.index('kind') # type of observation
+                raise ValueError(
+                    "Neither 'loc3d' nor 'loc1d' could be found in the observation sequence."
+                )
+        typeI = obs.index("kind")  # type of observation
         type_value = obs[typeI + 1]
         if not self.types:
-            data.append('Identity')
+            data.append("Identity")
         else:
-            data.append(self.types[type_value]) # observation type
-
+            data.append(self.types[type_value])  # observation type
+
         # any observation specific obs def info is between here and the end of the list
+        # can be obs_def & external forward operator
+        metadata = obs[typeI + 2 : -2]
+        obs_def_metadata, external_metadata = self.split_metadata(metadata)
+        data.append(obs_def_metadata)
+        data.append(external_metadata)
+
         time = obs[-2].split()
-        data.append(int(time[0])) # seconds
-        data.append(int(time[1])) # days
-        data.append(convert_dart_time(int(time[0]), int(time[1]))) # datetime # HK todo what is approprate for 1d models?
-        data.append(float(obs[-1])) # obs error variance ?convert to sd?
-
+        data.append(int(time[0]))  # seconds
+        data.append(int(time[1]))  # days
+        data.append(
+            convert_dart_time(int(time[0]), int(time[1]))
+        )  # datetime # HK todo what is appropriate for 1d models?
+        data.append(float(obs[-1]))  # obs error variance ?convert to sd?
+
         return data
 
+    @staticmethod
+    def split_metadata(metadata):
+        """
+        Split the metadata list at the first occurrence of an element starting with 'external_FO'.
+
+        Args:
+            metadata (list of str): The metadata list to be split.
+
+        Returns:
+            tuple: Two sublists, the first containing elements before 'external_FO', and the second
+                containing 'external_FO' and all elements after it. If 'external_FO' is not found,
+                the first sublist contains the entire metadata list, and the second is empty.
+        """
+        for i, item in enumerate(metadata):
+            if item.startswith("external_FO"):
+                return metadata[:i], metadata[i:]
+        return metadata, []
+
     def list_to_obs(self, data):
+        """convert a list of data to an observation
+
+        Assuming the order of the list is obs_seq.copie_names
+
+        """
         obs = []
-        obs.append('OBS ' + str(data[0])) # obs_num lots of space
-        obs.extend(data[1:self.n_copies+1]) # all the copies
-        obs.append(data[self.n_copies+1]) # linked list info
-        obs.append('obdef') # TODO HK: metadata obs_def
+        obs.append("OBS " + str(data[0]))  # obs_num lots of space
+        obs.extend(data[1 : self.n_copies + 1])  # all the copies
+        obs.append(data[self.n_copies + 1])  # linked list info
+        obs.append("obdef")  # TODO HK: extended_FO obs_def
         obs.append(self.loc_mod)
-        if self.loc_mod == 'loc3d':
-            obs.append(' '.join(map(str, data[self.n_copies+2:self.n_copies+5])) + ' ' + str(self.reversed_vert[data[self.n_copies+5]]) ) # location x, y, z, vert
-            obs.append('kind') # this is type of observation
+        if self.loc_mod == "loc3d":
+            obs.append(
+                " ".join(map(str, data[self.n_copies + 2 : self.n_copies + 5]))
+                + " "
+                + str(self.reversed_vert[data[self.n_copies + 5]])
+            )  # location x, y, z, vert
+            obs.append("kind")  # this is type of observation
             obs.append(self.reverse_types[data[self.n_copies + 6]]) # observation type
-        elif self.loc_mod == 'loc1d':
-            obs.append(data[self.n_copies+2]) # 1d location
-            obs.append('kind') # this is type of observation
+            # Convert metadata to a string and append !HK @todo you are not converting to string
+            obs.extend(data[self.n_copies + 7])  # metadata
+            obs.extend(data[self.n_copies + 8])  # external forward operator
+        elif self.loc_mod == "loc1d":
+            obs.append(data[self.n_copies + 2])  # 1d location
+            obs.append("kind")  # this is type of observation
             obs.append(self.reverse_types[data[self.n_copies + 3]]) # observation type
-        obs.append(' '.join(map(str, data[-4:-2]))) # seconds, days
+            obs.extend(data[self.n_copies + 4])  # metadata
+            obs.extend(data[self.n_copies + 5])  # external forward operator
+        obs.append(" ".join(map(str, data[-4:-2])))  # seconds, days
         obs.append(data[-1]) # obs error variance
 
         return obs
 
     @staticmethod
     def generate_linked_list_pattern(n):
-        """Create a list of strings with the linked list pattern for n lines."""
+        """Create a list of strings with the linked list pattern for n observations."""
         result = []
-        for i in range(n-1):
+        for i in range(n - 1):
             col1 = i if i > 0 else -1
             col2 = i + 2
             col3 = -1
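As a worked example of the new split_metadata helper above (the metadata strings are made up), everything from the first element starting with 'external_FO' onward lands in the second sublist:

    meta = ['obs_def gps metadata', 'external_FO1', 'fo data']
    obs_def_part, external_part = obs_sequence.split_metadata(meta)
    # obs_def_part  -> ['obs_def gps metadata']
    # external_part -> ['external_FO1', 'fo data']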
@@ -192,108 +332,162 @@ class obs_sequence:
         result.append(f"{n-1:<12}{'-1':<11}{'-1'}")
         return result
 
-    def write_obs_seq(self, file, df=None):
+    def write_obs_seq(self, file):
         """
         Write the observation sequence to a file.
-
-        This function writes the observation sequence to disk.
-        If no DataFrame is provided, it writes the obs_sequence object to a file using the
-        header and all observations stored in the object.
-        If a DataFrame is provided, it creates a header and linked list from the DataFrame,
-        then writes the DataFrame obs to an obs_sequence file. Note the DataFrame is assumed
-        to have been created from an obs_sequence object.
-
-
-        Parameters:
+
+        This function writes the observation sequence stored in the obs_seq.DataFrame to a specified file.
+        It updates the header with the number of observations, converts coordinates back to radians
+        if necessary, drops unnecessary columns, sorts the DataFrame by time, and generates a linked
+        list pattern for reading by DART programs.
+
+        Args:
             file (str): The path to the file where the observation sequence will be written.
-            df (pandas.DataFrame, optional): A DataFrame containing the observation data. If not provided, the function uses self.header and self.all_obs.
-
-        Returns:
-            None
-
-        Examples:
-            ``obs_seq.write_obs_seq('/path/to/output/file')``
-            ``obs_seq.write_obs_seq('/path/to/output/file', df=obs_seq.df)``
-        """
-        with open(file, 'w') as f:
-
-            if df is not None:
-                # If a DataFrame is provided, update the header with the number of observations
-                num_rows = len(df)
-                replacement_string = f'num_obs: {num_rows:>10} max_num_obs: {num_rows:>10}'
-                new_header = [replacement_string if 'num_obs' in element else element for element in self.header]
-
-                for line in new_header[:-1]:
-                    f.write(str(line) + '\n')
-                first = 1
-                f.write(f"first: {first:>12} last: {num_rows:>12}\n")
-
-                # TODO HK is there something better than copying the whole thing here?
-                df_copy = df.copy() # copy since you want to change for writing.
-                # back to radians for obs_seq
-                if self.loc_mod == 'loc3d':
-                    df_copy['longitude'] = np.deg2rad(self.df['longitude'])
-                    df_copy['latitude'] = np.deg2rad(self.df['latitude'])
-                if 'bias' in df_copy.columns:
-                    df_copy = df_copy.drop(columns=['bias', 'sq_err'])
-
-                # linked list for reading by dart programs
-                df_copy = df_copy.sort_values(by=['time']) # sort the DataFrame by time
-                df_copy['obs_num'] = df.index + 1 # obs_num in time order
-                df_copy['linked_list'] = obs_sequence.generate_linked_list_pattern(len(df_copy)) # linked list pattern
-
-                def write_row(row):
-                    ob_write = self.list_to_obs(row.tolist())
-                    for line in ob_write:
-                        f.write(str(line) + '\n')
-
-                df_copy.apply(write_row, axis=1)
-
-            else:
-                # If no DataFrame is provided, use self.header and self.all_obs
-                for line in self.header:
-                    f.write(str(line) + '\n')
-                for obs in self.all_obs:
-                    ob_write = self.list_to_obs(obs)
-                    for line in ob_write:
-                        f.write(str(line) + '\n')
 
+        Notes:
+            - Longitude and latitude are converted back to radians if the location model is 'loc3d'.
+            - The 'bias' and 'sq_err' columns are dropped if they exist in the DataFrame.
+            - The DataFrame is sorted by the 'time' column.
+            - An 'obs_num' column is added to the DataFrame to number the observations in time order.
+            - A 'linked_list' column is generated to create a linked list pattern for the observations.
+
+        Example:
+            obsq.write_obs_seq('obs_seq.new')
+
+        """
+        with open(file, "w") as f:
+
+            # Update the header with the number of observations
+            num_rows = len(self.df)
+            replacement_string = f"num_obs: {num_rows:>10} max_num_obs: {num_rows:>10}"
+            new_header = [
+                replacement_string if "num_obs" in element else element
+                for element in self.header
+            ]
+
+            for line in new_header[:-1]:
+                f.write(str(line) + "\n")
+            first = 1
+            f.write(f"first: {first:>12} last: {num_rows:>12}\n")
+
+            # TODO HK is there something better than copying the whole thing here?
+            df_copy = self.df.copy()  # copy since you want to change for writing.
+            # back to radians for obs_seq
+            if self.loc_mod == "loc3d":
+                df_copy["longitude"] = np.deg2rad(self.df["longitude"]).round(16)
+                df_copy["latitude"] = np.deg2rad(self.df["latitude"]).round(16)
+            if "bias" in df_copy.columns:
+                df_copy = df_copy.drop(columns=["bias", "sq_err"])
+
+            # linked list for reading by dart programs
+            df_copy = df_copy.sort_values(
+                by=["time"], kind="stable"
+            )  # sort the DataFrame by time
+            df_copy["obs_num"] = self.df.index + 1  # obs_num in time order
+            df_copy["linked_list"] = obs_sequence.generate_linked_list_pattern(
+                len(df_copy)
+            )  # linked list pattern
+
+            def write_row(row):
+                ob_write = self.list_to_obs(row.tolist())
+                for line in ob_write:
+                    f.write(str(line) + "\n")
+
+            df_copy.apply(write_row, axis=1)
 
     def column_headers(self):
-        """define the columns for the dataframe """
+        """define the columns for the dataframe"""
         heading = []
-        heading.append('obs_num')
+        heading.append("obs_num")
         heading.extend(self.copie_names)
-        heading.append('linked_list')
-        if self.loc_mod == 'loc3d':
-            heading.append('longitude')
-            heading.append('latitude')
-            heading.append('vertical')
-            heading.append('vert_unit')
-        elif self.loc_mod == 'loc1d':
-            heading.append('location')
-        heading.append('type')
-        heading.append('seconds')
-        heading.append('days')
-        heading.append('time')
-        heading.append('obs_err_var')
+        heading.append("linked_list")
+        if self.loc_mod == "loc3d":
+            heading.append("longitude")
+            heading.append("latitude")
+            heading.append("vertical")
+            heading.append("vert_unit")
+        elif self.loc_mod == "loc1d":
+            heading.append("location")
+        heading.append("type")
+        heading.append("metadata")
+        heading.append("external_FO")
+        heading.append("seconds")
+        heading.append("days")
+        heading.append("time")
+        heading.append("obs_err_var")
         return heading
 
+    @requires_assimilation_info
+    def select_by_dart_qc(self, dart_qc):
+        """
+        Selects rows from the DataFrame based on the DART quality control flag.
+
+        Args:
+            dart_qc (int): The DART quality control flag to select.
+
+        Returns:
+            DataFrame: A DataFrame containing only the rows with the specified DART quality control flag.
+
+        Raises:
+            ValueError: If the DART quality control flag is not present in the DataFrame.
+        """
+        if dart_qc not in self.df["DART_quality_control"].unique():
+            raise ValueError(
+                f"DART quality control flag '{dart_qc}' not found in DataFrame."
+            )
+        else:
+            return self.df[self.df["DART_quality_control"] == dart_qc]
+
+    @requires_assimilation_info
+    def select_failed_qcs(self):
+        """
+        Select rows from the DataFrame where the DART quality control flag is greater than 0.
+
+        Returns:
+            pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag greater than 0.
+        """
+        return self.df[self.df["DART_quality_control"] > 0]
+
+    @requires_assimilation_info
+    def possible_vs_used(self):
+        """
+        Calculates the count of possible vs. used observations by type.
+
+        This method uses the obs_sequence DataFrame, which includes a 'type' column for the observation
+        type and an 'observation' column. The number of used observations ('used') is the total number
+        minus the observations that failed quality control checks (as determined by the `select_failed_qcs` method).
+        The result is a DataFrame with each observation type, the count of possible observations, and the count of
+        used observations.
+
+        Returns:
+            pd.DataFrame: A DataFrame with three columns: 'type', 'possible', and 'used'. 'type' is the observation type,
+            'possible' is the count of all observations of that type, and 'used' is the count of observations of that type
+            that passed quality control checks.
+        """
+        possible = self.df.groupby("type")["observation"].count()
+        possible.rename("possible", inplace=True)
+
+        failed_qcs = self.select_failed_qcs().groupby("type")["observation"].count()
+        used = possible - failed_qcs.reindex(possible.index, fill_value=0)
+        used.rename("used", inplace=True)
+
+        return pd.concat([possible, used], axis=1).reset_index()
+
     @staticmethod
     def is_binary(file):
         """Check if a file is binary file."""
-        with open(file, 'rb') as f:
+        with open(file, "rb") as f:
             chunk = f.read(1024)
-            if b'\0' in chunk:
+            if b"\0" in chunk:
                 return True
             return False
 
-
     @staticmethod
     def read_header(file):
         """Read the header and number of lines in the header of an ascii obs_seq file"""
         header = []
-        with open(file, 'r') as f:
+        with open(file, "r") as f:
             for line in f:
                 if "first:" in line and "last:" in line:
                     header.append(line.strip())
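Note that the QC helpers are now methods guarded by the requires_assimilation_info decorator, so they raise a ValueError on sequences without assimilation info. A short sketch, assuming a hypothetical obs_seq.final file:

    obs_seq = obs_sequence(file='obs_seq.final')
    counts = obs_seq.possible_vs_used()      # 'type', 'possible', 'used' per observation type
    failed = obs_seq.select_failed_qcs()     # rows with DART_quality_control > 0
    rejected = obs_seq.select_by_dart_qc(7)  # rows with one specific QC flag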
@@ -309,19 +503,19 @@ class obs_sequence:
             linecount = 0
             obs_types_definitions = -1000
             num_obs = 0
-            max_num_obs = 0
+            max_num_obs = 0
         # need to get:
         # number of obs_type_definitions
         # number of copies
         # number of qcs
-        with open(file, 'rb') as f:
+        with open(file, "rb") as f:
             while True:
                 # Read the record length
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
                 record = f.read(record_length)
-                if not record: # end of file
+                if not record:  # end of file
                     break
 
                 # Read the trailing record length (should match the leading one)
@@ -329,17 +523,19 @@ class obs_sequence:
 
                 linecount += 1
 
-                if linecount == 3:
-                    obs_types_definitions = struct.unpack('i', record)[0]
-                    continue
+                if linecount == 3:
+                    obs_types_definitions = struct.unpack("i", record)[0]
+                    continue
 
-                if linecount == 4+obs_types_definitions:
-                    num_copies, num_qcs, num_obs, max_num_obs = struct.unpack('iiii', record)[:16]
+                if linecount == 4 + obs_types_definitions:
+                    num_copies, num_qcs, num_obs, max_num_obs = struct.unpack(
+                        "iiii", record
+                    )[:16]
                     break
-
+
             # Go back to the beginning of the file
             f.seek(0)
-
+
             for _ in range(2):
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
@@ -349,14 +545,14 @@ class obs_sequence:
                 if not record: # end of file
                     break
 
-                obs_sequence.check_trailing_record_length(f, record_length)
-                header.append(record.decode('utf-8').strip())
+                obs_sequence.check_trailing_record_length(f, record_length)
+                header.append(record.decode("utf-8").strip())
 
             header.append(str(obs_types_definitions))
 
             # obs_types_definitions
-            for _ in range(3,4+obs_types_definitions):
-                # Read the record length
+            for _ in range(3, 4 + obs_types_definitions):
+                # Read the record length
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
@@ -366,21 +562,24 @@ class obs_sequence:
                 if not record: # end of file
                     break
 
-                obs_sequence.check_trailing_record_length(f, record_length)
+                obs_sequence.check_trailing_record_length(f, record_length)
 
                 if _ == 3:
-                    continue # num obs_types_definitions
+                    continue  # num obs_types_definitions
                 # Read an integer and a string from the record
-                integer_value = struct.unpack('i', record[:4])[0]
-                string_value = record[4:].decode('utf-8').strip()
+                integer_value = struct.unpack("i", record[:4])[0]
+                string_value = record[4:].decode("utf-8").strip()
                 header.append(f"{integer_value} {string_value}")
 
             header.append(f"num_copies: {num_copies} num_qc: {num_qcs}")
             header.append(f"num_obs: {num_obs} max_num_obs: {max_num_obs}")
-
-            #copie names
-            for _ in range(5+obs_types_definitions, 5+obs_types_definitions+num_copies+num_qcs+1):
-                # Read the record length
+
+            # copie names
+            for _ in range(
+                5 + obs_types_definitions,
+                5 + obs_types_definitions + num_copies + num_qcs + 1,
+            ):
+                # Read the record length
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
@@ -390,26 +589,26 @@ class obs_sequence:
                 if not record:
                     break
 
-                obs_sequence.check_trailing_record_length(f, record_length)
+                obs_sequence.check_trailing_record_length(f, record_length)
 
-                if _ == 5+obs_types_definitions:
+                if _ == 5 + obs_types_definitions:
                     continue
 
                 # Read the whole record as a string
-                string_value = record.decode('utf-8').strip()
+                string_value = record.decode("utf-8").strip()
                 header.append(string_value)
 
             # first and last obs
-            # Read the record length
+            # Read the record length
             record_length = obs_sequence.read_record_length(f)
 
             # Read the actual record
             record = f.read(record_length)
-
-            obs_sequence.check_trailing_record_length(f, record_length)
+
+            obs_sequence.check_trailing_record_length(f, record_length)
 
             # Read the whole record as a two integers
-            first, last = struct.unpack('ii', record)[:8]
+            first, last = struct.unpack("ii", record)[:8]
             header.append(f"first: {first} last: {last}")
 
         return header
@@ -418,7 +617,7 @@ class obs_sequence:
     def collect_obs_types(header):
         """Create a dictionary for the observation types in the obs_seq header"""
         num_obs_types = int(header[2])
-        types = dict([x.split() for x in header[3:num_obs_types+3]])
+        types = dict([x.split() for x in header[3 : num_obs_types + 3]])
         return types
 
     @staticmethod
@@ -426,32 +625,45 @@ class obs_sequence:
         """
         Extracts the names of the copies from the header of an obs_seq file.
 
-        Parameters:
+        Args:
             header (list): A list of strings representing the lines in the header of the obs_seq file.
 
         Returns:
-        tuple: A tuple containing two elements:
-            - copie_names (list): A list of strings representing the copy names with underscores for spaces.
+            tuple: A tuple containing two elements:
+                - copie_names (list): A list of strings representing the copy names with underscores for spaces.
                 - len(copie_names) (int): The number of copy names.
         """
         for i, line in enumerate(header):
             if "num_obs:" in line and "max_num_obs:" in line:
-                first_copie = i+1
+                first_copie = i + 1
                 break
-        copie_names = ['_'.join(x.split()) for x in header[first_copie:-1]] # first and last is last line of header
+        copie_names = [
+            "_".join(x.split()) for x in header[first_copie:-1]
+        ]  # first and last is last line of header
         return copie_names, len(copie_names)
 
+    @staticmethod
+    def num_qc_non_qc(header):
+        """Find the number of qc and non-qc copies in the header"""
+        for line in header:
+            if "num_copies:" in line and "num_qc:" in line:
+                num_non_qc = int(line.split()[1])
+                num_qc = int(line.split()[3])
+                return num_non_qc, num_qc
+
     @staticmethod
     def obs_reader(file, n):
         """Reads the ascii obs sequence file and returns a generator of the obs"""
-        previous_line = ''
-        with open(file, 'r') as f:
+        previous_line = ""
+        with open(file, "r") as f:
             for line in f:
                 if "OBS" in line or "OBS" in previous_line:
                     if "OBS" in line:
                         obs = []
-                        obs.append(line.strip())
-                        for i in range(n+100): # number of copies + 100. Needs to be bigger than any metadata
+                        obs.append(line.strip())
+                        for i in range(
+                            n + 100
+                        ):  # number of copies + 100. Needs to be bigger than any metadata
                             try:
                                 next_line = next(f)
                             except:
@@ -464,11 +676,15 @@ class obs_sequence:
                             else:
                                 obs.append(next_line.strip())
                         yield obs
-                    elif "OBS" in previous_line: # previous line is because I cannot use f.tell with next
+                    elif (
+                        "OBS" in previous_line
+                    ):  # previous line is because I cannot use f.tell with next
                         obs = []
-                        obs.append(previous_line.strip())
-                        obs.append(line.strip())
-                        for i in range(n+100): # number of copies + 100. Needs to be bigger than any metadata
+                        obs.append(previous_line.strip())
+                        obs.append(line.strip())
+                        for i in range(
+                            n + 100
+                        ):  # number of copies + 100. Needs to be bigger than any metadata
                             try:
                                 next_line = next(f)
                             except:
@@ -485,19 +701,19 @@ class obs_sequence:
 
     @staticmethod
     def check_trailing_record_length(file, expected_length):
-        """Reads and checks the trailing record length from the binary file written by Fortran.
+        """Reads and checks the trailing record length from the binary file written by Fortran.
 
-        Parameters:
-        file (file): The file object.
-        expected_length (int): The expected length of the trailing record.
+        Args:
+            file (file): The file object.
+            expected_length (int): The expected length of the trailing record.
 
-        Assuming 4 bytes:
-        | Record Length (4 bytes) | Data (N bytes) | Trailing Record Length (4 bytes) |
-        """
-        trailing_record_length_bytes = file.read(4)
-        trailing_record_length = struct.unpack('i', trailing_record_length_bytes)[0]
-        if expected_length != trailing_record_length:
-            raise ValueError("Record length mismatch in Fortran binary file")
+        Assuming 4 bytes:
+        | Record Length (4 bytes) | Data (N bytes) | Trailing Record Length (4 bytes) |
+        """
+        trailing_record_length_bytes = file.read(4)
+        trailing_record_length = struct.unpack("i", trailing_record_length_bytes)[0]
+        if expected_length != trailing_record_length:
+            raise ValueError("Record length mismatch in Fortran binary file")
 
     @staticmethod
     def read_record_length(file):
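The binary reader relies on the Fortran unformatted record framing documented above. A self-contained sketch of reading a single record, assuming 4-byte native-endian record markers as in read_record_length:

    import struct

    def read_fortran_record(f):
        head = f.read(4)
        if not head:
            return None                          # end of file
        (length,) = struct.unpack('i', head)     # leading record length
        data = f.read(length)                    # payload
        (tail,) = struct.unpack('i', f.read(4))  # trailing record length
        if tail != length:
            raise ValueError('Record length mismatch in Fortran binary file')
        return data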
@@ -505,18 +721,17 @@ class obs_sequence:
         record_length_bytes = file.read(4)
         if not record_length_bytes:
             return None # End of file
-        return struct.unpack('i', record_length_bytes)[0]
-
+        return struct.unpack("i", record_length_bytes)[0]
 
     def obs_binary_reader(self, file, n):
         """Reads the obs sequence binary file and returns a generator of the obs"""
         header_length = len(self.header)
-        with open(file, 'rb') as f:
+        with open(file, "rb") as f:
             # Skip the first len(obs_seq.header) lines
-            for _ in range(header_length-1):
+            for _ in range(header_length - 1):
                 # Read the record length
                 record_length = obs_sequence.read_record_length(f)
-                if record_length is None: # End of file
+                if record_length is None:  # End of file
                     break
 
                 # Skip the actual record
@@ -529,78 +744,78 @@ class obs_sequence:
             while True:
                 obs = []
                 obs_num += 1
-                obs.append(f"OBS {obs_num}")
-                for _ in range(n): # number of copies
+                obs.append(f"OBS {obs_num}")
+                for _ in range(n):  # number of copies
                     # Read the record length
                     record_length = obs_sequence.read_record_length(f)
                     if record_length is None:
                         break
                     # Read the actual record (copie)
                     record = f.read(record_length)
-                    obs.append(struct.unpack('d', record)[0])
+                    obs.append(struct.unpack("d", record)[0])
 
                     # Read the trailing record length (should match the leading one)
                     obs_sequence.check_trailing_record_length(f, record_length)
-
+
                 # linked list info
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
 
                 record = f.read(record_length)
-                int1, int2, int3 = struct.unpack('iii', record[:12])
+                int1, int2, int3 = struct.unpack("iii", record[:12])
                 linked_list_string = f"{int1:<12} {int2:<10} {int3:<12}"
                 obs.append(linked_list_string)
 
                 obs_sequence.check_trailing_record_length(f, record_length)
 
                 # location (note no location header "loc3d" or "loc1d" for binary files)
-                obs.append('loc3d')
+                obs.append("loc3d")
                 record_length = obs_sequence.read_record_length(f)
                 record = f.read(record_length)
-                x,y,z,vert = struct.unpack('dddi', record[:28])
-                location_string = f"{x} {y} {z} {vert}"
-                obs.append(location_string)
+                x, y, z, vert = struct.unpack("dddi", record[:28])
+                location_string = f"{x} {y} {z} {vert}"
+                obs.append(location_string)
 
                 obs_sequence.check_trailing_record_length(f, record_length)
-
+
                 # kind (type of observation) value
-                obs.append('kind')
+                obs.append("kind")
                 record_length_bytes = f.read(4)
-                record_length = struct.unpack('i', record_length_bytes)[0]
+                record_length = struct.unpack("i", record_length_bytes)[0]
                 record = f.read(record_length)
                 kind = f"{struct.unpack('i', record)[0]}"
                 obs.append(kind)
-
+
                 obs_sequence.check_trailing_record_length(f, record_length)
 
                 # time (seconds, days)
                 record_length = obs_sequence.read_record_length(f)
                 record = f.read(record_length)
-                seconds, days = struct.unpack('ii', record)[:8]
+                seconds, days = struct.unpack("ii", record)[:8]
                 time_string = f"{seconds} {days}"
                 obs.append(time_string)
 
                 obs_sequence.check_trailing_record_length(f, record_length)
-
+
                 # obs error variance
                 record_length = obs_sequence.read_record_length(f)
                 record = f.read(record_length)
-                obs.append(struct.unpack('d', record)[0])
-
+                obs.append(struct.unpack("d", record)[0])
+
                 obs_sequence.check_trailing_record_length(f, record_length)
 
                 yield obs
 
-    def composite_types(self, composite_types='use_default'):
+    def composite_types(self, composite_types="use_default"):
         """
         Set up and construct composite types for the DataFrame.
 
-        This function sets up composite types based on a provided YAML configuration or
-        a default configuration. It constructs new composite rows by combining specified
+        This function sets up composite types based on a provided YAML configuration or
+        a default configuration. It constructs new composite rows by combining specified
         components and adds them to the DataFrame.
 
-        Parameters:
+        Args:
             composite_types (str, optional): The YAML configuration for composite types. If 'use_default', the default configuration is used. Otherwise, a custom YAML configuration can be provided.
 
         Returns:
@@ -610,12 +825,12 @@ class obs_sequence:
             Exception: If there are repeat values in the components.
         """
 
-        if composite_types == 'use_default':
+        if composite_types == "use_default":
             composite_yaml = self.default_composite_types
         else:
             composite_yaml = composite_types
-        self.composite_types_dict = load_yaml_to_dict(composite_yaml)
-
+        self.composite_types_dict = load_yaml_to_dict(composite_yaml)
+
         components = []
         for value in self.composite_types_dict.values():
             components.extend(value["components"])
@@ -623,31 +838,234 @@ class obs_sequence:
         if len(components) != len(set(components)):
             raise Exception("There are repeat values in components.")
 
-        df_comp = self.df[self.df['type'].str.upper().isin([component.upper() for component in components])]
-        df_no_comp = self.df[~self.df['type'].str.upper().isin([component.upper() for component in components])]
+        df_comp = self.df[
+            self.df["type"]
+            .str.upper()
+            .isin([component.upper() for component in components])
+        ]
+        df_no_comp = self.df[
+            ~self.df["type"]
+            .str.upper()
+            .isin([component.upper() for component in components])
+        ]
 
         for key in self.composite_types_dict:
-            df_new = construct_composit(df_comp, key, self.composite_types_dict[key]['components'])
+            df_new = construct_composit(
+                df_comp, key, self.composite_types_dict[key]["components"]
+            )
             df_no_comp = pd.concat([df_no_comp, df_new], axis=0)
 
         return df_no_comp
-
+
+    @classmethod
+    def join(cls, obs_sequences, copies=None):
+        """
+        Join a list of observation sequences together.
+
+        This method combines the headers and observations from a list of obs_sequence objects
+        into a single obs_sequence object.
+
+        Args:
+            obs_sequences (list of obs_sequence): The list of obs_sequence objects to join.
+            copies (list of str, optional): A list of copy names to include in the combined data.
+                If not provided, all copies are included.
+
+        Returns:
+            A new obs_sequence object containing the combined data.
+
+        Example:
+            .. code-block:: python
+
+                obs_seq1 = obs_sequence(file='obs_seq1.final')
+                obs_seq2 = obs_sequence(file='obs_seq2.final')
+                obs_seq3 = obs_sequence(file='obs_seq3.final')
+                combined = obs_sequence.join([obs_seq1, obs_seq2, obs_seq3])
+        """
+        if not obs_sequences:
+            raise ValueError("The list of observation sequences is empty.")
+
+        # Create a new obs_sequence object with the combined data
+        combo = cls(file=None)
+
+        # Check if all obs_sequences have compatible attributes
+        first_loc_mod = obs_sequences[0].loc_mod
+        first_has_assimilation_info = obs_sequences[0].has_assimilation_info
+        first_has_posterior = obs_sequences[0].has_posterior
+        for obs_seq in obs_sequences:
+            if obs_seq.loc_mod != first_loc_mod:
+                raise ValueError(
+                    "All observation sequences must have the same loc_mod."
+                )
+            if obs_seq.has_assimilation_info != first_has_assimilation_info:
+                raise ValueError(
+                    "All observation sequences must have assimilation info."
+                )
+            if obs_seq.has_posterior != first_has_posterior:
+                raise ValueError(
+                    "All observation sequences must have the posterior info."
+                )
+        # HK @todo prior only
+        combo.loc_mod = first_loc_mod
+
+        # check the copies are compatible (list of copies to combine?)
+        # subset of copies if needed
+        if copies:
+            start_required_columns = ["obs_num", "observation"]
+            end_required_columns = [
+                "linked_list",
+                "longitude",
+                "latitude",
+                "vertical",
+                "vert_unit",
+                "type",
+                "metadata",
+                "external_FO",
+                "seconds",
+                "days",
+                "time",
+                "obs_err_var",
+            ]
+            required_columns = start_required_columns + end_required_columns
+
+            requested_columns = (
+                start_required_columns
+                + [item for item in copies if item not in required_columns]
+                + end_required_columns
+            )
+
+            for obs_seq in obs_sequences:
+                if not set(requested_columns).issubset(obs_seq.df.columns):
+                    raise ValueError(
+                        "All observation sequences must have the selected copies."
+                    )
+
+            # go through columns and create header
+            remove_list = [
+                "obs_num",
+                "linked_list",
+                "latitude",
+                "longitude",
+                "vertical",
+                "vert_unit",
+                "type",
+                "metadata",
+                "external_FO",
+                "time",
+                "seconds",
+                "days",
+                "obs_err_var",
+            ]
+            # using lists to retain copy order, non_qcs followed by qcs
+            combo.copie_names = [
+                item for item in requested_columns if item not in remove_list
+            ]
+            combo.non_qc_copie_names = [
+                item
+                for item in combo.copie_names
+                if item in obs_sequences[0].non_qc_copie_names
+            ]
+            combo.qc_copie_names = [
+                item
+                for item in combo.copie_names
+                if item in obs_sequences[0].qc_copie_names
+            ]
+
+            combo.n_copies = len(combo.copie_names)
+            combo.n_qc = len(combo.qc_copie_names)
+            combo.n_non_qc = len(combo.non_qc_copie_names)
+
+        else:
+            for obs_seq in obs_sequences:
+                if not obs_sequences[0].df.columns.isin(obs_seq.df.columns).all():
+                    raise ValueError(
+                        "All observation sequences must have the same copies."
+                    )
+            combo.n_copies = obs_sequences[0].n_copies
+            combo.n_qc = obs_sequences[0].n_qc
+            combo.n_non_qc = obs_sequences[0].n_non_qc
+            combo.copie_names = obs_sequences[0].copie_names
+
+        # todo HK @todo combine synonyms for obs?
+
+        # Initialize combined data
+        combined_types = []
+        combined_df = pd.DataFrame()
+        combo.all_obs = None  # set to None to force writing from the dataframe if write_obs_seq is called
+
+        # Iterate over the list of observation sequences and combine their data
+        for obs_seq in obs_sequences:
+            if copies:
+                combined_df = pd.concat(
+                    [combined_df, obs_seq.df[requested_columns]], ignore_index=True
+                )
+            else:
+                combined_df = pd.concat([combined_df, obs_seq.df], ignore_index=True)
+            combined_types.extend(list(obs_seq.reverse_types.keys()))
+
+        # create dictionary of types
+        keys = set(combined_types)
+        combo.reverse_types = {item: i + 1 for i, item in enumerate(keys)}
+        combo.types = {v: k for k, v in combo.reverse_types.items()}
+
+        # create linked list for obs
+        combo.df = combined_df.sort_values(by="time").reset_index(drop=True)
+        combo.df["linked_list"] = obs_sequence.generate_linked_list_pattern(
+            len(combo.df)
+        )
+        combo.df["obs_num"] = combined_df.index + 1
+        combo.create_header(len(combo.df))
+
+        # set assimilation info (mean and spread) (prior and posterior)
+        combo.has_assimilation_info = "prior_ensemble_mean".casefold() in map(
+            str.casefold, combo.df.columns
+        )
+        combo.has_assimilation_info = "prior_ensemble_spread".casefold() in map(
+            str.casefold, combo.df.columns
+        )
+        combo.has_posterior = "posterior_ensemble_mean".casefold() in map(
+            str.casefold, combo.df.columns
+        )
+        combo.has_posterior = "posterior_ensemble_spread".casefold() in map(
+            str.casefold, combo.df.columns
+        )
+
+        return combo
+
+    def create_header(self, n):
+        """Create a header for the obs_seq file from the obs_sequence object."""
+        assert (
+            self.n_copies == self.n_non_qc + self.n_qc
+        ), "n_copies must be equal to n_non_qc + n_qc"
+
+        self.header = []
+        self.header.append(f"obs_sequence")
+        self.header.append("obs_type_definitions")
+        self.header.append(f"{len(self.types)}")
+        for key, value in self.types.items():
+            self.header.append(f"{key} {value}")
+        self.header.append(f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}")
+        self.header.append(f"num_obs: {n} max_num_obs: {n}")
+        for copie in self.copie_names:
+            self.header.append(copie)
+        self.header.append(f"first: 1 last: {n}")
+
+
 def load_yaml_to_dict(file_path):
     """
     Load a YAML file and convert it to a dictionary.
 
-    Parameters:
+    Args:
         file_path (str): The path to the YAML file.
 
     Returns:
         dict: The YAML file content as a dictionary.
     """
     try:
-        with open(file_path, 'r') as file:
+        with open(file_path, "r") as file:
             return yaml.safe_load(file)
     except Exception as e:
         print(f"Error loading YAML file: {e}")
-        return None
+        return None
 
 
 def convert_dart_time(seconds, days):
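A sketch of the new join workflow, keeping only a subset of copies; the file and copy names are illustrative:

    seq1 = obs_sequence(file='obs_seq1.final')
    seq2 = obs_sequence(file='obs_seq2.final')
    combined = obs_sequence.join(
        [seq1, seq2],
        copies=['observation', 'prior_ensemble_mean', 'DART_quality_control'],
    )
    combined.write_obs_seq('obs_seq.combined')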
@@ -657,66 +1075,8 @@ def convert_dart_time(seconds, days):
     - base year for Gregorian calendar is 1601
     - dart time is seconds, days since 1601
     """
-    time = dt.datetime(1601,1,1) + dt.timedelta(days=days, seconds=seconds)
+    time = dt.datetime(1601, 1, 1) + dt.timedelta(days=days, seconds=seconds)
     return time
-
-def select_by_dart_qc(df, dart_qc):
-    """
-    Selects rows from a DataFrame based on the DART quality control flag.
-
-    Parameters:
-        df (DataFrame): A pandas DataFrame.
-        dart_qc (int): The DART quality control flag to select.
-
-    Returns:
-        DataFrame: A DataFrame containing only the rows with the specified DART quality control flag.
-
-    Raises:
-        ValueError: If the DART quality control flag is not present in the DataFrame.
-    """
-    if dart_qc not in df['DART_quality_control'].unique():
-        raise ValueError(f"DART quality control flag '{dart_qc}' not found in DataFrame.")
-    else:
-        return df[df['DART_quality_control'] == dart_qc]
-
-def select_failed_qcs(df):
-    """
-    Selects rows from a DataFrame where the DART quality control flag is greater than 0.
-
-    Parameters:
-        df (DataFrame): A pandas DataFrame.
-
-    Returns:
-        DataFrame: A DataFrame containing only the rows with a DART quality control flag greater than 0.
-    """
-    return df[df['DART_quality_control'] > 0]
-
-def possible_vs_used(df):
-    """
-    Calculates the count of possible vs. used observations by type.
-
-    This function takes a DataFrame containing observation data, including a 'type' column for the observation
-    type and an 'observation' column. The number of used observations ('used'), is the total number
-    minus the observations that failed quality control checks (as determined by the `select_failed_qcs` function).
-    The result is a DataFrame with each observation type, the count of possible observations, and the count of
-    used observations.
-
-    Parameters:
-        df (pd.DataFrame): A DataFrame with at least two columns: 'type' for the observation type and 'observation'
-            for the observation data. It may also contain other columns required by the `select_failed_qcs` function
-            to determine failed quality control checks.
-
-    Returns:
-        pd.DataFrame: A DataFrame with three columns: 'type', 'possible', and 'used'. 'type' is the observation type,
-        'possible' is the count of all observations of that type, and 'used' is the count of observations of that type
-        that passed quality control checks.
-
-    """
-    possible = df.groupby('type')['observation'].count()
-    possible.rename('possible', inplace=True)
-    used = df.groupby('type')['observation'].count() - select_failed_qcs(df).groupby('type')['observation'].count()
-    used.rename('used', inplace=True)
-    return pd.concat([possible, used], axis=1).reset_index()
 
 
 def construct_composit(df_comp, composite, components):
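A quick check of the epoch arithmetic above (DART time is seconds and days since 1601):

    convert_dart_time(seconds=60, days=1)
    # -> datetime.datetime(1601, 1, 2, 0, 1)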
@@ -724,10 +1084,10 @@ def construct_composit(df_comp, composite, components):
     Construct a composite DataFrame by combining rows from two components.
 
     This function takes two DataFrames and combines rows from them based on matching
-    location and time. It creates a new row with a composite type by combining
+    location and time. It creates a new row with a composite type by combining
     specified columns using the square root of the sum of squares method.
 
-    Parameters:
+    Args:
         df_comp (pd.DataFrame): The DataFrame containing the component rows to be combined.
         composite (str): The type name for the new composite rows.
         components (list of str): A list containing the type names of the two components to be combined.
@@ -735,27 +1095,29 @@ def construct_composit(df_comp, composite, components):
     Returns:
         merged_df (pd.DataFrame): The updated DataFrame with the new composite rows added.
     """
-    selected_rows = df_comp[df_comp['type'] == components[0].upper()]
-    selected_rows_v = df_comp[df_comp['type'] == components[1].upper()]
+    selected_rows = df_comp[df_comp["type"] == components[0].upper()]
+    selected_rows_v = df_comp[df_comp["type"] == components[1].upper()]
 
-    columns_to_combine = df_comp.filter(regex='ensemble').columns.tolist()
-    columns_to_combine.append('observation') # TODO HK: bias, sq_err, obs_err_var
-    merge_columns = ['latitude', 'longitude', 'vertical', 'time']
+    columns_to_combine = df_comp.filter(regex="ensemble").columns.tolist()
+    columns_to_combine.append("observation")  # TODO HK: bias, sq_err, obs_err_var
+    merge_columns = ["latitude", "longitude", "vertical", "time"]
 
     print("duplicates in u: ", selected_rows[merge_columns].duplicated().sum())
-    print("duplicates in v: ",selected_rows_v[merge_columns].duplicated().sum())
+    print("duplicates in v: ", selected_rows_v[merge_columns].duplicated().sum())
 
     # Merge the two DataFrames on location and time columns
-    merged_df = pd.merge(selected_rows, selected_rows_v, on=merge_columns, suffixes=('', '_v'))
+    merged_df = pd.merge(
+        selected_rows, selected_rows_v, on=merge_columns, suffixes=("", "_v")
+    )
 
     # Apply the square root of the sum of squares method to the relevant columns
     for col in columns_to_combine:
-        merged_df[col] = np.sqrt(merged_df[col]**2 + merged_df[f'{col}_v']**2)
+        merged_df[col] = np.sqrt(merged_df[col] ** 2 + merged_df[f"{col}_v"] ** 2)
 
     # Create the new composite rows
-    merged_df['type'] = composite.upper()
-    merged_df = merged_df.drop(columns=[col for col in merged_df.columns if col.endswith('_v')])
+    merged_df["type"] = composite.upper()
+    merged_df = merged_df.drop(
+        columns=[col for col in merged_df.columns if col.endswith("_v")]
+    )
 
     return merged_df
-
-
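Since construct_composit combines the two component types with the square root of the sum of squares, a composite wind observation comes out as sqrt(u**2 + v**2) row by row. A hedged sketch, assuming the component type names below appear in the sequence and in the composite_types YAML:

    # via the YAML configuration (the packaged composite_types.yaml by default)
    df_with_composites = obs_seq.composite_types()

    # or directly, with hypothetical type names
    wind = construct_composit(
        df_comp, 'ACARS_HORIZONTAL_WIND',
        ['ACARS_U_WIND_COMPONENT', 'ACARS_V_WIND_COMPONENT'],
    )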