pydartdiags 0.0.43__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 import pandas as pd
 import datetime as dt
 import numpy as np
@@ -5,86 +6,140 @@ import os
 import yaml
 import struct

+
 def requires_assimilation_info(func):
     def wrapper(self, *args, **kwargs):
         if self.has_assimilation_info:
             return func(self, *args, **kwargs)
         else:
-            raise ValueError("Assimilation information is required to call this function.")
+            raise ValueError(
+                "Assimilation information is required to call this function."
+            )
+
     return wrapper

+
 def requires_posterior_info(func):
     def wrapper(self, *args, **kwargs):
-        if self.has_posterior_info:
+        if self.has_posterior:
             return func(self, *args, **kwargs)
         else:
             raise ValueError("Posterior information is required to call this function.")
+
     return wrapper


 class obs_sequence:
-    """Create an obs_sequence object from an ascii observation sequence file.
+    """
+    Initialize an obs_sequence object from an ASCII or binary observation sequence file,
+    or create an empty obs_sequence object from scratch.
+
+    Args:
+        file (str): The input observation sequence ASCII or binary file.
+            If None, an empty obs_sequence object is created from scratch.
+
+    Returns:
+        An obs_sequence object

     Attributes:
-        df (pandas.DataFrame): DataFrame containing all the observations.
-        all_obs (list): List of all observations, each observation is a list.
-        header (str): Header from the ascii file.
-        vert (dict): Dictionary of dart vertical units.
-        types (dict): Dictionary of types in the observation sequence file.
-        copie_names (list): Names of copies in the observation sequence file.
-            Spelled 'copie' to avoid conflict with the Python built-in copy function.
+        df (pandas.DataFrame): The DataFrame containing the observation sequence data.
+        header (list): The header of the observation sequence.
+        copie_names (list): The names of the copies in the observation sequence.
+            Spelled 'copie' to avoid conflict with the Python built-in 'copy'.
             Spaces are replaced with underscores in copie_names.
-
-    Parameters:
-        file : the input observation sequence ascii file
-
-    Example:
-        Read the observation sequence from file:
-        ``obs_seq = obs_sequence('/home/data/obs_seq.final.ascii.small')``
-        Access the resulting pandas DataFrame:
-        ``obs_seq.df``
-
-    For 3D sphere models: latitude and longitude are in degrees in the DataFrame
-
-    Calculations:
-
-        - sq_err = (mean-obs)**2
-        - bias = (mean-obs)
-        - rmse = sqrt( sum((mean-obs)**2)/n )
-        - bias = sum((mean-obs)/n)
-        - spread = sum(sd)
-        - totalspread = sqrt(sum(sd+obs_err_var))
-
+        non_qc_copie_names (list): The names of the copies not including quality control,
+            e.g. observation, mean, ensemble_members
+        qc_copie_names (list): The names of the quality control copies, e.g. DART_QC
+        n_copies (int): The total number of copies in the observation sequence.
+        n_non_qc (int): The number of copies not including quality control.
+        n_qc (int): The number of quality control copies.
+        vert (dict): A dictionary mapping DART vertical coordinate types to their
+            corresponding integer values.
+
+            - undefined: 'VERTISUNDEF'
+            - surface: 'VERTISSURFACE' (value is surface elevation in meters)
+            - model level: 'VERTISLEVEL'
+            - pressure: 'VERTISPRESSURE' (in Pascals)
+            - height: 'VERTISHEIGHT' (in meters)
+            - scale height: 'VERTISSCALEHEIGHT' (unitless)
+        loc_mod (str): The location model, either 'loc3d' or 'loc1d'.
+            For 3D sphere models: latitude and longitude are in degrees in the DataFrame.
+        types (dict): Dictionary of types of observations in the observation sequence,
+            e.g. {23: 'ACARS_TEMPERATURE'}
+        reverse_types (dict): Dictionary of types with keys and values reversed, e.g.
+            {'ACARS_TEMPERATURE': 23}
+        synonyms_for_obs (list): List of synonyms for the observation column in the DataFrame.
+            The default list is
+
+            .. code-block:: python
+
+                ['NCEP BUFR observation',
+                 'AIRS observation',
+                 'GTSPP observation',
+                 'SST observation',
+                 'observations',
+                 'WOD observation']
+
+            You can add more synonyms by providing a list of strings when
+            creating the obs_sequence object.
+
+            .. code-block:: python
+
+                obs_sequence(file, synonyms=['synonym1', 'synonym2']).df
+
+        has_assimilation_info (bool): Indicates if assimilation information is present.
+        has_posterior (bool): Indicates if posterior information is present.
+        seq (generator): Generator of observations from the observation sequence file.
+        all_obs (list): List of all observations, each observation is a list.
+            Valid when the obs_sequence is created from a file.
+            Set to None when the obs_sequence is created from scratch or multiple
+            obs_sequences are joined.
     """
-    ## static variables
-    # vertrical coordinate:
-    #   undefined 'VERTISUNDEF'
-    #   surface 'VERTISSURFACE' (value is surface elevation in m)
-    #   model level 'VERTISLEVEL'
-    #   pressure 'VERTISPRESSURE' (in pascals)
-    #   height 'VERTISHEIGHT' (in meters)
-    #   scale height 'VERTISSCALEHEIGHT' (unitless)
-    vert = {-2: 'undefined',
-            -1: 'surface (m)',
-            1: 'model level',
-            2: 'pressure (Pa)',
-            3: 'height (m)',
-            4: 'scale height' }
+
+    vert = {
+        -2: "undefined",
+        -1: "surface (m)",
+        1: "model level",
+        2: "pressure (Pa)",
+        3: "height (m)",
+        4: "scale height",
+    }

     reversed_vert = {value: key for key, value in vert.items()}

-
     def __init__(self, file, synonyms=None):
-        self.loc_mod = 'None'
+        """
+        Create an obs_sequence object from an ASCII or binary observation sequence file,
+        or create an empty obs_sequence object from scratch.
+
+        Args:
+            file (str): The input observation sequence ASCII or binary file.
+                If None, an empty obs_sequence object is created from scratch.
+            synonyms (list, optional): List of synonyms for the observation column in the DataFrame.
+
+        Returns:
+            an obs_sequence object
+
+        Examples:
+
+        .. code-block:: python
+
+            obs_seq = obs_sequence(file='obs_seq.final')
+
+        """
+
+        self.loc_mod = "None"
         self.has_assimilation_info = False
         self.has_posterior = False
         self.file = file
-        self.synonyms_for_obs = ['NCEP BUFR observation',
-                                 'AIRS observation',
-                                 'GTSPP observation',
-                                 'SST observation',
-                                 'observations',
-                                 'WOD observation']
+        self.synonyms_for_obs = [
+            "NCEP BUFR observation",
+            "AIRS observation",
+            "GTSPP observation",
+            "SST observation",
+            "observations",
+            "WOD observation",
+        ]
         if synonyms:
             if isinstance(synonyms, list):
                 self.synonyms_for_obs.extend(synonyms)
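
The rewritten docstrings above document the new constructor behavior. A minimal usage sketch based on them (the import path assumes the package's module layout; the file name and extra synonym are hypothetical):

    from pydartdiags.obs_sequence import obs_sequence as obsq

    # extra synonyms are renamed to 'observation' in the DataFrame
    seq = obsq.obs_sequence('obs_seq.final', synonyms=['radiance observation'])
    print(seq.loc_mod, seq.n_copies)
    print(seq.df.head())
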
@@ -92,18 +147,22 @@ class obs_sequence:
                 self.synonyms_for_obs.append(synonyms)

         if file is None:
-            # Early exit for testing purposes
+            # Early exit - for testing purposes or creating obs_seq objects from scratch
             self.df = pd.DataFrame()
             self.types = {}
             self.reverse_types = {}
             self.copie_names = []
-            self.n_copies = 0
+            self.non_qc_copie_names = []
+            self.qc_copie_names = []
+            self.n_copies = 0  # copies including qc
+            self.n_non_qc = 0  # copies not including qc
+            self.n_qc = 0  # number of qc copies
             self.seq = []
             self.all_obs = []
             return

         module_dir = os.path.dirname(__file__)
-        self.default_composite_types = os.path.join(module_dir,"composite_types.yaml")
+        self.default_composite_types = os.path.join(module_dir, "composite_types.yaml")

         if self.is_binary(file):
             self.header = self.read_binary_header(file)
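
Passing `file=None` now takes the early-exit path above and returns an empty but fully initialized object, which the new `join` classmethod later in this diff relies on. A sketch, assuming the same `obsq` alias as above:

    empty = obsq.obs_sequence(file=None)
    assert empty.df.empty
    assert empty.n_copies == empty.n_non_qc == empty.n_qc == 0
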
@@ -113,40 +172,47 @@ class obs_sequence:
         self.types = self.collect_obs_types(self.header)
         self.reverse_types = {v: k for k, v in self.types.items()}
         self.copie_names, self.n_copies = self.collect_copie_names(self.header)
+        self.n_non_qc, self.n_qc = self.num_qc_non_qc(self.header)
+        self.non_qc_copie_names = self.copie_names[: self.n_non_qc]
+        self.qc_copie_names = self.copie_names[self.n_non_qc :]

         if self.is_binary(file):
             self.seq = self.obs_binary_reader(file, self.n_copies)
-            self.loc_mod = 'loc3d' # only loc3d supported for binary, & no way to check
+            self.loc_mod = "loc3d"  # only loc3d supported for binary, & no way to check
         else:
             self.seq = self.obs_reader(file, self.n_copies)

-        self.all_obs = self.create_all_obs() # uses up the generator
+        self.all_obs = self.create_all_obs()  # uses up the generator
         # at this point you know if the seq is loc3d or loc1d
-        if self.loc_mod == 'None':
-            raise ValueError("Neither 'loc3d' nor 'loc1d' could be found in the observation sequence.")
+        if self.loc_mod == "None":
+            raise ValueError(
+                "Neither 'loc3d' nor 'loc1d' could be found in the observation sequence."
+            )
         self.columns = self.column_headers()
-        self.df = pd.DataFrame(self.all_obs, columns = self.columns)
-        if self.loc_mod == 'loc3d':
-            self.df['longitude'] = np.rad2deg(self.df['longitude'])
-            self.df['latitude'] = np.rad2deg(self.df['latitude'])
+        self.df = pd.DataFrame(self.all_obs, columns=self.columns)
+        if self.loc_mod == "loc3d":
+            self.df["longitude"] = np.rad2deg(self.df["longitude"])
+            self.df["latitude"] = np.rad2deg(self.df["latitude"])
         # rename 'X observation' to observation
-        self.synonyms_for_obs = [synonym.replace(' ', '_') for synonym in self.synonyms_for_obs]
-        rename_dict = {old: 'observation' for old in self.synonyms_for_obs if old in self.df.columns}
+        self.synonyms_for_obs = [
+            synonym.replace(" ", "_") for synonym in self.synonyms_for_obs
+        ]
+        rename_dict = {
+            old: "observation"
+            for old in self.synonyms_for_obs
+            if old in self.df.columns
+        }
         self.df = self.df.rename(columns=rename_dict)

-        # calculate bias and sq_err is the obs_seq is an obs_seq.final
-        if 'prior_ensemble_mean'.casefold() in map(str.casefold, self.columns):
+        # check if the assimilation info is present
+        if "prior_ensemble_mean".casefold() in map(str.casefold, self.columns):
             self.has_assimilation_info = True
-            self.df['prior_bias'] = (self.df['prior_ensemble_mean'] - self.df['observation'])
-            self.df['prior_sq_err'] = self.df['prior_bias']**2 # squared error
-        if 'posterior_ensemble_mean'.casefold() in map(str.casefold, self.columns):
-            self.has_posterior_info = True
-            self.df['posterior_bias'] = (self.df['posterior_ensemble_mean'] - self.df['observation'])
-            self.df['posterior_sq_err'] = self.df['posterior_bias']**2
+        if "posterior_ensemble_mean".casefold() in map(str.casefold, self.columns):
+            self.has_posterior = True

     def create_all_obs(self):
-        """ steps through the generator to create a
-        list of all observations in the sequence
+        """steps through the generator to create a
+        list of all observations in the sequence
         """
         all_obs = []
         for obs in self.seq:
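
Note the behavior change in this hunk: 0.0.43 computed `prior_bias`, `prior_sq_err`, `posterior_bias`, and `posterior_sq_err` during `__init__`, while 0.5.0 only sets the `has_assimilation_info` / `has_posterior` flags. Code that relied on those columns can recreate them from the formulas in the deleted lines:

    df = seq.df
    if seq.has_assimilation_info:
        df['prior_bias'] = df['prior_ensemble_mean'] - df['observation']
        df['prior_sq_err'] = df['prior_bias'] ** 2
    if seq.has_posterior:
        df['posterior_bias'] = df['posterior_ensemble_mean'] - df['observation']
        df['posterior_sq_err'] = df['posterior_bias'] ** 2
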
@@ -155,49 +221,50 @@ class obs_sequence:
         return all_obs

     def obs_to_list(self, obs):
-        """put single observation into a list
-
-        discards obs_def
-        """
+        """put single observation into a list"""
         data = []
-        data.append(obs[0].split()[1]) # obs_num
-        data.extend(list(map(float,obs[1:self.n_copies+1]))) # all the copies
-        data.append(obs[self.n_copies+1]) # linked list info
+        data.append(obs[0].split()[1])  # obs_num
+        data.extend(list(map(float, obs[1 : self.n_copies + 1])))  # all the copies
+        data.append(obs[self.n_copies + 1])  # linked list info
         try: # HK todo only have to check loc3d or loc1d for the first observation, the whole file is the same
-            locI = obs.index('loc3d')
-            location = obs[locI+1].split()
+            locI = obs.index("loc3d")
+            location = obs[locI + 1].split()
             data.append(float(location[0])) # location x
             data.append(float(location[1])) # location y
             data.append(float(location[2])) # location z
             data.append(obs_sequence.vert[int(location[3])])
-            self.loc_mod = 'loc3d'
+            self.loc_mod = "loc3d"
         except ValueError:
             try:
-                locI = obs.index('loc1d')
-                location = obs[locI+1]
-                data.append(float(location)) # 1d location
-                self.loc_mod = 'loc1d'
+                locI = obs.index("loc1d")
+                location = obs[locI + 1]
+                data.append(float(location))  # 1d location
+                self.loc_mod = "loc1d"
             except ValueError:
-                raise ValueError("Neither 'loc3d' nor 'loc1d' could be found in the observation sequence.")
-        typeI = obs.index('kind') # type of observation
+                raise ValueError(
+                    "Neither 'loc3d' nor 'loc1d' could be found in the observation sequence."
+                )
+        typeI = obs.index("kind")  # type of observation
         type_value = obs[typeI + 1]
         if not self.types:
-            data.append('Identity')
+            data.append("Identity")
         else:
-            data.append(self.types[type_value]) # observation type
-
+            data.append(self.types[type_value])  # observation type
+
         # any observation specific obs def info is between here and the end of the list
         # can be obs_def & external forward operator
-        metadata = obs[typeI+2:-2]
+        metadata = obs[typeI + 2 : -2]
         obs_def_metadata, external_metadata = self.split_metadata(metadata)
         data.append(obs_def_metadata)
         data.append(external_metadata)

         time = obs[-2].split()
-        data.append(int(time[0])) # seconds
-        data.append(int(time[1])) # days
-        data.append(convert_dart_time(int(time[0]), int(time[1]))) # datetime # HK todo what is approprate for 1d models?
-        data.append(float(obs[-1])) # obs error variance ?convert to sd?
+        data.append(int(time[0]))  # seconds
+        data.append(int(time[1]))  # days
+        data.append(
+            convert_dart_time(int(time[0]), int(time[1]))
+        )  # datetime # HK todo what is approprate for 1d models?
+        data.append(float(obs[-1]))  # obs error variance ?convert to sd?

         return data

@@ -215,41 +282,49 @@ class obs_sequence:
         the first sublist contains the entire metadata list, and the second is empty.
         """
         for i, item in enumerate(metadata):
-            if item.startswith('external_FO'):
+            if item.startswith("external_FO"):
                 return metadata[:i], metadata[i:]
         return metadata, []

     def list_to_obs(self, data):
+        """convert a list of data to an observation
+
+        Assuming the order of the list is obs_seq.copie_names
+
+        """
         obs = []
-        obs.append('OBS ' + str(data[0])) # obs_num lots of space
-        obs.extend(data[1:self.n_copies+1]) # all the copies
-        obs.append(data[self.n_copies+1]) # linked list info
-        obs.append('obdef') # TODO HK: metadata obs_def
+        obs.append("OBS " + str(data[0]))  # obs_num lots of space
+        obs.extend(data[1 : self.n_copies + 1])  # all the copies
+        obs.append(data[self.n_copies + 1])  # linked list info
+        obs.append("obdef")  # TODO HK: extended_FO obs_def
         obs.append(self.loc_mod)
-        if self.loc_mod == 'loc3d':
-            obs.append(' '.join(map(str, data[self.n_copies+2:self.n_copies+5])) + ' ' + str(self.reversed_vert[data[self.n_copies+5]]) ) # location x, y, z, vert
-            obs.append('kind') # this is type of observation
+        if self.loc_mod == "loc3d":
+            obs.append(
+                " ".join(map(str, data[self.n_copies + 2 : self.n_copies + 5]))
+                + " "
+                + str(self.reversed_vert[data[self.n_copies + 5]])
+            )  # location x, y, z, vert
+            obs.append("kind")  # this is type of observation
             obs.append(self.reverse_types[data[self.n_copies + 6]]) # observation type
-            # Convert metadata to a string and append
+            # Convert metadata to a string and append !HK @todo you are not converting to string
             obs.extend(data[self.n_copies + 7]) # metadata
-        elif self.loc_mod == 'loc1d':
-            obs.append(data[self.n_copies+2]) # 1d location
-            obs.append('kind') # this is type of observation
+            obs.extend(data[self.n_copies + 8])  # external forward operator
+        elif self.loc_mod == "loc1d":
+            obs.append(data[self.n_copies + 2])  # 1d location
+            obs.append("kind")  # this is type of observation
             obs.append(self.reverse_types[data[self.n_copies + 3]]) # observation type
-            # Convert metadata to a string and append
-            metadata = ' '.join(map(str, data[self.n_copies + 4:-4]))
-            if metadata:
-                obs.append(metadata) # metadata
-        obs.append(' '.join(map(str, data[-4:-2]))) # seconds, days
+            obs.extend(data[self.n_copies + 4])  # metadata
+            obs.extend(data[self.n_copies + 5])  # external forward operator
+        obs.append(" ".join(map(str, data[-4:-2])))  # seconds, days
         obs.append(data[-1]) # obs error variance

         return obs

     @staticmethod
     def generate_linked_list_pattern(n):
-        """Create a list of strings with the linked list pattern for n lines."""
+        """Create a list of strings with the linked list pattern for n observations."""
         result = []
-        for i in range(n-1):
+        for i in range(n - 1):
             col1 = i if i > 0 else -1
             col2 = i + 2
             col3 = -1
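
The linked-list pattern gives each observation its previous and next index in time order, with -1 meaning "none". Only the loop variables and the final line's format string are visible in this diff, so the sketch below assumes the loop writes the same column widths as that final line:

    def linked_list_pattern(n):
        # columns: previous obs, next obs, -1 terminator (widths assumed)
        rows = [f"{i if i > 0 else -1:<12}{i + 2:<11}{-1}" for i in range(n - 1)]
        rows.append(f"{n - 1:<12}{'-1':<11}{'-1'}")
        return rows

    print(linked_list_pattern(3))
    # ['-1          2          -1', '1           3          -1', '2           -1         -1']
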
@@ -257,101 +332,97 @@ class obs_sequence:
         result.append(f"{n-1:<12}{'-1':<11}{'-1'}")
         return result

-    def write_obs_seq(self, file, df=None):
+    def write_obs_seq(self, file):
         """
         Write the observation sequence to a file.
-
-        This function writes the observation sequence to disk.
-        If no DataFrame is provided, it writes the obs_sequence object to a file using the
-        header and all observations stored in the object.
-        If a DataFrame is provided,it creates a header and linked list from the DataFrame,
-        then writes the DataFrame obs to an obs_sequence file. Note the DataFrame is assumed
-        to have been created from obs_sequence object.
-
-
-        Parameters:
+
+        This function writes the observation sequence stored in the obs_seq.DataFrame to a specified file.
+        It updates the header with the number of observations, converts coordinates back to radians
+        if necessary, drops unnecessary columns, sorts the DataFrame by time, and generates a linked
+        list pattern for reading by DART programs.
+
+        Args:
             file (str): The path to the file where the observation sequence will be written.
-            df (pandas.DataFrame, optional): A DataFrame containing the observation data. If not provided, the function uses self.header and self.all_obs.
-
-        Returns:
-            None
-
-        Examples:
-            ``obs_seq.write_obs_seq('/path/to/output/file')``
-            ``obs_seq.write_obs_seq('/path/to/output/file', df=obs_seq.df)``
-        """
-        with open(file, 'w') as f:
-
-            if df is not None:
-                # If a DataFrame is provided, update the header with the number of observations
-                num_rows = len(df)
-                replacement_string = f'num_obs: {num_rows:>10} max_num_obs: {num_rows:>10}'
-                new_header = [replacement_string if 'num_obs' in element else element for element in self.header]
-
-                for line in new_header[:-1]:
-                    f.write(str(line) + '\n')
-                first = 1
-                f.write(f"first: {first:>12} last: {num_rows:>12}\n")
-
-                # TODO HK is there something better than copying the whole thing here?
-                df_copy = df.copy() # copy since you want to change for writing.
-                # back to radians for obs_seq
-                if self.loc_mod == 'loc3d':
-                    df_copy['longitude'] = np.deg2rad(self.df['longitude'])
-                    df_copy['latitude'] = np.deg2rad(self.df['latitude'])
-                if 'bias' in df_copy.columns:
-                    df_copy = df_copy.drop(columns=['bias', 'sq_err'])
-
-                # linked list for reading by dart programs
-                df_copy = df_copy.sort_values(by=['time']) # sort the DataFrame by time
-                df_copy['obs_num'] = df.index + 1 # obs_num in time order
-                df_copy['linked_list'] = obs_sequence.generate_linked_list_pattern(len(df_copy)) # linked list pattern
-
-                def write_row(row):
-                    ob_write = self.list_to_obs(row.tolist())
-                    for line in ob_write:
-                        f.write(str(line) + '\n')
-
-                df_copy.apply(write_row, axis=1)
-
-            else:
-                # If no DataFrame is provided, use self.header and self.all_obs
-                for line in self.header:
-                    f.write(str(line) + '\n')
-                for obs in self.all_obs:
-                    ob_write = self.list_to_obs(obs)
-                    for line in ob_write:
-                        f.write(str(line) + '\n')

+        Notes:
+            - Longitude and latitude are converted back to radians if the location model is 'loc3d'.
+            - The 'bias' and 'sq_err' columns are dropped if they exist in the DataFrame.
+            - The DataFrame is sorted by the 'time' column.
+            - An 'obs_num' column is added to the DataFrame to number the observations in time order.
+            - A 'linked_list' column is generated to create a linked list pattern for the observations.
+
+        Example:
+            obsq.write_obs_seq('obs_seq.new')
+
+        """
+        with open(file, "w") as f:
+
+            # If a DataFrame is provided, update the header with the number of observations
+            num_rows = len(self.df)
+            replacement_string = f"num_obs: {num_rows:>10} max_num_obs: {num_rows:>10}"
+            new_header = [
+                replacement_string if "num_obs" in element else element
+                for element in self.header
+            ]
+
+            for line in new_header[:-1]:
+                f.write(str(line) + "\n")
+            first = 1
+            f.write(f"first: {first:>12} last: {num_rows:>12}\n")
+
+            # TODO HK is there something better than copying the whole thing here?
+            df_copy = self.df.copy()  # copy since you want to change for writing.
+            # back to radians for obs_seq
+            if self.loc_mod == "loc3d":
+                df_copy["longitude"] = np.deg2rad(self.df["longitude"]).round(16)
+                df_copy["latitude"] = np.deg2rad(self.df["latitude"]).round(16)
+            if "bias" in df_copy.columns:
+                df_copy = df_copy.drop(columns=["bias", "sq_err"])
+
+            # linked list for reading by dart programs
+            df_copy = df_copy.sort_values(
+                by=["time"], kind="stable"
+            )  # sort the DataFrame by time
+            df_copy["obs_num"] = self.df.index + 1  # obs_num in time order
+            df_copy["linked_list"] = obs_sequence.generate_linked_list_pattern(
+                len(df_copy)
+            )  # linked list pattern
+
+            def write_row(row):
+                ob_write = self.list_to_obs(row.tolist())
+                for line in ob_write:
+                    f.write(str(line) + "\n")
+
+            df_copy.apply(write_row, axis=1)

     def column_headers(self):
-        """define the columns for the dataframe """
+        """define the columns for the dataframe"""
         heading = []
-        heading.append('obs_num')
+        heading.append("obs_num")
         heading.extend(self.copie_names)
-        heading.append('linked_list')
-        if self.loc_mod == 'loc3d':
-            heading.append('longitude')
-            heading.append('latitude')
-            heading.append('vertical')
-            heading.append('vert_unit')
-        elif self.loc_mod == 'loc1d':
-            heading.append('location')
-        heading.append('type')
-        heading.append('metadata')
-        heading.append('external_FO')
-        heading.append('seconds')
-        heading.append('days')
-        heading.append('time')
-        heading.append('obs_err_var')
+        heading.append("linked_list")
+        if self.loc_mod == "loc3d":
+            heading.append("longitude")
+            heading.append("latitude")
+            heading.append("vertical")
+            heading.append("vert_unit")
+        elif self.loc_mod == "loc1d":
+            heading.append("location")
+        heading.append("type")
+        heading.append("metadata")
+        heading.append("external_FO")
+        heading.append("seconds")
+        heading.append("days")
+        heading.append("time")
+        heading.append("obs_err_var")
         return heading

-    @requires_assimilation_info 
+    @requires_assimilation_info
     def select_by_dart_qc(self, dart_qc):
         """
         Selects rows from a DataFrame based on the DART quality control flag.

-        Parameters:
+        Args:
             df (DataFrame): A pandas DataFrame.
             dart_qc (int): The DART quality control flag to select.

@@ -361,10 +432,12 @@ class obs_sequence:
         Raises:
             ValueError: If the DART quality control flag is not present in the DataFrame.
         """
-        if dart_qc not in self.df['DART_quality_control'].unique():
-            raise ValueError(f"DART quality control flag '{dart_qc}' not found in DataFrame.")
+        if dart_qc not in self.df["DART_quality_control"].unique():
+            raise ValueError(
+                f"DART quality control flag '{dart_qc}' not found in DataFrame."
+            )
         else:
-            return self.df[self.df['DART_quality_control'] == dart_qc]
+            return self.df[self.df["DART_quality_control"] == dart_qc]

     @requires_assimilation_info
     def select_failed_qcs(self):
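
A usage sketch combining the reworked selector with the new writer (file names hypothetical; in the DART convention a quality-control flag of 0 means the observation was assimilated):

    seq = obsq.obs_sequence('obs_seq.final')
    seq.df = seq.select_by_dart_qc(0)  # keep only DART_quality_control == 0
    seq.write_obs_seq('obs_seq.qc0')   # write_obs_seq now always writes from seq.df
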
@@ -374,7 +447,7 @@ class obs_sequence:
         Returns:
             pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag greater than 0.
         """
-        return self.df[self.df['DART_quality_control'] > 0]
+        return self.df[self.df["DART_quality_control"] > 0]

     @requires_assimilation_info
     def possible_vs_used(self):
@@ -392,31 +465,29 @@ class obs_sequence:
         'possible' is the count of all observations of that type, and 'used' is the count of observations of that type
         that passed quality control checks.
         """
-        possible = self.df.groupby('type')['observation'].count()
-        possible.rename('possible', inplace=True)
-
-        failed_qcs = self.select_failed_qcs().groupby('type')['observation'].count()
+        possible = self.df.groupby("type")["observation"].count()
+        possible.rename("possible", inplace=True)
+
+        failed_qcs = self.select_failed_qcs().groupby("type")["observation"].count()
         used = possible - failed_qcs.reindex(possible.index, fill_value=0)
-        used.rename('used', inplace=True)
-
-        return pd.concat([possible, used], axis=1).reset_index()
+        used.rename("used", inplace=True)

+        return pd.concat([possible, used], axis=1).reset_index()

     @staticmethod
     def is_binary(file):
         """Check if a file is binary file."""
-        with open(file, 'rb') as f:
+        with open(file, "rb") as f:
             chunk = f.read(1024)
-            if b'\0' in chunk:
+            if b"\0" in chunk:
                 return True
         return False

-
     @staticmethod
     def read_header(file):
         """Read the header and number of lines in the header of an ascii obs_seq file"""
         header = []
-        with open(file, 'r') as f:
+        with open(file, "r") as f:
             for line in f:
                 if "first:" in line and "last:" in line:
                     header.append(line.strip())
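
The reformatted `possible_vs_used` above returns one row per observation type; a sketch of the expected shape (numbers illustrative):

    counts = seq.possible_vs_used()
    print(counts)
    #                      type  possible   used
    # 0  RADIOSONDE_TEMPERATURE     12345  11987
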
@@ -432,19 +503,19 @@ class obs_sequence:
         linecount = 0
         obs_types_definitions = -1000
         num_obs = 0
-        max_num_obs = 0 
+        max_num_obs = 0
         # need to get:
         # number of obs_type_definitions
         # number of copies
         # number of qcs
-        with open(file, 'rb') as f:
+        with open(file, "rb") as f:
             while True:
                 # Read the record length
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
                 record = f.read(record_length)
-                if not record: # end of file
+                if not record:  # end of file
                     break

                 # Read the trailing record length (should match the leading one)
@@ -452,17 +523,19 @@ class obs_sequence:

                 linecount += 1

-                if linecount == 3: 
-                    obs_types_definitions = struct.unpack('i', record)[0]
-                    continue
+                if linecount == 3:
+                    obs_types_definitions = struct.unpack("i", record)[0]
+                    continue

-                if linecount == 4+obs_types_definitions:
-                    num_copies, num_qcs, num_obs, max_num_obs = struct.unpack('iiii', record)[:16]
+                if linecount == 4 + obs_types_definitions:
+                    num_copies, num_qcs, num_obs, max_num_obs = struct.unpack(
+                        "iiii", record
+                    )[:16]
                     break
-        
+
             # Go back to the beginning of the file
             f.seek(0)
-        
+
             for _ in range(2):
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
@@ -472,14 +545,14 @@ class obs_sequence:
                 if not record: # end of file
                     break

-                obs_sequence.check_trailing_record_length(f, record_length)
-                header.append(record.decode('utf-8').strip())
+                obs_sequence.check_trailing_record_length(f, record_length)
+                header.append(record.decode("utf-8").strip())

             header.append(str(obs_types_definitions))

             # obs_types_definitions
-            for _ in range(3,4+obs_types_definitions):
-                # Read the record length 
+            for _ in range(3, 4 + obs_types_definitions):
+                # Read the record length
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
@@ -489,21 +562,24 @@ class obs_sequence:
                 if not record: # end of file
                     break

-                obs_sequence.check_trailing_record_length(f, record_length) 
+                obs_sequence.check_trailing_record_length(f, record_length)

                 if _ == 3:
-                    continue # num obs_types_definitions
+                    continue  # num obs_types_definitions
                 # Read an integer and a string from the record
-                integer_value = struct.unpack('i', record[:4])[0]
-                string_value = record[4:].decode('utf-8').strip()
+                integer_value = struct.unpack("i", record[:4])[0]
+                string_value = record[4:].decode("utf-8").strip()
                 header.append(f"{integer_value} {string_value}")

             header.append(f"num_copies: {num_copies} num_qc: {num_qcs}")
             header.append(f"num_obs: {num_obs} max_num_obs: {max_num_obs}")
-        
-            #copie names
-            for _ in range(5+obs_types_definitions, 5+obs_types_definitions+num_copies+num_qcs+1):
-                # Read the record length 
+
+            # copie names
+            for _ in range(
+                5 + obs_types_definitions,
+                5 + obs_types_definitions + num_copies + num_qcs + 1,
+            ):
+                # Read the record length
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
@@ -513,26 +589,26 @@ class obs_sequence:
                 if not record:
                     break

-                obs_sequence.check_trailing_record_length(f, record_length) 
+                obs_sequence.check_trailing_record_length(f, record_length)

-                if _ == 5+obs_types_definitions:
+                if _ == 5 + obs_types_definitions:
                     continue

                 # Read the whole record as a string
-                string_value = record.decode('utf-8').strip()
+                string_value = record.decode("utf-8").strip()
                 header.append(string_value)

             # first and last obs
-            # Read the record length 
+            # Read the record length
             record_length = obs_sequence.read_record_length(f)

             # Read the actual record
             record = f.read(record_length)
-        
-            obs_sequence.check_trailing_record_length(f, record_length) 
+
+            obs_sequence.check_trailing_record_length(f, record_length)

             # Read the whole record as a two integers
-            first, last = struct.unpack('ii', record)[:8]
+            first, last = struct.unpack("ii", record)[:8]
             header.append(f"first: {first} last: {last}")

         return header
@@ -541,7 +617,7 @@ class obs_sequence:
     def collect_obs_types(header):
         """Create a dictionary for the observation types in the obs_seq header"""
         num_obs_types = int(header[2])
-        types = dict([x.split() for x in header[3:num_obs_types+3]])
+        types = dict([x.split() for x in header[3 : num_obs_types + 3]])
         return types

     @staticmethod
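
A worked example of the header parsing above (header contents and type numbers illustrative); note the type numbers stay strings, which is why `obs_to_list` can index `self.types` with the raw token it reads from the file:

    header = ['obs_sequence', 'obs_type_definitions', '2',
              '23 ACARS_TEMPERATURE', '29 LAND_SFC_ALTIMETER',
              'num_copies: 1 num_qc: 1']
    types = dict(x.split() for x in header[3:int(header[2]) + 3])
    # {'23': 'ACARS_TEMPERATURE', '29': 'LAND_SFC_ALTIMETER'}
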
@@ -549,32 +625,45 @@ class obs_sequence:
         """
         Extracts the names of the copies from the header of an obs_seq file.

-        Parameters:
+        Args:
             header (list): A list of strings representing the lines in the header of the obs_seq file.

         Returns:
-            tuple: A tuple containing two elements:
-                - copie_names (list): A list of strings representing the copy names with underscores for spaces.
+            tuple: A tuple containing two elements:
+                - copie_names (list): A list of strings representing the copy names with underscores for spaces.
                 - len(copie_names) (int): The number of copy names.
         """
         for i, line in enumerate(header):
             if "num_obs:" in line and "max_num_obs:" in line:
-                first_copie = i+1
+                first_copie = i + 1
                 break
-        copie_names = ['_'.join(x.split()) for x in header[first_copie:-1]] # first and last is last line of header
+        copie_names = [
+            "_".join(x.split()) for x in header[first_copie:-1]
+        ]  # first and last is last line of header
         return copie_names, len(copie_names)

+    @staticmethod
+    def num_qc_non_qc(header):
+        """Find the number of qc and non-qc copies in the header"""
+        for line in header:
+            if "num_copies:" in line and "num_qc:" in line:
+                num_non_qc = int(line.split()[1])
+                num_qc = int(line.split()[3])
+                return num_non_qc, num_qc
+
     @staticmethod
     def obs_reader(file, n):
         """Reads the ascii obs sequence file and returns a generator of the obs"""
-        previous_line = ''
-        with open(file, 'r') as f:
+        previous_line = ""
+        with open(file, "r") as f:
             for line in f:
                 if "OBS" in line or "OBS" in previous_line:
                     if "OBS" in line:
                         obs = []
-                        obs.append(line.strip()) 
-                        for i in range(n+100): # number of copies + 100. Needs to be bigger than any metadata
+                        obs.append(line.strip())
+                        for i in range(
+                            n + 100
+                        ):  # number of copies + 100. Needs to be bigger than any metadata
                             try:
                                 next_line = next(f)
                             except:
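
The new `num_qc_non_qc` helper reads both counts off a single header line; a worked example (the split indices match the `num_copies: ... num_qc: ...` line format written above):

    line = 'num_copies: 3 num_qc: 2'
    num_non_qc, num_qc = int(line.split()[1]), int(line.split()[3])
    # num_non_qc == 3, num_qc == 2:
    # copie_names[:3] are data copies, copie_names[3:] are qc copies
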
@@ -587,11 +676,15 @@ class obs_sequence:
                             else:
                                 obs.append(next_line.strip())
                         yield obs
-                    elif "OBS" in previous_line: # previous line is because I cannot use f.tell with next
+                    elif (
+                        "OBS" in previous_line
+                    ):  # previous line is because I cannot use f.tell with next
                         obs = []
-                        obs.append(previous_line.strip()) 
-                        obs.append(line.strip())
-                        for i in range(n+100): # number of copies + 100. Needs to be bigger than any metadata
+                        obs.append(previous_line.strip())
+                        obs.append(line.strip())
+                        for i in range(
+                            n + 100
+                        ):  # number of copies + 100. Needs to be bigger than any metadata
                             try:
                                 next_line = next(f)
                             except:
@@ -608,19 +701,19 @@ class obs_sequence:

     @staticmethod
     def check_trailing_record_length(file, expected_length):
-        """Reads and checks the trailing record length from the binary file written by Fortran. 
+        """Reads and checks the trailing record length from the binary file written by Fortran.

-        Parameters:
-            file (file): The file object.
-            expected_length (int): The expected length of the trailing record.
+        Args:
+            file (file): The file object.
+            expected_length (int): The expected length of the trailing record.

-        Assuming 4 bytes:
-        | Record Length (4 bytes) | Data (N bytes) | Trailing Record Length (4 bytes) |
-        """
-        trailing_record_length_bytes = file.read(4)
-        trailing_record_length = struct.unpack('i', trailing_record_length_bytes)[0]
-        if expected_length != trailing_record_length:
-            raise ValueError("Record length mismatch in Fortran binary file")
+        Assuming 4 bytes:
+        | Record Length (4 bytes) | Data (N bytes) | Trailing Record Length (4 bytes) |
+        """
+        trailing_record_length_bytes = file.read(4)
+        trailing_record_length = struct.unpack("i", trailing_record_length_bytes)[0]
+        if expected_length != trailing_record_length:
+            raise ValueError("Record length mismatch in Fortran binary file")

     @staticmethod
     def read_record_length(file):
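
The binary readers here walk Fortran unformatted sequential records, where each payload is framed by its byte length on both sides. A self-contained sketch of that framing, assuming the same 4-byte native-endian length markers as the 'i' format used in the code:

    import io
    import struct

    payload = struct.pack('d', 287.15)  # one float64 copie value
    rec = struct.pack('i', len(payload)) + payload + struct.pack('i', len(payload))

    f = io.BytesIO(rec)
    n = struct.unpack('i', f.read(4))[0]          # leading record length
    value = struct.unpack('d', f.read(n))[0]      # data (287.15)
    assert struct.unpack('i', f.read(4))[0] == n  # trailing length must match
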
@@ -628,18 +721,17 @@ class obs_sequence:
         record_length_bytes = file.read(4)
         if not record_length_bytes:
             return None # End of file
-        return struct.unpack('i', record_length_bytes)[0]
-    
+        return struct.unpack("i", record_length_bytes)[0]

     def obs_binary_reader(self, file, n):
         """Reads the obs sequence binary file and returns a generator of the obs"""
         header_length = len(self.header)
-        with open(file, 'rb') as f:
+        with open(file, "rb") as f:
             # Skip the first len(obs_seq.header) lines
-            for _ in range(header_length-1):
+            for _ in range(header_length - 1):
                 # Read the record length
                 record_length = obs_sequence.read_record_length(f)
-                if record_length is None: # End of file
+                if record_length is None:  # End of file
                     break

                 # Skip the actual record
@@ -652,78 +744,78 @@ class obs_sequence:
             while True:
                 obs = []
                 obs_num += 1
-                obs.append(f"OBS {obs_num}") 
-                for _ in range(n): # number of copies
+                obs.append(f"OBS {obs_num}")
+                for _ in range(n):  # number of copies
                     # Read the record length
                     record_length = obs_sequence.read_record_length(f)
                     if record_length is None:
                         break
                     # Read the actual record (copie)
                     record = f.read(record_length)
-                    obs.append(struct.unpack('d', record)[0])
+                    obs.append(struct.unpack("d", record)[0])

                     # Read the trailing record length (should match the leading one)
                     obs_sequence.check_trailing_record_length(f, record_length)
-                
+
                 # linked list info
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break

                 record = f.read(record_length)
-                int1, int2, int3 = struct.unpack('iii', record[:12])
+                int1, int2, int3 = struct.unpack("iii", record[:12])
                 linked_list_string = f"{int1:<12} {int2:<10} {int3:<12}"
                 obs.append(linked_list_string)

                 obs_sequence.check_trailing_record_length(f, record_length)

                 # location (note no location header "loc3d" or "loc1d" for binary files)
-                obs.append('loc3d')
+                obs.append("loc3d")
                 record_length = obs_sequence.read_record_length(f)
                 record = f.read(record_length)
-                x,y,z,vert = struct.unpack('dddi', record[:28])
-                location_string = f"{x} {y} {z} {vert}" 
-                obs.append(location_string) 
+                x, y, z, vert = struct.unpack("dddi", record[:28])
+                location_string = f"{x} {y} {z} {vert}"
+                obs.append(location_string)

                 obs_sequence.check_trailing_record_length(f, record_length)
-            
+
                 # kind (type of observation) value
-                obs.append('kind')
+                obs.append("kind")
                 record_length_bytes = f.read(4)
-                record_length = struct.unpack('i', record_length_bytes)[0]
+                record_length = struct.unpack("i", record_length_bytes)[0]
                 record = f.read(record_length)
                 kind = f"{struct.unpack('i', record)[0]}"
                 obs.append(kind)
-            
+
                 obs_sequence.check_trailing_record_length(f, record_length)

                 # time (seconds, days)
                 record_length = obs_sequence.read_record_length(f)
                 record = f.read(record_length)
-                seconds, days = struct.unpack('ii', record)[:8]
+                seconds, days = struct.unpack("ii", record)[:8]
                 time_string = f"{seconds} {days}"
                 obs.append(time_string)

                 obs_sequence.check_trailing_record_length(f, record_length)
-            
+
                 # obs error variance
                 record_length = obs_sequence.read_record_length(f)
                 record = f.read(record_length)
-                obs.append(struct.unpack('d', record)[0])
-            
+                obs.append(struct.unpack("d", record)[0])
+

                 obs_sequence.check_trailing_record_length(f, record_length)

                 yield obs

-    def composite_types(self, composite_types='use_default'):
+    def composite_types(self, composite_types="use_default"):
         """
         Set up and construct composite types for the DataFrame.

-        This function sets up composite types based on a provided YAML configuration or 
-        a default configuration. It constructs new composite rows by combining specified 
+        This function sets up composite types based on a provided YAML configuration or
+        a default configuration. It constructs new composite rows by combining specified
         components and adds them to the DataFrame.

-        Parameters:
+        Args:
             composite_types (str, optional): The YAML configuration for composite types. If 'use_default', the default configuration is used. Otherwise, a custom YAML configuration can be provided.

         Returns:
@@ -733,12 +825,12 @@ class obs_sequence:
             Exception: If there are repeat values in the components.
         """

-        if composite_types == 'use_default':
+        if composite_types == "use_default":
             composite_yaml = self.default_composite_types
         else:
             composite_yaml = composite_types
-        self.composite_types_dict = load_yaml_to_dict(composite_yaml)    
-        
+        self.composite_types_dict = load_yaml_to_dict(composite_yaml)
+
         components = []
         for value in self.composite_types_dict.values():
             components.extend(value["components"])
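
The code above only assumes that each entry in the YAML maps a composite name to a 'components' list of two existing observation types. A hypothetical configuration with that shape, expressed as the dictionary `load_yaml_to_dict` would return (type names illustrative):

    composite_types_dict = {
        'RADIOSONDE_HORIZONTAL_WIND': {
            'components': ['RADIOSONDE_U_WIND_COMPONENT',
                           'RADIOSONDE_V_WIND_COMPONENT'],
        },
    }
    df_with_composites = seq.composite_types()  # or seq.composite_types('my.yaml')
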
@@ -746,31 +838,234 @@ class obs_sequence:
         if len(components) != len(set(components)):
             raise Exception("There are repeat values in components.")

-        df_comp = self.df[self.df['type'].str.upper().isin([component.upper() for component in components])]
-        df_no_comp = self.df[~self.df['type'].str.upper().isin([component.upper() for component in components])]
+        df_comp = self.df[
+            self.df["type"]
+            .str.upper()
+            .isin([component.upper() for component in components])
+        ]
+        df_no_comp = self.df[
+            ~self.df["type"]
+            .str.upper()
+            .isin([component.upper() for component in components])
+        ]

         for key in self.composite_types_dict:
-            df_new = construct_composit(df_comp, key, self.composite_types_dict[key]['components'])
+            df_new = construct_composit(
+                df_comp, key, self.composite_types_dict[key]["components"]
+            )
             df_no_comp = pd.concat([df_no_comp, df_new], axis=0)

         return df_no_comp
-    
+
+    @classmethod
+    def join(cls, obs_sequences, copies=None):
+        """
+        Join a list of observation sequences together.
+
+        This method combines the headers and observations from a list of obs_sequence objects
+        into a single obs_sequence object.
+
+        Args:
+            obs_sequences (list of obs_sequences): The list of observation sequence objects to join.
+            copies (list of str, optional): A list of copy names to include in the combined data.
+                If not provided, all copies are included.
+
+        Returns:
+            A new obs_sequence object containing the combined data.
+
+        Example:
+            .. code-block:: python
+
+                obs_seq1 = obs_sequence(file='obs_seq1.final')
+                obs_seq2 = obs_sequence(file='obs_seq2.final')
+                obs_seq3 = obs_sequence(file='obs_seq3.final')
+                combined = obs_sequence.join([obs_seq1, obs_seq2, obs_seq3])
+        """
+        if not obs_sequences:
+            raise ValueError("The list of observation sequences is empty.")
+
+        # Create a new obs_sequence object with the combined data
+        combo = cls(file=None)
+
+        # Check if all obs_sequences have compatible attributes
+        first_loc_mod = obs_sequences[0].loc_mod
+        first_has_assimilation_info = obs_sequences[0].has_assimilation_info
+        first_has_posterior = obs_sequences[0].has_posterior
+        for obs_seq in obs_sequences:
+            if obs_seq.loc_mod != first_loc_mod:
+                raise ValueError(
+                    "All observation sequences must have the same loc_mod."
+                )
+            if obs_seq.has_assimilation_info != first_has_assimilation_info:
+                raise ValueError(
+                    "All observation sequences must have assimilation info."
+                )
+            if obs_seq.has_posterior != first_has_posterior:
+                raise ValueError(
+                    "All observation sequences must have the posterior info."
+                )
+        # HK @todo prior only
+        combo.loc_mod = first_loc_mod
+
+        # check the copies are compatible (list of copies to combine?)
+        # subset of copies if needed
+        if copies:
+            start_required_columns = ["obs_num", "observation"]
+            end_required_columns = [
+                "linked_list",
+                "longitude",
+                "latitude",
+                "vertical",
+                "vert_unit",
+                "type",
+                "metadata",
+                "external_FO",
+                "seconds",
+                "days",
+                "time",
+                "obs_err_var",
+            ]
+            required_columns = start_required_columns + end_required_columns
+
+            requested_columns = (
+                start_required_columns
+                + [item for item in copies if item not in required_columns]
+                + end_required_columns
+            )
+
+            for obs_seq in obs_sequences:
+                if not set(requested_columns).issubset(obs_seq.df.columns):
+                    raise ValueError(
+                        "All observation sequences must have the selected copies."
+                    )
+
+            # go through columns and create header
+            remove_list = [
+                "obs_num",
+                "linked_list",
+                "latitude",
+                "longitude",
+                "vertical",
+                "vert_unit",
+                "type",
+                "metadata",
+                "external_FO",
+                "time",
+                "seconds",
+                "days",
+                "obs_err_var",
+            ]
+            # using lists to retain copy order, non_qcs followed by qcs
+            combo.copie_names = [
+                item for item in requested_columns if item not in remove_list
+            ]
+            combo.non_qc_copie_names = [
+                item
+                for item in combo.copie_names
+                if item in obs_sequences[0].non_qc_copie_names
+            ]
+            combo.qc_copie_names = [
+                item
+                for item in combo.copie_names
+                if item in obs_sequences[0].qc_copie_names
+            ]
+
+            combo.n_copies = len(combo.copie_names)
+            combo.n_qc = len(combo.qc_copie_names)
+            combo.n_non_qc = len(combo.non_qc_copie_names)
+
+        else:
+            for obs_seq in obs_sequences:
+                if not obs_sequences[0].df.columns.isin(obs_seq.df.columns).all():
+                    raise ValueError(
+                        "All observation sequences must have the same copies."
+                    )
+            combo.n_copies = obs_sequences[0].n_copies
+            combo.n_qc = obs_sequences[0].n_qc
+            combo.n_non_qc = obs_sequences[0].n_non_qc
+            combo.copie_names = obs_sequences[0].copie_names
+
+        # todo HK @todo combine synonyms for obs?
+
+        # Initialize combined data
+        combined_types = []
+        combined_df = pd.DataFrame()
+        combo.all_obs = None  # set to none to force writing from the dataframe if write_obs_seq is called
+
+        # Iterate over the list of observation sequences and combine their data
+        for obs_seq in obs_sequences:
+            if copies:
+                combined_df = pd.concat(
+                    [combined_df, obs_seq.df[requested_columns]], ignore_index=True
+                )
+            else:
+                combined_df = pd.concat([combined_df, obs_seq.df], ignore_index=True)
+            combined_types.extend(list(obs_seq.reverse_types.keys()))
+
+        # create dictionary of types
+        keys = set(combined_types)
+        combo.reverse_types = {item: i + 1 for i, item in enumerate(keys)}
+        combo.types = {v: k for k, v in combo.reverse_types.items()}
+
+        # create linked list for obs
+        combo.df = combined_df.sort_values(by="time").reset_index(drop=True)
+        combo.df["linked_list"] = obs_sequence.generate_linked_list_pattern(
+            len(combo.df)
+        )
+        combo.df["obs_num"] = combined_df.index + 1
+        combo.create_header(len(combo.df))
+
+        # set assimilation info (mean and spread) (prior and posterior)
+        combo.has_assimilation_info = "prior_ensemble_mean".casefold() in map(
+            str.casefold, combo.df.columns
+        )
+        combo.has_assimilation_info = "prior_ensemble_spread".casefold() in map(
+            str.casefold, combo.df.columns
+        )
+        combo.has_posterior = "posterior_ensemble_mean".casefold() in map(
+            str.casefold, combo.df.columns
+        )
+        combo.has_posterior = "posterior_ensemble_spread".casefold() in map(
+            str.casefold, combo.df.columns
+        )
+
+        return combo
+
+    def create_header(self, n):
+        """Create a header for the obs_seq file from the obs_sequence object."""
+        assert (
+            self.n_copies == self.n_non_qc + self.n_qc
+        ), "n_copies must be equal to n_non_qc + n_qc"
+
+        self.header = []
+        self.header.append(f"obs_sequence")
+        self.header.append("obs_type_definitions")
+        self.header.append(f"{len(self.types)}")
+        for key, value in self.types.items():
+            self.header.append(f"{key} {value}")
+        self.header.append(f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}")
+        self.header.append(f"num_obs: {n} max_num_obs: {n}")
+        for copie in self.copie_names:
+            self.header.append(copie)
+        self.header.append(f"first: 1 last: {n}")
+
+
 def load_yaml_to_dict(file_path):
     """
     Load a YAML file and convert it to a dictionary.

-    Parameters:
+    Args:
         file_path (str): The path to the YAML file.

     Returns:
         dict: The YAML file content as a dictionary.
     """
     try:
-        with open(file_path, 'r') as file:
+        with open(file_path, "r") as file:
             return yaml.safe_load(file)
     except Exception as e:
         print(f"Error loading YAML file: {e}")
-        return None
+        return None


 def convert_dart_time(seconds, days):
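
Beyond the docstring's example, the new `copies` argument subsets the copy columns while the required location, time, and metadata columns are always kept. A sketch (file and copy names hypothetical):

    seq1 = obsq.obs_sequence('obs_seq1.final')
    seq2 = obsq.obs_sequence('obs_seq2.final')
    combined = obsq.obs_sequence.join(
        [seq1, seq2],
        copies=['observation', 'prior_ensemble_mean', 'DART_quality_control'],
    )
    combined.write_obs_seq('obs_seq.combined')
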
@@ -780,18 +1075,19 @@ def convert_dart_time(seconds, days):
     - base year for Gregorian calendar is 1601
     - dart time is seconds, days since 1601
     """
-    time = dt.datetime(1601,1,1) + dt.timedelta(days=days, seconds=seconds)
+    time = dt.datetime(1601, 1, 1) + dt.timedelta(days=days, seconds=seconds)
     return time

+
 def construct_composit(df_comp, composite, components):
     """
     Construct a composite DataFrame by combining rows from two components.

     This function takes two DataFrames and combines rows from them based on matching
-    location and time. It creates a new row with a composite type by combining 
+    location and time. It creates a new row with a composite type by combining
     specified columns using the square root of the sum of squares method.

-    Parameters:
+    Args:
         df_comp (pd.DataFrame): The DataFrame containing the component rows to be combined.
         composite (str): The type name for the new composite rows.
         components (list of str): A list containing the type names of the two components to be combined.
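
A quick check of the time arithmetic documented above, which counts days and seconds from the Gregorian base date 1601-01-01 (399 years * 365 days + 96 leap days = 145731 days to 2000-01-01):

    import datetime as dt

    def convert_dart_time(seconds, days):
        return dt.datetime(1601, 1, 1) + dt.timedelta(days=days, seconds=seconds)

    assert convert_dart_time(0, 0) == dt.datetime(1601, 1, 1)
    assert convert_dart_time(43200, 145731) == dt.datetime(2000, 1, 1, 12, 0)
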
@@ -799,27 +1095,29 @@ def construct_composit(df_comp, composite, components):
     Returns:
         merged_df (pd.DataFrame): The updated DataFrame with the new composite rows added.
     """
-    selected_rows = df_comp[df_comp['type'] == components[0].upper()]
-    selected_rows_v = df_comp[df_comp['type'] == components[1].upper()]
+    selected_rows = df_comp[df_comp["type"] == components[0].upper()]
+    selected_rows_v = df_comp[df_comp["type"] == components[1].upper()]

-    columns_to_combine = df_comp.filter(regex='ensemble').columns.tolist()
-    columns_to_combine.append('observation') # TODO HK: bias, sq_err, obs_err_var
-    merge_columns = ['latitude', 'longitude', 'vertical', 'time']
+    columns_to_combine = df_comp.filter(regex="ensemble").columns.tolist()
+    columns_to_combine.append("observation")  # TODO HK: bias, sq_err, obs_err_var
+    merge_columns = ["latitude", "longitude", "vertical", "time"]

     print("duplicates in u: ", selected_rows[merge_columns].duplicated().sum())
-    print("duplicates in v: ",selected_rows_v[merge_columns].duplicated().sum())
+    print("duplicates in v: ", selected_rows_v[merge_columns].duplicated().sum())

     # Merge the two DataFrames on location and time columns
-    merged_df = pd.merge(selected_rows, selected_rows_v, on=merge_columns, suffixes=('', '_v'))
+    merged_df = pd.merge(
+        selected_rows, selected_rows_v, on=merge_columns, suffixes=("", "_v")
+    )

     # Apply the square root of the sum of squares method to the relevant columns
     for col in columns_to_combine:
-        merged_df[col] = np.sqrt(merged_df[col]**2 + merged_df[f'{col}_v']**2)
+        merged_df[col] = np.sqrt(merged_df[col] ** 2 + merged_df[f"{col}_v"] ** 2)

     # Create the new composite rows
-    merged_df['type'] = composite.upper()
-    merged_df = merged_df.drop(columns=[col for col in merged_df.columns if col.endswith('_v')])
+    merged_df["type"] = composite.upper()
+    merged_df = merged_df.drop(
+        columns=[col for col in merged_df.columns if col.endswith("_v")]
+    )

     return merged_df
-
-
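
The root-sum-of-squares combination above means, for example, that u and v wind components of 3.0 and 4.0 m/s merge into a composite magnitude of 5.0:

    import numpy as np

    u, v = 3.0, 4.0
    assert np.sqrt(u ** 2 + v ** 2) == 5.0
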