pydartdiags 0.0.43__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 import pandas as pd
 import datetime as dt
 import numpy as np
@@ -5,106 +6,152 @@ import os
 import yaml
 import struct
 
+
 def requires_assimilation_info(func):
     def wrapper(self, *args, **kwargs):
-        if self.has_assimilation_info:
+        if self.has_assimilation_info():
             return func(self, *args, **kwargs)
         else:
-            raise ValueError("Assimilation information is required to call this function.")
-    return wrapper
+            raise ValueError(
+                "Assimilation information is required to call this function."
+            )
 
-def requires_posterior_info(func):
-    def wrapper(self, *args, **kwargs):
-        if self.has_posterior_info:
-            return func(self, *args, **kwargs)
-        else:
-            raise ValueError("Posterior information is required to call this function.")
     return wrapper
 
 
 class obs_sequence:
-    """Create an obs_sequence object from an ascii observation sequence file.
+    """
+    Initialize an obs_sequence object from an ASCII or binary observation sequence file,
+    or create an empty obs_sequence object from scratch.
+
+    Args:
+        file (str): The input observation sequence ASCII or binary file.
+            If None, an empty obs_sequence object is created from scratch.
+
+    Returns:
+        An obs_sequence object
 
     Attributes:
-        df (pandas.DataFrame): DataFrame containing all the observations.
-        all_obs (list): List of all observations, each observation is a list.
-        header (str): Header from the ascii file.
-        vert (dict): Dictionary of dart vertical units.
-        types (dict): Dictionary of types in the observation sequence file.
-        copie_names (list): Names of copies in the observation sequence file.
-            Spelled 'copie' to avoid conflict with the Python built-in copy function.
+        df (pandas.DataFrame): The DataFrame containing the observation sequence data.
+        header (list): The header of the observation sequence.
+        copie_names (list): The names of the copies in the observation sequence.
+            Spelled 'copie' to avoid conflict with the Python built-in 'copy'.
             Spaces are replaced with underscores in copie_names.
-
-    Parameters:
-        file : the input observation sequence ascii file
-
-    Example:
-        Read the observation sequence from file:
-        ``obs_seq = obs_sequence('/home/data/obs_seq.final.ascii.small')``
-        Access the resulting pandas DataFrame:
-        ``obs_seq.df``
-
-    For 3D sphere models: latitude and longitude are in degrees in the DataFrame
-
-    Calculations:
-
-    - sq_err = (mean-obs)**2
-    - bias = (mean-obs)
-    - rmse = sqrt( sum((mean-obs)**2)/n )
-    - bias = sum((mean-obs)/n)
-    - spread = sum(sd)
-    - totalspread = sqrt(sum(sd+obs_err_var))
-
+        non_qc_copie_names (list): The names of the copies not including quality control,
+            e.g. observation, mean, ensemble_members
+        qc_copie_names (list): The names of the quality control copies, e.g. DART_QC
+        n_copies (int): The total number of copies in the observation sequence.
+        n_non_qc (int): The number of copies not including quality control.
+        n_qc (int): The number of quality control copies.
+        vert (dict): A dictionary mapping DART vertical coordinate types to their
+            corresponding integer values.
+
+            - undefined: 'VERTISUNDEF'
+            - surface: 'VERTISSURFACE' (value is surface elevation in meters)
+            - model level: 'VERTISLEVEL'
+            - pressure: 'VERTISPRESSURE' (in Pascals)
+            - height: 'VERTISHEIGHT' (in meters)
+            - scale height: 'VERTISSCALEHEIGHT' (unitless)
+        loc_mod (str): The location model, either 'loc3d' or 'loc1d'.
+            For 3D sphere models: latitude and longitude are in degrees in the DataFrame.
+        types (dict): Dictionary of types of observations in the observation sequence,
+            e.g. {23: 'ACARS_TEMPERATURE'}
+        reverse_types (dict): Dictionary of types with keys and values reversed, e.g.
+            {'ACARS_TEMPERATURE': 23}
+        synonyms_for_obs (list): List of synonyms for the observation column in the DataFrame.
+            The default list is
+
+            .. code-block:: python
+
+                ['NCEP BUFR observation',
+                 'AIRS observation',
+                 'GTSPP observation',
+                 'SST observation',
+                 'observations',
+                 'WOD observation']
+
+            You can add more synonyms by providing a list of strings when
+            creating the obs_sequence object.
+
+            .. code-block:: python
+
+                obs_sequence(file, synonyms=['synonym1', 'synonym2']).df
+
+        seq (generator): Generator of observations from the observation sequence file.
+        all_obs (list): List of all observations, each observation is a list.
+            Valid when the obs_sequence is created from a file.
+            Set to None when the obs_sequence is created from scratch or multiple
+            obs_sequences are joined.
     """
-    ## static variables
-    # vertrical coordinate:
-    # undefined 'VERTISUNDEF'
-    # surface 'VERTISSURFACE' (value is surface elevation in m)
-    # model level 'VERTISLEVEL'
-    # pressure 'VERTISPRESSURE' (in pascals)
-    # height 'VERTISHEIGHT' (in meters)
-    # scale height 'VERTISSCALEHEIGHT' (unitless)
-    vert = {-2: 'undefined',
-            -1: 'surface (m)',
-            1: 'model level',
-            2: 'pressure (Pa)',
-            3: 'height (m)',
-            4: 'scale height' }
+
+    vert = {
+        -2: "undefined",
+        -1: "surface (m)",
+        1: "model level",
+        2: "pressure (Pa)",
+        3: "height (m)",
+        4: "scale height",
+    }
 
     reversed_vert = {value: key for key, value in vert.items()}
 
-
     def __init__(self, file, synonyms=None):
-        self.loc_mod = 'None'
-        self.has_assimilation_info = False
-        self.has_posterior = False
+        """
+        Create an obs_sequence object from an ASCII or binary observation sequence file,
+        or create an empty obs_sequence object from scratch.
+
+        Args:
+            file (str): The input observation sequence ASCII or binary file.
+                If None, an empty obs_sequence object is created from scratch.
+            synonyms (list, optional): List of synonyms for the observation column in the DataFrame.
+
+        Returns:
+            an obs_sequence object
+            1D observations are given a datetime of days, seconds since 2000-01-01 00:00:00
+            3D observations are given a datetime of days, seconds since 1601-01-01 00:00:00 (DART Gregorian calendar)
+
+        Examples:
+
+        .. code-block:: python
+
+            obs_seq = obs_sequence(file='obs_seq.final')
+
+        """
+
+        self.loc_mod = "None"
         self.file = file
-        self.synonyms_for_obs = ['NCEP BUFR observation',
-                                 'AIRS observation',
-                                 'GTSPP observation',
-                                 'SST observation',
-                                 'observations',
-                                 'WOD observation']
+        self.synonyms_for_obs = [
+            "NCEP BUFR observation",
+            "AIRS observation",
+            "GTSPP observation",
+            "SST observation",
+            "observations",
+            "WOD observation",
+        ]
         if synonyms:
             if isinstance(synonyms, list):
                 self.synonyms_for_obs.extend(synonyms)
             else:
                 self.synonyms_for_obs.append(synonyms)
 
+        module_dir = os.path.dirname(__file__)
+        self.default_composite_types = os.path.join(module_dir, "composite_types.yaml")
+
         if file is None:
-            # Early exit for testing purposes
+            # Early exit - for testing purposes or creating obs_seq objects from scratch
            self.df = pd.DataFrame()
            self.types = {}
            self.reverse_types = {}
            self.copie_names = []
-            self.n_copies = 0
+            self.non_qc_copie_names = []
+            self.qc_copie_names = []
+            self.n_copies = 0  # copies including qc
+            self.n_non_qc = 0  # copies not including qc
+            self.n_qc = 0  # number of qc copies
            self.seq = []
            self.all_obs = []
            return
 
-        module_dir = os.path.dirname(__file__)
-        self.default_composite_types = os.path.join(module_dir,"composite_types.yaml")
-
        if self.is_binary(file):
            self.header = self.read_binary_header(file)
        else:
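Note that the guard decorator above now calls `has_assimilation_info()` as a method rather than reading a boolean attribute set once in `__init__`, so the check reflects the current state of the DataFrame (for example after sequences are joined). A minimal sketch of the pattern, with illustrative names only, not package code:

```python
def requires_info(func):
    def wrapper(self, *args, **kwargs):
        if self.has_info():  # re-evaluated on every call
            return func(self, *args, **kwargs)
        raise ValueError("Required information is missing.")
    return wrapper

class Example:
    def __init__(self, columns):
        self.columns = columns

    def has_info(self):
        # mirrors has_assimilation_info(): derived from the current columns
        return "prior_ensemble_mean" in self.columns

    @requires_info
    def prior_stat(self):
        return "ok"
```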
@@ -113,40 +160,41 @@ class obs_sequence:
         self.types = self.collect_obs_types(self.header)
         self.reverse_types = {v: k for k, v in self.types.items()}
         self.copie_names, self.n_copies = self.collect_copie_names(self.header)
+        self.n_non_qc, self.n_qc = self.num_qc_non_qc(self.header)
+        self.non_qc_copie_names = self.copie_names[: self.n_non_qc]
+        self.qc_copie_names = self.copie_names[self.n_non_qc :]
 
         if self.is_binary(file):
             self.seq = self.obs_binary_reader(file, self.n_copies)
-            self.loc_mod = 'loc3d' # only loc3d supported for binary, & no way to check
+            self.loc_mod = "loc3d"  # only loc3d supported for binary, & no way to check
         else:
             self.seq = self.obs_reader(file, self.n_copies)
 
-        self.all_obs = self.create_all_obs() # uses up the generator
+        self.all_obs = self.create_all_obs()  # uses up the generator
         # at this point you know if the seq is loc3d or loc1d
-        if self.loc_mod == 'None':
-            raise ValueError("Neither 'loc3d' nor 'loc1d' could be found in the observation sequence.")
+        if self.loc_mod == "None":
+            raise ValueError(
+                "Neither 'loc3d' nor 'loc1d' could be found in the observation sequence."
+            )
         self.columns = self.column_headers()
-        self.df = pd.DataFrame(self.all_obs, columns = self.columns)
-        if self.loc_mod == 'loc3d':
-            self.df['longitude'] = np.rad2deg(self.df['longitude'])
-            self.df['latitude'] = np.rad2deg(self.df['latitude'])
+        self.df = pd.DataFrame(self.all_obs, columns=self.columns)
+        if self.loc_mod == "loc3d":
+            self.df["longitude"] = np.rad2deg(self.df["longitude"])
+            self.df["latitude"] = np.rad2deg(self.df["latitude"])
         # rename 'X observation' to observation
-        self.synonyms_for_obs = [synonym.replace(' ', '_') for synonym in self.synonyms_for_obs]
-        rename_dict = {old: 'observation' for old in self.synonyms_for_obs if old in self.df.columns}
+        self.synonyms_for_obs = [
+            synonym.replace(" ", "_") for synonym in self.synonyms_for_obs
+        ]
+        rename_dict = {
+            old: "observation"
+            for old in self.synonyms_for_obs
+            if old in self.df.columns
+        }
         self.df = self.df.rename(columns=rename_dict)
 
-        # calculate bias and sq_err is the obs_seq is an obs_seq.final
-        if 'prior_ensemble_mean'.casefold() in map(str.casefold, self.columns):
-            self.has_assimilation_info = True
-            self.df['prior_bias'] = (self.df['prior_ensemble_mean'] - self.df['observation'])
-            self.df['prior_sq_err'] = self.df['prior_bias']**2 # squared error
-        if 'posterior_ensemble_mean'.casefold() in map(str.casefold, self.columns):
-            self.has_posterior_info = True
-            self.df['posterior_bias'] = (self.df['posterior_ensemble_mean'] - self.df['observation'])
-            self.df['posterior_sq_err'] = self.df['posterior_bias']**2
-
     def create_all_obs(self):
-        """ steps through the generator to create a
-        list of all observations in the sequence
+        """steps through the generator to create a
+        list of all observations in the sequence
         """
         all_obs = []
         for obs in self.seq:
@@ -155,49 +203,54 @@ class obs_sequence:
         return all_obs
 
     def obs_to_list(self, obs):
-        """put single observation into a list
-
-        discards obs_def
-        """
+        """put single observation into a list"""
         data = []
-        data.append(obs[0].split()[1]) # obs_num
-        data.extend(list(map(float,obs[1:self.n_copies+1]))) # all the copies
-        data.append(obs[self.n_copies+1]) # linked list info
+        data.append(obs[0].split()[1])  # obs_num
+        data.extend(list(map(float, obs[1 : self.n_copies + 1])))  # all the copies
+        data.append(obs[self.n_copies + 1])  # linked list info
         try: # HK todo only have to check loc3d or loc1d for the first observation, the whole file is the same
-            locI = obs.index('loc3d')
-            location = obs[locI+1].split()
+            locI = obs.index("loc3d")
+            location = obs[locI + 1].split()
             data.append(float(location[0])) # location x
             data.append(float(location[1])) # location y
             data.append(float(location[2])) # location z
             data.append(obs_sequence.vert[int(location[3])])
-            self.loc_mod = 'loc3d'
+            self.loc_mod = "loc3d"
         except ValueError:
             try:
-                locI = obs.index('loc1d')
-                location = obs[locI+1]
-                data.append(float(location)) # 1d location
-                self.loc_mod = 'loc1d'
+                locI = obs.index("loc1d")
+                location = obs[locI + 1]
+                data.append(float(location))  # 1d location
+                self.loc_mod = "loc1d"
             except ValueError:
-                raise ValueError("Neither 'loc3d' nor 'loc1d' could be found in the observation sequence.")
-        typeI = obs.index('kind') # type of observation
+                raise ValueError(
+                    "Neither 'loc3d' nor 'loc1d' could be found in the observation sequence."
+                )
+        typeI = obs.index("kind")  # type of observation
         type_value = obs[typeI + 1]
         if not self.types:
-            data.append('Identity')
+            data.append("Identity")
         else:
-            data.append(self.types[type_value]) # observation type
-
+            data.append(self.types[type_value])  # observation type
+
         # any observation specific obs def info is between here and the end of the list
         # can be obs_def & external forward operator
-        metadata = obs[typeI+2:-2]
+        metadata = obs[typeI + 2 : -2]
         obs_def_metadata, external_metadata = self.split_metadata(metadata)
         data.append(obs_def_metadata)
         data.append(external_metadata)
 
         time = obs[-2].split()
-        data.append(int(time[0])) # seconds
-        data.append(int(time[1])) # days
-        data.append(convert_dart_time(int(time[0]), int(time[1]))) # datetime # HK todo what is approprate for 1d models?
-        data.append(float(obs[-1])) # obs error variance ?convert to sd?
+        data.append(int(time[0]))  # seconds
+        data.append(int(time[1]))  # days
+        if self.loc_mod == "loc3d":
+            data.append(convert_dart_time(int(time[0]), int(time[1])))
+        else:  # HK todo what is appropriate for 1d models?
+            data.append(
+                dt.datetime(2000, 1, 1)
+                + dt.timedelta(seconds=int(time[0]), days=int(time[1]))
+            )
+        data.append(float(obs[-1]))  # obs error variance ?convert to sd?
 
         return data
 
@@ -215,41 +268,49 @@ class obs_sequence:
         the first sublist contains the entire metadata list, and the second is empty.
         """
         for i, item in enumerate(metadata):
-            if item.startswith('external_FO'):
+            if item.startswith("external_FO"):
                 return metadata[:i], metadata[i:]
         return metadata, []
 
     def list_to_obs(self, data):
+        """convert a list of data to an observation
+
+        Assuming the order of the list is obs_seq.copie_names
+
+        """
         obs = []
-        obs.append('OBS ' + str(data[0])) # obs_num lots of space
-        obs.extend(data[1:self.n_copies+1]) # all the copies
-        obs.append(data[self.n_copies+1]) # linked list info
-        obs.append('obdef') # TODO HK: metadata obs_def
+        obs.append("OBS " + str(data[0]))  # obs_num lots of space
+        obs.extend(data[1 : self.n_copies + 1])  # all the copies
+        obs.append(data[self.n_copies + 1])  # linked list info
+        obs.append("obdef")  # TODO HK: extended_FO obs_def
         obs.append(self.loc_mod)
-        if self.loc_mod == 'loc3d':
-            obs.append(' '.join(map(str, data[self.n_copies+2:self.n_copies+5])) + ' ' + str(self.reversed_vert[data[self.n_copies+5]]) ) # location x, y, z, vert
-            obs.append('kind') # this is type of observation
+        if self.loc_mod == "loc3d":
+            obs.append(
+                " ".join(map(str, data[self.n_copies + 2 : self.n_copies + 5]))
+                + " "
+                + str(self.reversed_vert[data[self.n_copies + 5]])
+            )  # location x, y, z, vert
+            obs.append("kind")  # this is type of observation
             obs.append(self.reverse_types[data[self.n_copies + 6]]) # observation type
-            # Convert metadata to a string and append
+            # Convert metadata to a string and append !HK @todo you are not converting to string
             obs.extend(data[self.n_copies + 7]) # metadata
-        elif self.loc_mod == 'loc1d':
-            obs.append(data[self.n_copies+2]) # 1d location
-            obs.append('kind') # this is type of observation
+            obs.extend(data[self.n_copies + 8])  # external forward operator
+        elif self.loc_mod == "loc1d":
+            obs.append(data[self.n_copies + 2])  # 1d location
+            obs.append("kind")  # this is type of observation
             obs.append(self.reverse_types[data[self.n_copies + 3]]) # observation type
-            # Convert metadata to a string and append
-            metadata = ' '.join(map(str, data[self.n_copies + 4:-4]))
-            if metadata:
-                obs.append(metadata) # metadata
-        obs.append(' '.join(map(str, data[-4:-2]))) # seconds, days
+            obs.extend(data[self.n_copies + 4])  # metadata
+            obs.extend(data[self.n_copies + 5])  # external forward operator
+        obs.append(" ".join(map(str, data[-4:-2])))  # seconds, days
         obs.append(data[-1]) # obs error variance
 
         return obs
 
     @staticmethod
     def generate_linked_list_pattern(n):
-        """Create a list of strings with the linked list pattern for n lines."""
+        """Create a list of strings with the linked list pattern for n observations."""
         result = []
-        for i in range(n-1):
+        for i in range(n - 1):
             col1 = i if i > 0 else -1
             col2 = i + 2
             col3 = -1
257
318
  result.append(f"{n-1:<12}{'-1':<11}{'-1'}")
258
319
  return result
259
320
 
260
- def write_obs_seq(self, file, df=None):
321
+ def write_obs_seq(self, file):
261
322
  """
262
323
  Write the observation sequence to a file.
263
-
264
- This function writes the observation sequence to disk.
265
- If no DataFrame is provided, it writes the obs_sequence object to a file using the
266
- header and all observations stored in the object.
267
- If a DataFrame is provided,it creates a header and linked list from the DataFrame,
268
- then writes the DataFrame obs to an obs_sequence file. Note the DataFrame is assumed
269
- to have been created from obs_sequence object.
270
-
271
-
272
- Parameters:
324
+
325
+ This function writes the observation sequence stored in the obs_seq.DataFrame to a specified file.
326
+ It updates the header with the number of observations, converts coordinates back to radians
327
+ if necessary, drops unnecessary columns, sorts the DataFrame by time, and generates a linked
328
+ list pattern for reading by DART programs.
329
+
330
+ Args:
273
331
  file (str): The path to the file where the observation sequence will be written.
274
- df (pandas.DataFrame, optional): A DataFrame containing the observation data. If not provided, the function uses self.header and self.all_obs.
275
-
332
+
333
+ Notes:
334
+ - Longitude and latitude are converted back to radians if the location model is 'loc3d'.
335
+ - The 'bias' and 'sq_err' columns are dropped if they exist in the DataFrame.
336
+ - The DataFrame is sorted by the 'time' column.
337
+ - An 'obs_num' column is added to the DataFrame to number the observations in time order.
338
+ - A 'linked_list' column is generated to create a linked list pattern for the observations.
339
+
340
+ Example:
341
+ obsq.write_obs_seq('obs_seq.new')
342
+
343
+ """
344
+
345
+ self.create_header_from_dataframe()
346
+
347
+ with open(file, "w") as f:
348
+
349
+ for line in self.header:
350
+ f.write(str(line) + "\n")
351
+
352
+ # TODO HK is there something better than copying the whole thing here?
353
+ df_copy = self.df.copy() # copy since you want to change for writing.
354
+ # back to radians for obs_seq
355
+ if self.loc_mod == "loc3d":
356
+ df_copy["longitude"] = np.deg2rad(self.df["longitude"]).round(16)
357
+ df_copy["latitude"] = np.deg2rad(self.df["latitude"]).round(16)
358
+ if "prior_bias" in df_copy.columns:
359
+ df_copy = df_copy.drop(
360
+ columns=["prior_bias", "prior_sq_err", "prior_totalvar"]
361
+ )
362
+ if "posterior_bias" in df_copy.columns:
363
+ df_copy = df_copy.drop(
364
+ columns=["posterior_bias", "posterior_sq_err", "posterior_totalvar"]
365
+ )
366
+ if "midpoint" in df_copy.columns:
367
+ df_copy = df_copy.drop(columns=["midpoint", "vlevels"])
368
+
369
+ # linked list for reading by dart programs
370
+ df_copy = df_copy.sort_values(
371
+ by=["time"], kind="stable"
372
+ ) # sort the DataFrame by time
373
+ df_copy.reset_index(drop=True, inplace=True)
374
+ df_copy["obs_num"] = df_copy.index + 1 # obs_num in time order
375
+ df_copy["linked_list"] = obs_sequence.generate_linked_list_pattern(
376
+ len(df_copy)
377
+ ) # linked list pattern
378
+
379
+ def write_row(row):
380
+ ob_write = self.list_to_obs(row.tolist())
381
+ for line in ob_write:
382
+ f.write(str(line) + "\n")
383
+
384
+ df_copy.apply(write_row, axis=1)
385
+
386
+ @staticmethod
387
+ def update_types_dicts(df, reverse_types):
388
+ """
389
+ Ensure all unique observation types are in the reverse_types dictionary and create
390
+ the types dictionary.
391
+
392
+ Args:
393
+ df (pd.DataFrame): The DataFrame containing the observation sequence data.
394
+ reverse_types (dict): The dictionary mapping observation types to their corresponding integer values.
395
+
276
396
  Returns:
277
- None
278
-
279
- Examples:
280
- ``obs_seq.write_obs_seq('/path/to/output/file')``
281
- ``obs_seq.write_obs_seq('/path/to/output/file', df=obs_seq.df)``
397
+ dict: The updated reverse_types dictionary.
398
+ dict: The types dictionary with keys sorted in numerical order.
282
399
  """
283
- with open(file, 'w') as f:
284
-
285
- if df is not None:
286
- # If a DataFrame is provided, update the header with the number of observations
287
- num_rows = len(df)
288
- replacement_string = f'num_obs: {num_rows:>10} max_num_obs: {num_rows:>10}'
289
- new_header = [replacement_string if 'num_obs' in element else element for element in self.header]
290
-
291
- for line in new_header[:-1]:
292
- f.write(str(line) + '\n')
293
- first = 1
294
- f.write(f"first: {first:>12} last: {num_rows:>12}\n")
295
-
296
- # TODO HK is there something better than copying the whole thing here?
297
- df_copy = df.copy() # copy since you want to change for writing.
298
- # back to radians for obs_seq
299
- if self.loc_mod == 'loc3d':
300
- df_copy['longitude'] = np.deg2rad(self.df['longitude'])
301
- df_copy['latitude'] = np.deg2rad(self.df['latitude'])
302
- if 'bias' in df_copy.columns:
303
- df_copy = df_copy.drop(columns=['bias', 'sq_err'])
304
-
305
- # linked list for reading by dart programs
306
- df_copy = df_copy.sort_values(by=['time']) # sort the DataFrame by time
307
- df_copy['obs_num'] = df.index + 1 # obs_num in time order
308
- df_copy['linked_list'] = obs_sequence.generate_linked_list_pattern(len(df_copy)) # linked list pattern
309
-
310
- def write_row(row):
311
- ob_write = self.list_to_obs(row.tolist())
312
- for line in ob_write:
313
- f.write(str(line) + '\n')
314
-
315
- df_copy.apply(write_row, axis=1)
316
-
317
- else:
318
- # If no DataFrame is provided, use self.header and self.all_obs
319
- for line in self.header:
320
- f.write(str(line) + '\n')
321
- for obs in self.all_obs:
322
- ob_write = self.list_to_obs(obs)
323
- for line in ob_write:
324
- f.write(str(line) + '\n')
400
+ # Create a dictionary of observation types from the dataframe
401
+ unique_types = df["type"].unique()
325
402
 
403
+ # Ensure all unique types are in reverse_types
404
+ for obs_type in unique_types:
405
+ if obs_type not in reverse_types:
406
+ new_id = int(max(reverse_types.values(), default=0)) + 1
407
+ reverse_types[obs_type] = str(new_id)
408
+
409
+ not_sorted_types = {
410
+ reverse_types[obs_type]: obs_type for obs_type in unique_types
411
+ }
412
+ types = {
413
+ k: not_sorted_types[k] for k in sorted(not_sorted_types)
414
+ } # to get keys in numerical order
415
+
416
+ return reverse_types, types
417
+
418
+ def create_header_from_dataframe(self):
419
+ """
420
+ Create a header for the observation sequence based on the data in the DataFrame.
421
+
422
+ It creates a dictionary of unique observation types, counts the
423
+ number of observations, and constructs the header with necessary information.
424
+
425
+ Example:
426
+ self.create_header_from_dataframe()
427
+
428
+ """
429
+
430
+ self.reverse_types, self.types = self.update_types_dicts(
431
+ self.df, self.reverse_types
432
+ )
433
+
434
+ num_obs = len(self.df)
435
+
436
+ self.header = []
437
+ self.header.append("obs_sequence")
438
+ self.header.append("obs_type_definitions")
439
+ self.header.append(f"{len(self.types)}")
440
+ for key, value in self.types.items():
441
+ self.header.append(f"{key} {value}")
442
+ self.header.append(
443
+ f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}"
444
+ ) # @todo HK not keeping track if num_qc changes
445
+ self.header.append(f"num_obs: {num_obs:>10} max_num_obs: {num_obs:>10}")
446
+ stats_cols = [
447
+ "prior_bias",
448
+ "prior_sq_err",
449
+ "prior_totalvar",
450
+ "posterior_bias",
451
+ "posterior_sq_err",
452
+ "posterior_totalvar",
453
+ ]
454
+ level_cols = ["vlevels", "midpoint"]
455
+ non_copie_cols = [
456
+ "obs_num",
457
+ "linked_list",
458
+ "longitude",
459
+ "latitude",
460
+ "vertical",
461
+ "vert_unit",
462
+ "type",
463
+ "metadata",
464
+ "external_FO",
465
+ "seconds",
466
+ "days",
467
+ "time",
468
+ "obs_err_var",
469
+ "location",
470
+ ]
471
+ for copie in self.df.columns:
472
+ if copie not in stats_cols + non_copie_cols + level_cols:
473
+ self.header.append(copie.replace("_", " "))
474
+ first = 1
475
+ self.header.append(f"first: {first:>12} last: {num_obs:>12}")
326
476
 
327
477
  def column_headers(self):
328
- """define the columns for the dataframe """
478
+ """define the columns for the dataframe"""
329
479
  heading = []
330
- heading.append('obs_num')
480
+ heading.append("obs_num")
331
481
  heading.extend(self.copie_names)
332
- heading.append('linked_list')
333
- if self.loc_mod == 'loc3d':
334
- heading.append('longitude')
335
- heading.append('latitude')
336
- heading.append('vertical')
337
- heading.append('vert_unit')
338
- elif self.loc_mod == 'loc1d':
339
- heading.append('location')
340
- heading.append('type')
341
- heading.append('metadata')
342
- heading.append('external_FO')
343
- heading.append('seconds')
344
- heading.append('days')
345
- heading.append('time')
346
- heading.append('obs_err_var')
482
+ heading.append("linked_list")
483
+ if self.loc_mod == "loc3d":
484
+ heading.append("longitude")
485
+ heading.append("latitude")
486
+ heading.append("vertical")
487
+ heading.append("vert_unit")
488
+ elif self.loc_mod == "loc1d":
489
+ heading.append("location")
490
+ heading.append("type")
491
+ heading.append("metadata")
492
+ heading.append("external_FO")
493
+ heading.append("seconds")
494
+ heading.append("days")
495
+ heading.append("time")
496
+ heading.append("obs_err_var")
347
497
  return heading
348
498
 
349
- @requires_assimilation_info
499
+ @requires_assimilation_info
350
500
  def select_by_dart_qc(self, dart_qc):
351
501
  """
352
502
  Selects rows from a DataFrame based on the DART quality control flag.
353
503
 
354
- Parameters:
504
+ Args:
355
505
  df (DataFrame): A pandas DataFrame.
356
506
  dart_qc (int): The DART quality control flag to select.
357
507
 
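Since `write_obs_seq` no longer accepts a DataFrame argument, it always rebuilds the header from `self.df` and writes that. A sketch of the resulting round trip, assuming the import path `pydartdiags.obs_sequence.obs_sequence` and placeholder file names:

```python
from pydartdiags.obs_sequence import obs_sequence as obsq  # assumed import path

obs_seq = obsq.obs_sequence(file="obs_seq.final")  # ASCII or binary input
# edit the DataFrame in place, e.g. keep a single observation type
obs_seq.df = obs_seq.df[obs_seq.df["type"] == "ACARS_TEMPERATURE"]
# header, obs_num and linked_list are regenerated from the DataFrame
obs_seq.write_obs_seq("obs_seq.new")
```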
@@ -361,20 +511,26 @@ class obs_sequence:
         Raises:
             ValueError: If the DART quality control flag is not present in the DataFrame.
         """
-        if dart_qc not in self.df['DART_quality_control'].unique():
-            raise ValueError(f"DART quality control flag '{dart_qc}' not found in DataFrame.")
+        if dart_qc not in self.df["DART_quality_control"].unique():
+            raise ValueError(
+                f"DART quality control flag '{dart_qc}' not found in DataFrame."
+            )
         else:
-            return self.df[self.df['DART_quality_control'] == dart_qc]
+            return self.df[self.df["DART_quality_control"] == dart_qc]
 
     @requires_assimilation_info
-    def select_failed_qcs(self):
+    def select_used_qcs(self):
         """
-        Select rows from the DataFrame where the DART quality control flag is greater than 0.
+        Select rows from the DataFrame where the observation was used.
+        Includes observations for which the posterior forward observation operators failed.
 
         Returns:
-            pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag greater than 0.
+            pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag 0 or 2.
         """
-        return self.df[self.df['DART_quality_control'] > 0]
+        return self.df[
+            (self.df["DART_quality_control"] == 0)
+            | (self.df["DART_quality_control"] == 2)
+        ]
 
     @requires_assimilation_info
     def possible_vs_used(self):
@@ -383,7 +539,7 @@ class obs_sequence:
 
         This function takes a DataFrame containing observation data, including a 'type' column for the observation
         type and an 'observation' column. The number of used observations ('used'), is the total number
-        minus the observations that failed quality control checks (as determined by the `select_failed_qcs` function).
+        of assimilated observations (as determined by the `select_used_qcs` function).
         The result is a DataFrame with each observation type, the count of possible observations, and the count of
         used observations.
 
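`possible_vs_used` now counts DART QC flags 0 and 2 as used (2 meaning the observation was assimilated but the posterior forward operator failed). A toy illustration of the bookkeeping, with made-up type names:

```python
import pandas as pd

df = pd.DataFrame({
    "type": ["T", "T", "T", "U"],
    "observation": [1.0, 2.0, 3.0, 4.0],
    "DART_quality_control": [0, 2, 7, 0],
})
possible = df.groupby("type")["observation"].count().rename("possible")
used = (
    df[df["DART_quality_control"].isin([0, 2])]
    .groupby("type")["observation"].count()
    .reindex(possible.index, fill_value=0)
    .rename("used")
)
print(pd.concat([possible, used], axis=1).reset_index())
#   type  possible  used
# 0    T         3     2
# 1    U         1     1
```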
@@ -392,31 +548,29 @@ class obs_sequence:
         'possible' is the count of all observations of that type, and 'used' is the count of observations of that type
         that passed quality control checks.
         """
-        possible = self.df.groupby('type')['observation'].count()
-        possible.rename('possible', inplace=True)
-
-        failed_qcs = self.select_failed_qcs().groupby('type')['observation'].count()
-        used = possible - failed_qcs.reindex(possible.index, fill_value=0)
-        used.rename('used', inplace=True)
-
-        return pd.concat([possible, used], axis=1).reset_index()
+        possible = self.df.groupby("type")["observation"].count()
+        possible.rename("possible", inplace=True)
+
+        used_qcs = self.select_used_qcs().groupby("type")["observation"].count()
+        used = used_qcs.reindex(possible.index, fill_value=0)
+        used.rename("used", inplace=True)
 
+        return pd.concat([possible, used], axis=1).reset_index()
 
     @staticmethod
     def is_binary(file):
         """Check if a file is binary file."""
-        with open(file, 'rb') as f:
+        with open(file, "rb") as f:
             chunk = f.read(1024)
-            if b'\0' in chunk:
+            if b"\0" in chunk:
                 return True
         return False
 
-
     @staticmethod
     def read_header(file):
         """Read the header and number of lines in the header of an ascii obs_seq file"""
         header = []
-        with open(file, 'r') as f:
+        with open(file, "r") as f:
             for line in f:
                 if "first:" in line and "last:" in line:
                     header.append(line.strip())
@@ -432,19 +586,19 @@ class obs_sequence:
         linecount = 0
         obs_types_definitions = -1000
         num_obs = 0
-        max_num_obs = 0
+        max_num_obs = 0
         # need to get:
         # number of obs_type_definitions
         # number of copies
         # number of qcs
-        with open(file, 'rb') as f:
+        with open(file, "rb") as f:
             while True:
                 # Read the record length
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
                 record = f.read(record_length)
-                if not record: # end of file
+                if not record:  # end of file
                     break
 
                 # Read the trailing record length (should match the leading one)
@@ -452,17 +606,19 @@ class obs_sequence:
 
                 linecount += 1
 
-                if linecount == 3:
-                    obs_types_definitions = struct.unpack('i', record)[0]
-                    continue
+                if linecount == 3:
+                    obs_types_definitions = struct.unpack("i", record)[0]
+                    continue
 
-                if linecount == 4+obs_types_definitions:
-                    num_copies, num_qcs, num_obs, max_num_obs = struct.unpack('iiii', record)[:16]
+                if linecount == 4 + obs_types_definitions:
+                    num_copies, num_qcs, num_obs, max_num_obs = struct.unpack(
+                        "iiii", record
+                    )[:16]
                     break
-
+
             # Go back to the beginning of the file
             f.seek(0)
-
+
             for _ in range(2):
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
@@ -472,14 +628,14 @@ class obs_sequence:
                 if not record: # end of file
                     break
 
-                obs_sequence.check_trailing_record_length(f, record_length)
-                header.append(record.decode('utf-8').strip())
+                obs_sequence.check_trailing_record_length(f, record_length)
+                header.append(record.decode("utf-8").strip())
 
             header.append(str(obs_types_definitions))
 
             # obs_types_definitions
-            for _ in range(3,4+obs_types_definitions):
-                # Read the record length
+            for _ in range(3, 4 + obs_types_definitions):
+                # Read the record length
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
@@ -489,21 +645,24 @@ class obs_sequence:
                 if not record: # end of file
                     break
 
-                obs_sequence.check_trailing_record_length(f, record_length)
+                obs_sequence.check_trailing_record_length(f, record_length)
 
                 if _ == 3:
-                    continue # num obs_types_definitions
+                    continue  # num obs_types_definitions
                 # Read an integer and a string from the record
-                integer_value = struct.unpack('i', record[:4])[0]
-                string_value = record[4:].decode('utf-8').strip()
+                integer_value = struct.unpack("i", record[:4])[0]
+                string_value = record[4:].decode("utf-8").strip()
                 header.append(f"{integer_value} {string_value}")
 
             header.append(f"num_copies: {num_copies} num_qc: {num_qcs}")
             header.append(f"num_obs: {num_obs} max_num_obs: {max_num_obs}")
-
-            #copie names
-            for _ in range(5+obs_types_definitions, 5+obs_types_definitions+num_copies+num_qcs+1):
-                # Read the record length
+
+            # copie names
+            for _ in range(
+                5 + obs_types_definitions,
+                5 + obs_types_definitions + num_copies + num_qcs + 1,
+            ):
+                # Read the record length
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
@@ -513,26 +672,26 @@ class obs_sequence:
                 if not record:
                     break
 
-                obs_sequence.check_trailing_record_length(f, record_length)
+                obs_sequence.check_trailing_record_length(f, record_length)
 
-                if _ == 5+obs_types_definitions:
+                if _ == 5 + obs_types_definitions:
                     continue
 
                 # Read the whole record as a string
-                string_value = record.decode('utf-8').strip()
+                string_value = record.decode("utf-8").strip()
                 header.append(string_value)
 
             # first and last obs
-            # Read the record length
+            # Read the record length
             record_length = obs_sequence.read_record_length(f)
 
             # Read the actual record
             record = f.read(record_length)
-
-            obs_sequence.check_trailing_record_length(f, record_length)
+
+            obs_sequence.check_trailing_record_length(f, record_length)
 
             # Read the whole record as a two integers
-            first, last = struct.unpack('ii', record)[:8]
+            first, last = struct.unpack("ii", record)[:8]
             header.append(f"first: {first} last: {last}")
 
         return header
@@ -541,7 +700,7 @@ class obs_sequence:
     def collect_obs_types(header):
         """Create a dictionary for the observation types in the obs_seq header"""
         num_obs_types = int(header[2])
-        types = dict([x.split() for x in header[3:num_obs_types+3]])
+        types = dict([x.split() for x in header[3 : num_obs_types + 3]])
         return types
 
     @staticmethod
@@ -549,32 +708,45 @@ class obs_sequence:
         """
         Extracts the names of the copies from the header of an obs_seq file.
 
-        Parameters:
+        Args:
             header (list): A list of strings representing the lines in the header of the obs_seq file.
 
         Returns:
-            tuple: A tuple containing two elements:
-                - copie_names (list): A list of strings representing the copy names with underscores for spaces.
+            tuple: A tuple containing two elements:
+                - copie_names (list): A list of strings representing the copy names with underscores for spaces.
                 - len(copie_names) (int): The number of copy names.
         """
         for i, line in enumerate(header):
             if "num_obs:" in line and "max_num_obs:" in line:
-                first_copie = i+1
+                first_copie = i + 1
                 break
-        copie_names = ['_'.join(x.split()) for x in header[first_copie:-1]] # first and last is last line of header
+        copie_names = [
+            "_".join(x.split()) for x in header[first_copie:-1]
+        ]  # first and last is last line of header
         return copie_names, len(copie_names)
 
+    @staticmethod
+    def num_qc_non_qc(header):
+        """Find the number of qc and non-qc copies in the header"""
+        for line in header:
+            if "num_copies:" in line and "num_qc:" in line:
+                num_non_qc = int(line.split()[1])
+                num_qc = int(line.split()[3])
+                return num_non_qc, num_qc
+
     @staticmethod
     def obs_reader(file, n):
         """Reads the ascii obs sequence file and returns a generator of the obs"""
-        previous_line = ''
-        with open(file, 'r') as f:
+        previous_line = ""
+        with open(file, "r") as f:
             for line in f:
                 if "OBS" in line or "OBS" in previous_line:
                     if "OBS" in line:
                         obs = []
-                        obs.append(line.strip())
-                        for i in range(n+100): # number of copies + 100. Needs to be bigger than any metadata
+                        obs.append(line.strip())
+                        for i in range(
+                            n + 100
+                        ):  # number of copies + 100. Needs to be bigger than any metadata
                             try:
                                 next_line = next(f)
                             except:
@@ -587,11 +759,15 @@ class obs_sequence:
                             else:
                                 obs.append(next_line.strip())
                         yield obs
-                elif "OBS" in previous_line: # previous line is because I cannot use f.tell with next
+                elif (
+                    "OBS" in previous_line
+                ):  # previous line is because I cannot use f.tell with next
                     obs = []
-                    obs.append(previous_line.strip())
-                    obs.append(line.strip())
-                    for i in range(n+100): # number of copies + 100. Needs to be bigger than any metadata
+                    obs.append(previous_line.strip())
+                    obs.append(line.strip())
+                    for i in range(
+                        n + 100
+                    ):  # number of copies + 100. Needs to be bigger than any metadata
                         try:
                             next_line = next(f)
                         except:
@@ -608,19 +784,19 @@ class obs_sequence:
 
     @staticmethod
     def check_trailing_record_length(file, expected_length):
-        """Reads and checks the trailing record length from the binary file written by Fortran.
+        """Reads and checks the trailing record length from the binary file written by Fortran.
 
-        Parameters:
-            file (file): The file object.
-            expected_length (int): The expected length of the trailing record.
+        Args:
+            file (file): The file object.
+            expected_length (int): The expected length of the trailing record.
 
-        Assuming 4 bytes:
-        | Record Length (4 bytes) | Data (N bytes) | Trailing Record Length (4 bytes) |
-        """
-        trailing_record_length_bytes = file.read(4)
-        trailing_record_length = struct.unpack('i', trailing_record_length_bytes)[0]
-        if expected_length != trailing_record_length:
-            raise ValueError("Record length mismatch in Fortran binary file")
+        Assuming 4 bytes:
+        | Record Length (4 bytes) | Data (N bytes) | Trailing Record Length (4 bytes) |
+        """
+        trailing_record_length_bytes = file.read(4)
+        trailing_record_length = struct.unpack("i", trailing_record_length_bytes)[0]
+        if expected_length != trailing_record_length:
+            raise ValueError("Record length mismatch in Fortran binary file")
 
     @staticmethod
     def read_record_length(file):
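The binary readers above all walk Fortran unformatted sequential records, where each record is framed by its byte length on both sides. A standalone sketch of reading one such record, assuming native-endian 4-byte lengths as in the code above:

```python
import struct

def read_fortran_record(f):
    """Read one record framed as | length (4 bytes) | data | trailing length (4 bytes) |."""
    head = f.read(4)
    if not head:
        return None  # end of file
    (length,) = struct.unpack("i", head)
    data = f.read(length)
    (tail,) = struct.unpack("i", f.read(4))
    if tail != length:
        raise ValueError("Record length mismatch in Fortran binary file")
    return data

# a single copie is then one float64: struct.unpack("d", record)[0]
```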
@@ -628,18 +804,17 @@ class obs_sequence:
         record_length_bytes = file.read(4)
         if not record_length_bytes:
             return None # End of file
-        return struct.unpack('i', record_length_bytes)[0]
-
+        return struct.unpack("i", record_length_bytes)[0]
 
     def obs_binary_reader(self, file, n):
         """Reads the obs sequence binary file and returns a generator of the obs"""
         header_length = len(self.header)
-        with open(file, 'rb') as f:
+        with open(file, "rb") as f:
             # Skip the first len(obs_seq.header) lines
-            for _ in range(header_length-1):
+            for _ in range(header_length - 1):
                 # Read the record length
                 record_length = obs_sequence.read_record_length(f)
-                if record_length is None: # End of file
+                if record_length is None:  # End of file
                     break
 
                 # Skip the actual record
@@ -652,79 +827,80 @@ class obs_sequence:
             while True:
                 obs = []
                 obs_num += 1
-                obs.append(f"OBS {obs_num}")
-                for _ in range(n): # number of copies
+                obs.append(f"OBS {obs_num}")
+                for _ in range(n):  # number of copies
                     # Read the record length
                     record_length = obs_sequence.read_record_length(f)
                     if record_length is None:
                         break
                     # Read the actual record (copie)
                     record = f.read(record_length)
-                    obs.append(struct.unpack('d', record)[0])
+                    obs.append(struct.unpack("d", record)[0])
 
                     # Read the trailing record length (should match the leading one)
                     obs_sequence.check_trailing_record_length(f, record_length)
-
+
                 # linked list info
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
 
                 record = f.read(record_length)
-                int1, int2, int3 = struct.unpack('iii', record[:12])
+                int1, int2, int3 = struct.unpack("iii", record[:12])
                 linked_list_string = f"{int1:<12} {int2:<10} {int3:<12}"
                 obs.append(linked_list_string)
 
                 obs_sequence.check_trailing_record_length(f, record_length)
 
                 # location (note no location header "loc3d" or "loc1d" for binary files)
-                obs.append('loc3d')
+                obs.append("loc3d")
                 record_length = obs_sequence.read_record_length(f)
                 record = f.read(record_length)
-                x,y,z,vert = struct.unpack('dddi', record[:28])
-                location_string = f"{x} {y} {z} {vert}"
-                obs.append(location_string)
+                x, y, z, vert = struct.unpack("dddi", record[:28])
+                location_string = f"{x} {y} {z} {vert}"
+                obs.append(location_string)
 
                 obs_sequence.check_trailing_record_length(f, record_length)
-
+
                 # kind (type of observation) value
-                obs.append('kind')
+                obs.append("kind")
                 record_length_bytes = f.read(4)
-                record_length = struct.unpack('i', record_length_bytes)[0]
+                record_length = struct.unpack("i", record_length_bytes)[0]
                 record = f.read(record_length)
                 kind = f"{struct.unpack('i', record)[0]}"
                 obs.append(kind)
-
+
                 obs_sequence.check_trailing_record_length(f, record_length)
 
                 # time (seconds, days)
                 record_length = obs_sequence.read_record_length(f)
                 record = f.read(record_length)
-                seconds, days = struct.unpack('ii', record)[:8]
+                seconds, days = struct.unpack("ii", record)[:8]
                 time_string = f"{seconds} {days}"
                 obs.append(time_string)
 
                 obs_sequence.check_trailing_record_length(f, record_length)
-
+
                 # obs error variance
                 record_length = obs_sequence.read_record_length(f)
                 record = f.read(record_length)
-                obs.append(struct.unpack('d', record)[0])
-
+                obs.append(struct.unpack("d", record)[0])
+
                 obs_sequence.check_trailing_record_length(f, record_length)
 
                 yield obs
 
-    def composite_types(self, composite_types='use_default'):
+    def composite_types(self, composite_types="use_default"):
         """
         Set up and construct composite types for the DataFrame.
 
-        This function sets up composite types based on a provided YAML configuration or
-        a default configuration. It constructs new composite rows by combining specified
+        This function sets up composite types based on a provided YAML configuration or
+        a default configuration. It constructs new composite rows by combining specified
         components and adds them to the DataFrame.
 
-        Parameters:
-            composite_types (str, optional): The YAML configuration for composite types. If 'use_default', the default configuration is used. Otherwise, a custom YAML configuration can be provided.
+        Args:
+            composite_types (str, optional): The YAML configuration for composite types.
+                If 'use_default', the default configuration is used. Otherwise, a custom YAML configuration can be provided.
 
         Returns:
             pd.DataFrame: The updated DataFrame with the new composite rows added.
@@ -733,12 +909,12 @@ class obs_sequence:
             Exception: If there are repeat values in the components.
         """
 
-        if composite_types == 'use_default':
+        if composite_types == "use_default":
             composite_yaml = self.default_composite_types
         else:
             composite_yaml = composite_types
-        self.composite_types_dict = load_yaml_to_dict(composite_yaml)
-
+        self.composite_types_dict = load_yaml_to_dict(composite_yaml)
+
         components = []
         for value in self.composite_types_dict.values():
             components.extend(value["components"])
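`composite_types` only reads the `components` list of each entry in the YAML mapping returned by `load_yaml_to_dict`. A hypothetical equivalent of that mapping as a Python dict (the composite and component names below are illustrative, not the shipped defaults in composite_types.yaml):

```python
# hypothetical composite-types configuration, shown as the dict the code consumes
composite_types_dict = {
    "WIND_SPEED": {
        "components": ["U_WIND_COMPONENT", "V_WIND_COMPONENT"],
    },
}
```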
@@ -746,31 +922,243 @@ class obs_sequence:
         if len(components) != len(set(components)):
             raise Exception("There are repeat values in components.")
 
-        df_comp = self.df[self.df['type'].str.upper().isin([component.upper() for component in components])]
-        df_no_comp = self.df[~self.df['type'].str.upper().isin([component.upper() for component in components])]
+        # data frame for the composite types
+        df_comp = self.df[
+            self.df["type"]
+            .str.upper()
+            .isin([component.upper() for component in components])
+        ]
 
+        df = pd.DataFrame()
         for key in self.composite_types_dict:
-            df_new = construct_composit(df_comp, key, self.composite_types_dict[key]['components'])
-            df_no_comp = pd.concat([df_no_comp, df_new], axis=0)
+            df_new = construct_composit(
+                df_comp, key, self.composite_types_dict[key]["components"]
+            )
+            df = pd.concat([df, df_new], axis=0)
+
+        # add the composite types to the DataFrame
+        self.df = pd.concat([self.df, df], axis=0)
+        return
+
+    @classmethod
+    def join(cls, obs_sequences, copies=None):
+        """
+        Join a list of observation sequences together.
+
+        This method combines the headers and observations from a list of obs_sequence objects
+        into a single obs_sequence object.
+
+        Args:
+            obs_sequences (list of obs_sequences): The list of observation sequences objects to join.
+            copies (list of str, optional): A list of copy names to include in the combined data.
+                If not provided, all copies are included.
+
+        Returns:
+            A new obs_sequence object containing the combined data.
+
+        Example:
+            .. code-block:: python
+
+                obs_seq1 = obs_sequence(file='obs_seq1.final')
+                obs_seq2 = obs_sequence(file='obs_seq2.final')
+                obs_seq3 = obs_sequence(file='obs_seq3.final')
+                combined = obs_sequence.join([obs_seq1, obs_seq2, obs_seq3])
+        """
+        if not obs_sequences:
+            raise ValueError("The list of observation sequences is empty.")
+
+        # Create a new obs_sequence object with the combined data
+        combo = cls(file=None)
+
+        # Check if all obs_sequences have compatible attributes
+        first_loc_mod = obs_sequences[0].loc_mod
+        first_has_assimilation_info = obs_sequences[0].has_assimilation_info()
+        first_has_posterior = obs_sequences[0].has_posterior()
+        for obs_seq in obs_sequences:
+            if obs_seq.loc_mod != first_loc_mod:
+                raise ValueError(
+                    "All observation sequences must have the same loc_mod."
+                )
+            if obs_seq.has_assimilation_info() != first_has_assimilation_info:
+                raise ValueError(
+                    "All observation sequences must have assimilation info."
+                )
+            if obs_seq.has_posterior() != first_has_posterior:
+                raise ValueError(
+                    "All observation sequences must have the posterior info."
+                )
+        # HK @todo prior only
+        combo.loc_mod = first_loc_mod
+
+        # check the copies are compatible (list of copies to combine?)
+        # subset of copies if needed  # @todo HK 1d or 3d
+        if copies:
+            start_required_columns = ["obs_num", "observation"]
+            end_required_columns = [
+                "linked_list",
+                "longitude",
+                "latitude",
+                "vertical",
+                "vert_unit",
+                "type",
+                "metadata",
+                "external_FO",
+                "seconds",
+                "days",
+                "time",
+                "obs_err_var",
+            ]
+            required_columns = start_required_columns + end_required_columns
+
+            requested_columns = (
+                start_required_columns
+                + [item for item in copies if item not in required_columns]
+                + end_required_columns
+            )
+
+            for obs_seq in obs_sequences:
+                if not set(requested_columns).issubset(obs_seq.df.columns):
+                    raise ValueError(
+                        "All observation sequences must have the selected copies."
+                    )
+
+            # go through columns and create header
+            remove_list = [
+                "obs_num",
+                "linked_list",
+                "latitude",
+                "longitude",
+                "vertical",
+                "vert_unit",
+                "type",
+                "metadata",
+                "external_FO",
+                "time",
+                "seconds",
+                "days",
+                "obs_err_var",
+            ]
+            # using lists to retain copy order, non_qcs followed by qcs
+            combo.copie_names = [
+                item for item in requested_columns if item not in remove_list
+            ]
+            combo.non_qc_copie_names = [
+                item
+                for item in combo.copie_names
+                if item in obs_sequences[0].non_qc_copie_names
+            ]
+            combo.qc_copie_names = [
+                item
+                for item in combo.copie_names
+                if item in obs_sequences[0].qc_copie_names
+            ]
+
+            combo.n_copies = len(combo.copie_names)
+            combo.n_qc = len(combo.qc_copie_names)
+            combo.n_non_qc = len(combo.non_qc_copie_names)
+
+        else:
+            for obs_seq in obs_sequences:
+                if not obs_sequences[0].df.columns.isin(obs_seq.df.columns).all():
+                    raise ValueError(
+                        "All observation sequences must have the same copies."
+                    )
+            combo.n_copies = obs_sequences[0].n_copies
+            combo.n_qc = obs_sequences[0].n_qc
+            combo.n_non_qc = obs_sequences[0].n_non_qc
+            combo.copie_names = obs_sequences[0].copie_names
+
+        # todo HK @todo combine synonyms for obs?
+
+        # Initialize combined data
+        combined_types = []
+        combined_df = pd.DataFrame()
+        combo.all_obs = None  # set to none to force writing from the dataframe if write_obs_seq is called
+
+        # Iterate over the list of observation sequences and combine their data
+        for obs_seq in obs_sequences:
+            if copies:
+                combined_df = pd.concat(
+                    [combined_df, obs_seq.df[requested_columns]], ignore_index=True
+                )
+            else:
+                combined_df = pd.concat([combined_df, obs_seq.df], ignore_index=True)
+            combined_types.extend(list(obs_seq.reverse_types.keys()))
+
+        # create dictionary of types
+        keys = set(combined_types)
+        combo.reverse_types = {item: i + 1 for i, item in enumerate(keys)}
+        combo.types = {v: k for k, v in combo.reverse_types.items()}
+
+        # create linked list for obs
+        combo.df = combined_df.sort_values(by="time").reset_index(drop=True)
+        combo.df["linked_list"] = obs_sequence.generate_linked_list_pattern(
+            len(combo.df)
+        )
+        combo.df["obs_num"] = combined_df.index + 1
+        combo.create_header(len(combo.df))
+
+        return combo
+
+    def has_assimilation_info(self):
+        """
+        Check if the DataFrame has prior information.
+
+        Returns:
+            bool: True if both 'prior_ensemble_mean' and 'prior_ensemble_spread' columns are present, False otherwise.
+        """
+        return "prior_ensemble_mean".casefold() in map(
+            str.casefold, self.df.columns
+        ) and "prior_ensemble_spread".casefold() in map(str.casefold, self.df.columns)
+
+    def has_posterior(self):
+        """
+        Check if the DataFrame has posterior information.
+
+        Returns:
+            bool: True if both 'posterior_ensemble_mean' and 'posterior_ensemble_spread' columns are present, False otherwise.
+        """
+        return "posterior_ensemble_mean".casefold() in map(
+            str.casefold, self.df.columns
+        ) and "posterior_ensemble_spread".casefold() in map(
+            str.casefold, self.df.columns
+        )
+
+    def create_header(self, n):
+        """Create a header for the obs_seq file from the obs_sequence object."""
+        assert (
+            self.n_copies == self.n_non_qc + self.n_qc
+        ), "n_copies must be equal to n_non_qc + n_qc"
+
+        self.header = []
+        self.header.append(f"obs_sequence")
+        self.header.append("obs_type_definitions")
+        self.header.append(f"{len(self.types)}")
+        for key, value in self.types.items():
+            self.header.append(f"{key} {value}")
+        self.header.append(f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}")
+        self.header.append(f"num_obs: {n} max_num_obs: {n}")
+        for copie in self.copie_names:
+            self.header.append(copie)
+        self.header.append(f"first: 1 last: {n}")
+
 
-        return df_no_comp
-
 def load_yaml_to_dict(file_path):
     """
     Load a YAML file and convert it to a dictionary.
 
-    Parameters:
+    Args:
         file_path (str): The path to the YAML file.
 
     Returns:
         dict: The YAML file content as a dictionary.
     """
     try:
-        with open(file_path, 'r') as file:
+        with open(file_path, "r") as file:
             return yaml.safe_load(file)
     except Exception as e:
         print(f"Error loading YAML file: {e}")
-        return None
+        raise
 
 
 def convert_dart_time(seconds, days):
@@ -780,46 +1168,71 @@ def convert_dart_time(seconds, days):
     - base year for Gregorian calendar is 1601
     - dart time is seconds, days since 1601
     """
-    time = dt.datetime(1601,1,1) + dt.timedelta(days=days, seconds=seconds)
+    time = dt.datetime(1601, 1, 1) + dt.timedelta(days=days, seconds=seconds)
     return time
 
+
 def construct_composit(df_comp, composite, components):
     """
     Construct a composite DataFrame by combining rows from two components.
 
     This function takes two DataFrames and combines rows from them based on matching
-    location and time. It creates a new row with a composite type by combining
+    location and time. It creates a new row with a composite type by combining
     specified columns using the square root of the sum of squares method.
 
-    Parameters:
+    Args:
         df_comp (pd.DataFrame): The DataFrame containing the component rows to be combined.
         composite (str): The type name for the new composite rows.
         components (list of str): A list containing the type names of the two components to be combined.
 
     Returns:
-        merged_df (pd.DataFrame): The updated DataFrame with the new composite rows added.
+        merged_df (pd.DataFrame): A DataFrame containing the new composite rows.
     """
-    selected_rows = df_comp[df_comp['type'] == components[0].upper()]
-    selected_rows_v = df_comp[df_comp['type'] == components[1].upper()]
-
-    columns_to_combine = df_comp.filter(regex='ensemble').columns.tolist()
-    columns_to_combine.append('observation') # TODO HK: bias, sq_err, obs_err_var
-    merge_columns = ['latitude', 'longitude', 'vertical', 'time']
-
-    print("duplicates in u: ", selected_rows[merge_columns].duplicated().sum())
-    print("duplicates in v: ",selected_rows_v[merge_columns].duplicated().sum())
+    selected_rows = df_comp[df_comp["type"] == components[0].upper()]
+    selected_rows_v = df_comp[df_comp["type"] == components[1].upper()]
+
+    prior_columns_to_combine = df_comp.filter(regex="prior_ensemble").columns.tolist()
+    posterior_columns_to_combine = df_comp.filter(
+        regex="posterior_ensemble"
+    ).columns.tolist()
+    columns_to_combine = (
+        prior_columns_to_combine
+        + posterior_columns_to_combine
+        + ["observation", "obs_err_var"]
+    )
+    merge_columns = ["latitude", "longitude", "vertical", "time"]
+    same_obs_columns = merge_columns + [
+        "observation",
+        "obs_err_var",
+    ]  # same observation is duplicated
+
+    if (
+        selected_rows[same_obs_columns].duplicated().sum() > 0
+        or selected_rows_v[same_obs_columns].duplicated().sum() > 0
+    ):
+        print(
+            f"{selected_rows[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
+        )
+        print(f"{selected_rows[same_obs_columns]}")
+        print(
+            f"{selected_rows_v[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[1]}: "
+        )
+        print(f"{selected_rows_v[same_obs_columns]}")
+        raise Exception("There are duplicates in the components.")
 
     # Merge the two DataFrames on location and time columns
-    merged_df = pd.merge(selected_rows, selected_rows_v, on=merge_columns, suffixes=('', '_v'))
+    merged_df = pd.merge(
+        selected_rows, selected_rows_v, on=merge_columns, suffixes=("", "_v")
+    )
 
     # Apply the square root of the sum of squares method to the relevant columns
     for col in columns_to_combine:
-        merged_df[col] = np.sqrt(merged_df[col]**2 + merged_df[f'{col}_v']**2)
+        merged_df[col] = np.sqrt(merged_df[col] ** 2 + merged_df[f"{col}_v"] ** 2)
 
     # Create the new composite rows
-    merged_df['type'] = composite.upper()
-    merged_df = merged_df.drop(columns=[col for col in merged_df.columns if col.endswith('_v')])
+    merged_df["type"] = composite.upper()
+    merged_df = merged_df.drop(
+        columns=[col for col in merged_df.columns if col.endswith("_v")]
+    )
 
     return merged_df
-
-
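`construct_composit` merges the two component rows on location and time, then combines each selected column as the square root of the sum of squares. A tiny self-contained check of that combination rule, using made-up component names:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "type": ["U_WIND", "V_WIND"],  # illustrative component names
    "latitude": [40.0, 40.0],
    "longitude": [255.0, 255.0],
    "vertical": [850.0, 850.0],
    "time": ["t0", "t0"],
    "observation": [3.0, 4.0],
})
merge_columns = ["latitude", "longitude", "vertical", "time"]
u = df[df["type"] == "U_WIND"]
v = df[df["type"] == "V_WIND"]
merged = pd.merge(u, v, on=merge_columns, suffixes=("", "_v"))
merged["observation"] = np.sqrt(merged["observation"] ** 2 + merged["observation_v"] ** 2)
merged["type"] = "WIND_SPEED"
print(float(merged["observation"].iloc[0]))  # 5.0
```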