pydartdiags 0.6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1360 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ import pandas as pd
3
+ import datetime as dt
4
+ import numpy as np
5
+ import os
6
+ import yaml
7
+ import struct
8
+ import functools
9
+
10
+
11
+ def _requires_assimilation_info(func):
12
+ @functools.wraps(func)
13
+ def wrapper(self, *args, **kwargs):
14
+ if self.has_assimilation_info():
15
+ return func(self, *args, **kwargs)
16
+ else:
17
+ raise ValueError(
18
+ "Assimilation information is required to call this function."
19
+ )
20
+
21
+ return wrapper
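+ 
+ # Usage sketch: ObsSequence methods below are wrapped as
+ # @_requires_assimilation_info
+ # def select_by_dart_qc(self, dart_qc): ...
+ # so calling them on a sequence without prior ensemble copies raises ValueError.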
22
+
23
+
24
+ class ObsSequence:
25
+ """
26
+ Initialize an ObsSequence object from an ASCII or binary observation sequence file,
27
+ or create an empty ObsSequence object from scratch.
28
+
29
+ 1D observations are assigned a datetime computed from days and seconds since 2000-01-01 00:00:00
30
+
31
+ 3D observations are assigned a datetime computed from days and seconds since 1601-01-01 00:00:00 (DART Gregorian calendar)
32
+
33
+ Args:
34
+ file (str): The input observation sequence ASCII or binary file.
35
+ If None, an empty ObsSequence object is created from scratch.
36
+ synonyms (list, optional): List of additional synonyms for the observation column in the DataFrame.
37
+ The default list is
38
+
39
+ .. code-block:: python
40
+
41
+ ['NCEP BUFR observation',
42
+ 'AIRS observation',
43
+ 'GTSPP observation',
44
+ 'SST observation',
45
+ 'observations',
46
+ 'WOD observation']
47
+
48
+ You can add more synonyms by providing a list of strings when
49
+ creating the ObsSequence object.
50
+
51
+ .. code-block:: python
52
+
53
+ ObsSequence(file, synonyms=['synonym1', 'synonym2'])
54
+
55
+ Raises:
56
+ ValueError: If neither 'loc3d' nor 'loc1d' could be found in the observation sequence.
57
+
58
+ Examples:
59
+
60
+ .. code-block:: python
61
+
62
+ obs_seq = ObsSequence(file='obs_seq.final')
63
+ empty_obs_seq = ObsSequence(file=None)
64
+
65
+ """
66
+
67
+ vert = {
68
+ -2: "undefined",
69
+ -1: "surface (m)",
70
+ 1: "model level",
71
+ 2: "pressure (Pa)",
72
+ 3: "height (m)",
73
+ 4: "scale height",
74
+ }
75
+
76
+ reversed_vert = {value: key for key, value in vert.items()}
77
+
78
+ def __init__(self, file, synonyms=None):
79
+
80
+ self.loc_mod = "None"
81
+ self.file = file
82
+ self.synonyms_for_obs = [
83
+ "NCEP BUFR observation",
84
+ "AIRS observation",
85
+ "GTSPP observation",
86
+ "SST observation",
87
+ "observations",
88
+ "WOD observation",
89
+ ]
90
+ if synonyms:
91
+ if isinstance(synonyms, list):
92
+ self.synonyms_for_obs.extend(synonyms)
93
+ else:
94
+ self.synonyms_for_obs.append(synonyms)
95
+
96
+ module_dir = os.path.dirname(__file__)
97
+ self.default_composite_types = os.path.join(module_dir, "composite_types.yaml")
98
+
99
+ if file is None:
100
+ # Early exit - for testing purposes or creating obs_seq objects from scratch
101
+ self.df = pd.DataFrame()
102
+ self.types = {}
103
+ self.reverse_types = {}
104
+ self.copie_names = []
105
+ self.non_qc_copie_names = []
106
+ self.qc_copie_names = []
107
+ self.n_copies = 0 # copies including qc
108
+ self.n_non_qc = 0 # copies not including qc
109
+ self.n_qc = 0 # number of qc copies
110
+ self.seq = []
111
+ self.all_obs = []
112
+ return
113
+
114
+ if self._is_binary(file):
115
+ self.header = self._read_binary_header(file)
116
+ else:
117
+ self.header = self._read_header(file)
118
+
119
+ self.types = self._collect_obs_types(self.header)
120
+ self.reverse_types = {v: k for k, v in self.types.items()}
121
+ self.copie_names, self.n_copies = self._collect_copie_names(self.header)
122
+ self.n_non_qc, self.n_qc = self._num_qc_non_qc(self.header)
123
+ self.non_qc_copie_names = self.copie_names[: self.n_non_qc]
124
+ self.qc_copie_names = self.copie_names[self.n_non_qc :]
125
+
126
+ if self._is_binary(file):
127
+ self.seq = self._obs_binary_reader(file, self.n_copies)
128
+ self.loc_mod = "loc3d" # only loc3d supported for binary, & no way to check
129
+ else:
130
+ self.seq = self._obs_reader(file, self.n_copies)
131
+
132
+ self.all_obs = self._create_all_obs() # uses up the generator
133
+ # at this point you know if the seq is loc3d or loc1d
134
+ if self.loc_mod == "None":
135
+ raise ValueError(
136
+ "Neither 'loc3d' nor 'loc1d' could be found in the observation sequence."
137
+ )
138
+ self.columns = self._column_headers()
139
+ self.df = pd.DataFrame(self.all_obs, columns=self.columns)
140
+ if self.loc_mod == "loc3d":
141
+ self.df["longitude"] = np.rad2deg(self.df["longitude"])
142
+ self.df["latitude"] = np.rad2deg(self.df["latitude"])
143
+ # rename 'X observation' to observation
144
+ self.synonyms_for_obs = [
145
+ synonym.replace(" ", "_") for synonym in self.synonyms_for_obs
146
+ ]
147
+ rename_dict = {
148
+ old: "observation"
149
+ for old in self.synonyms_for_obs
150
+ if old in self.df.columns
151
+ }
152
+ self.df = self.df.rename(columns=rename_dict)
153
+
154
+ if self._is_binary(file):
155
+ # binary files do not have "OBS X" in, so set linked list from df.
156
+ self.update_attributes_from_df()
157
+
158
+ # Replace MISSING_R8s with NaNs in posterior stats where DART_quality_control = 2
159
+ if self.has_posterior():
160
+ ObsSequence._replace_qc2_nan(self.df)
161
+
162
+ def _create_all_obs(self):
163
+ """steps through the generator to create a
164
+ list of all observations in the sequence
165
+ """
166
+ all_obs = []
167
+ for obs in self.seq:
168
+ data = self._obs_to_list(obs)
169
+ all_obs.append(data)
170
+ return all_obs
171
+
172
+ def _obs_to_list(self, obs):
173
+ """put single observation into a list"""
174
+ data = []
175
+ data.append(int(obs[0].split()[1])) # obs_num
176
+ data.extend(list(map(float, obs[1 : self.n_copies + 1]))) # all the copies
177
+ data.append(obs[self.n_copies + 1]) # linked list info
178
+ try: # HK todo: only need to check loc3d or loc1d for the first observation; the whole file is the same
179
+ locI = obs.index("loc3d")
180
+ location = obs[locI + 1].split()
181
+ data.append(float(location[0])) # location x
182
+ data.append(float(location[1])) # location y
183
+ data.append(float(location[2])) # location z
184
+ data.append(ObsSequence.vert[int(location[3])])
185
+ self.loc_mod = "loc3d"
186
+ except ValueError:
187
+ try:
188
+ locI = obs.index("loc1d")
189
+ location = obs[locI + 1]
190
+ data.append(float(location)) # 1d location
191
+ self.loc_mod = "loc1d"
192
+ except ValueError:
193
+ raise ValueError(
194
+ "Neither 'loc3d' nor 'loc1d' could be found in the observation sequence."
195
+ )
196
+ typeI = obs.index("kind") # type of observation
197
+ type_value = int(obs[typeI + 1])
198
+ if type_value < 0:
199
+ data.append(type_value)
200
+ else:
201
+ data.append(self.types[type_value]) # observation type
202
+
203
+ # any observation specific obs def info is between here and the end of the list
204
+ # can be obs_def & external forward operator
205
+ metadata = obs[typeI + 2 : -2]
206
+ obs_def_metadata, external_metadata = self._split_metadata(metadata)
207
+ data.append(obs_def_metadata)
208
+ data.append(external_metadata)
209
+
210
+ time = obs[-2].split()
211
+ data.append(int(time[0])) # seconds
212
+ data.append(int(time[1])) # days
213
+ if self.loc_mod == "loc3d":
214
+ data.append(_convert_dart_time(int(time[0]), int(time[1])))
215
+ else: # HK todo what is appropriate for 1d models?
216
+ data.append(
217
+ dt.datetime(2000, 1, 1)
218
+ + dt.timedelta(seconds=int(time[0]), days=int(time[1]))
219
+ )
220
+ data.append(float(obs[-1])) # obs error variance ?convert to sd?
221
+
222
+ return data
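+ 
+ # Note: the order of the list built above matches _column_headers():
+ # [obs_num, <copies...>, linked_list, <location fields>, type,
+ # metadata, external_FO, seconds, days, time, obs_err_var]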
223
+
224
+ @staticmethod
225
+ def _split_metadata(metadata):
226
+ """
227
+ Split the metadata list at the first occurrence of an element starting with 'external_FO'.
228
+
229
+ Args:
230
+ metadata (list of str): The metadata list to be split.
231
+
232
+ Returns:
233
+ tuple: Two sublists, the first containing elements before 'external_FO', and the second
234
+ containing 'external_FO' and all elements after it. If 'external_FO' is not found,
235
+ the first sublist contains the entire metadata list, and the second is empty.
236
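+ 
+ Example (a sketch with hypothetical metadata entries):
+ 
+ .. code-block:: python
+ 
+ meta = ["obs_def_radar", "external_FO1", "1 2 3"]
+ obs_def, external = ObsSequence._split_metadata(meta)
+ # obs_def == ["obs_def_radar"]
+ # external == ["external_FO1", "1 2 3"]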
+ """
237
+ for i, item in enumerate(metadata):
238
+ if item.startswith("external_FO"):
239
+ return metadata[:i], metadata[i:]
240
+ return metadata, []
241
+
242
+ def _list_to_obs(self, data):
243
+ """convert a list of data to an observation
244
+
245
+ Assuming the order of the list is obs_seq.copie_names
246
+
247
+ """
248
+ obs = []
249
+ obs.append("OBS " + str(data[0])) # obs_num lots of space
250
+ obs.extend(data[1 : self.n_copies + 1]) # all the copies
251
+ obs.append(data[self.n_copies + 1]) # linked list info
252
+ obs.append("obdef") # TODO HK: extended_FO obs_def
253
+ obs.append(self.loc_mod)
254
+ if self.loc_mod == "loc3d":
255
+ obs.append(
256
+ " ".join(map(str, data[self.n_copies + 2 : self.n_copies + 5]))
257
+ + " "
258
+ + str(self.reversed_vert[data[self.n_copies + 5]])
259
+ ) # location x, y, z, vert
260
+ obs.append("kind") # this is type of observation
261
+ obs_type = data[self.n_copies + 6]
262
+ if isinstance(obs_type, str):
263
+ obs.append(self.reverse_types[obs_type]) # observation type
264
+ else:
265
+ obs.append(obs_type) # Identity obs negative integer
266
+ # HK @todo metadata is appended as-is here; it is not converted to a string
267
+ obs.extend(data[self.n_copies + 7]) # metadata
268
+ obs.extend(data[self.n_copies + 8]) # external forward operator
269
+ elif self.loc_mod == "loc1d":
270
+ obs.append(data[self.n_copies + 2]) # 1d location
271
+ obs.append("kind") # this is type of observation
272
+ obs_type = data[self.n_copies + 3]
273
+ if isinstance(obs_type, str):
274
+ obs.append(self.reverse_types[obs_type]) # observation type
275
+ else:
276
+ obs.append(obs_type) # Identity obs negative integer
277
+ obs.extend(data[self.n_copies + 4]) # metadata
278
+ obs.extend(data[self.n_copies + 5]) # external forward operator
279
+ obs.append(" ".join(map(str, data[-4:-2]))) # seconds, days
280
+ obs.append(data[-1]) # obs error variance
281
+
282
+ return obs
283
+
284
+ @staticmethod
285
+ def _generate_linked_list_pattern(n):
286
+ """Create a list of strings with the linked list pattern for n observations."""
287
+ result = []
288
+ for i in range(n - 1):
289
+ col1 = i if i > 0 else -1
290
+ col2 = i + 2
291
+ col3 = -1
292
+ result.append(f"{col1:<12}{col2:<11}{col3}")
293
+ result.append(f"{n-1:<12}{'-1':<11}{'-1'}")
294
+ return result
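+ 
+ # For example, _generate_linked_list_pattern(3) returns (padding collapsed):
+ # ["-1 2 -1", "1 3 -1", "2 -1 -1"]
+ # i.e. previous obs number, next obs number, and -1 where none exists.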
295
+
296
+ def write_obs_seq(self, file):
297
+ """
298
+ Write the observation sequence to a file.
299
+
300
+ This function writes the observation sequence stored in the DataFrame (obs_seq.df) to a specified file.
301
+ It updates the header with the number of observations, converts coordinates back to radians
302
+ if necessary, reverts NaNs back to MISSING_R8 for observations with QC=2, drops unnecessary
303
+ columns, sorts the DataFrame by time, and generates a linked list pattern for reading by DART
304
+ programs.
305
+
306
+ Args:
307
+ file (str): The path to the file where the observation sequence will be written.
308
+
309
+ Notes:
310
+ - Longitude and latitude are converted back to radians if the location model is 'loc3d'.
311
+ - The replacement of MISSING_R8 values with NaNs for any obs that failed the posterior
312
+ forward observation operators (QC2) is reverted.
313
+ - The 'bias' and 'sq_err' columns are dropped if they exist in the DataFrame.
314
+ - The DataFrame is sorted by the 'time' column.
315
+ - An 'obs_num' column is added to the DataFrame to number the observations in time order.
316
+ - A 'linked_list' column is generated to create a linked list pattern for the observations.
317
+
318
+ Example:
319
+ .. code-block:: python
320
+
321
+ obsq.write_obs_seq('obs_seq.new')
322
+
323
+ """
324
+
325
+ # Update attributes, header, and linked list from dataframe
326
+ self.update_attributes_from_df()
327
+
328
+ with open(file, "w") as f:
329
+
330
+ for line in self.header:
331
+ f.write(str(line) + "\n")
332
+
333
+ # TODO HK is there something better than copying the whole thing here?
334
+ df_copy = self.df.copy() # copy since you want to change for writing.
335
+ # back to radians for obs_seq
336
+ if self.loc_mod == "loc3d":
337
+ df_copy["longitude"] = np.deg2rad(self.df["longitude"]).round(16)
338
+ df_copy["latitude"] = np.deg2rad(self.df["latitude"]).round(16)
339
+ if "prior_bias" in df_copy.columns:
340
+ df_copy = df_copy.drop(
341
+ columns=["prior_bias", "prior_sq_err", "prior_totalvar"]
342
+ )
343
+ if "posterior_bias" in df_copy.columns:
344
+ df_copy = df_copy.drop(
345
+ columns=["posterior_bias", "posterior_sq_err", "posterior_totalvar"]
346
+ )
347
+ if "midpoint" in df_copy.columns:
348
+ df_copy = df_copy.drop(columns=["midpoint", "vlevels"])
349
+
350
+ # Revert NaNs back to MISSING_R8s
351
+ if self.has_posterior():
352
+ ObsSequence._revert_qc2_nan(df_copy)
353
+
354
+ def write_row(row):
355
+ ob_write = self._list_to_obs(row.tolist())
356
+ for line in ob_write:
357
+ f.write(str(line) + "\n")
358
+
359
+ df_copy.apply(write_row, axis=1)
360
+
361
+ @staticmethod
362
+ def _update_types_dicts(df, reverse_types):
363
+ """
364
+ Ensure all unique observation types are in the reverse_types dictionary and create
365
+ the types dictionary.
366
+
367
+ Args:
368
+ df (pd.DataFrame): The DataFrame containing the observation sequence data.
369
+ reverse_types (dict): The dictionary mapping observation types to their corresponding integer values.
370
+
371
+ Returns:
372
+ dict: The updated reverse_types dictionary.
373
+ dict: The types dictionary with keys sorted in numerical order.
374
+ """
375
+ # Create a dictionary of observation types from the dataframe
376
+ # Ignore Identity obs (negative integers)
377
+ unique_types = df.loc[
378
+ df["type"].apply(lambda x: isinstance(x, str)), "type"
379
+ ].unique()
380
+
381
+ # Ensure all unique types are in reverse_types
382
+ for obs_type in unique_types:
383
+ if obs_type not in reverse_types:
384
+ new_id = max(reverse_types.values(), default=0) + 1
385
+ reverse_types[obs_type] = new_id
386
+
387
+ not_sorted_types = {
388
+ reverse_types[obs_type]: obs_type for obs_type in unique_types
389
+ }
390
+ types = {
391
+ k: not_sorted_types[k] for k in sorted(not_sorted_types)
392
+ } # to get keys in numerical order
393
+
394
+ return reverse_types, types
395
+
396
+ def create_header_from_dataframe(self):
397
+ """
398
+ Create a header for the observation sequence based on the data in the DataFrame.
399
+
400
+ It creates a dictionary of unique observation types, counts the
401
+ number of observations, and constructs the header with necessary information.
402
+
403
+ Example:
404
+ .. code-block:: python
405
+
406
+ self.create_header_from_dataframe()
407
+
408
+ """
409
+
410
+ self.reverse_types, self.types = self._update_types_dicts(
411
+ self.df, self.reverse_types
412
+ )
413
+
414
+ num_obs = len(self.df)
415
+
416
+ self.header = []
417
+ self.header.append("obs_sequence")
418
+ self.header.append("obs_type_definitions")
419
+ self.header.append(f"{len(self.types)}")
420
+ for key, value in self.types.items():
421
+ self.header.append(f"{key} {value}")
422
+ self.header.append(f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}")
423
+ self.header.append(f"num_obs: {num_obs:>10} max_num_obs: {num_obs:>10}")
424
+ stats_cols = [
425
+ "prior_bias",
426
+ "prior_sq_err",
427
+ "prior_totalvar",
428
+ "posterior_bias",
429
+ "posterior_sq_err",
430
+ "posterior_totalvar",
431
+ ]
432
+ level_cols = ["vlevels", "midpoint"]
433
+ non_copie_cols = [
434
+ "obs_num",
435
+ "linked_list",
436
+ "longitude",
437
+ "latitude",
438
+ "vertical",
439
+ "vert_unit",
440
+ "type",
441
+ "metadata",
442
+ "external_FO",
443
+ "seconds",
444
+ "days",
445
+ "time",
446
+ "obs_err_var",
447
+ "location",
448
+ ]
449
+ for copie in self.df.columns:
450
+ if copie not in stats_cols + non_copie_cols + level_cols:
451
+ self.header.append(copie.replace("_", " "))
452
+ first = 1
453
+ self.header.append(f"first: {first:>12} last: {num_obs:>12}")
454
+
455
+ def _column_headers(self):
456
+ """define the columns for the dataframe"""
457
+ heading = []
458
+ heading.append("obs_num")
459
+ heading.extend(self.copie_names)
460
+ heading.append("linked_list")
461
+ if self.loc_mod == "loc3d":
462
+ heading.append("longitude")
463
+ heading.append("latitude")
464
+ heading.append("vertical")
465
+ heading.append("vert_unit")
466
+ elif self.loc_mod == "loc1d":
467
+ heading.append("location")
468
+ heading.append("type")
469
+ heading.append("metadata")
470
+ heading.append("external_FO")
471
+ heading.append("seconds")
472
+ heading.append("days")
473
+ heading.append("time")
474
+ heading.append("obs_err_var")
475
+ return heading
476
+
477
+ @_requires_assimilation_info
478
+ def select_by_dart_qc(self, dart_qc):
479
+ """
480
+ Selects rows from a DataFrame based on the DART quality control flag.
481
+
482
+ Args:
483
+ df (DataFrame): A pandas DataFrame.
484
+ dart_qc (int): The DART quality control flag to select.
485
+
486
+ Returns:
487
+ DataFrame: A DataFrame containing only the rows with the specified DART quality control flag.
488
+
489
+ Raises:
490
+ ValueError: If the DART quality control flag is not present in the DataFrame.
491
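+ 
+ Example:
+ 
+ .. code-block:: python
+ 
+ # a sketch: keep only obs that failed the outlier check (DART QC 7)
+ rejected = obs_seq.select_by_dart_qc(7)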
+ """
492
+ if dart_qc not in self.df["DART_quality_control"].unique():
493
+ raise ValueError(
494
+ f"DART quality control flag '{dart_qc}' not found in DataFrame."
495
+ )
496
+ else:
497
+ return self.df[self.df["DART_quality_control"] == dart_qc]
498
+
499
+ @_requires_assimilation_info
500
+ def select_used_qcs(self):
501
+ """
502
+ Select rows from the DataFrame where the observation was used.
503
+ Includes observations for which the posterior forward observation operators failed.
504
+
505
+ Returns:
506
+ pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag 0 or 2.
507
+ """
508
+ return self.df[
509
+ (self.df["DART_quality_control"] == 0)
510
+ | (self.df["DART_quality_control"] == 2)
511
+ ]
512
+
513
+ @_requires_assimilation_info
514
+ def possible_vs_used(self):
515
+ """
516
+ Calculates the count of possible vs. used observations by type.
517
+
518
+ The number of used observations ('used') is the total number
519
+ of assimilated observations (as determined by the `select_used_qcs` function).
520
+ The result is a DataFrame with each observation type, the count of possible observations, and the count of
521
+ used observations.
522
+
523
+ Returns:
524
+ pd.DataFrame: A DataFrame with three columns: 'type', 'possible', and 'used'. 'type' is the observation type,
525
+ 'possible' is the count of all observations of that type, and 'used' is the count of observations of that type
526
+ that passed quality control checks.
527
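+ 
+ Example (a sketch; counts depend on the input file):
+ 
+ .. code-block:: python
+ 
+ counts = obs_seq.possible_vs_used()
+ # type possible used
+ # 0 RADIOSONDE_TEMPERATURE 4000 3800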
+ """
528
+ possible = self.df.groupby("type")["observation"].count()
529
+ possible.rename("possible", inplace=True)
530
+
531
+ used_qcs = self.select_used_qcs().groupby("type")["observation"].count()
532
+ used = used_qcs.reindex(possible.index, fill_value=0)
533
+ used.rename("used", inplace=True)
534
+
535
+ return pd.concat([possible, used], axis=1).reset_index()
536
+
537
+ @staticmethod
538
+ def _is_binary(file):
539
+ """Check if a file is binary file."""
540
+ with open(file, "rb") as f:
541
+ chunk = f.read(1024)
542
+ if b"\0" in chunk:
543
+ return True
544
+ return False
545
+
546
+ @staticmethod
547
+ def _read_header(file):
548
+ """Read the header and number of lines in the header of an ascii obs_seq file"""
549
+ header = []
550
+ with open(file, "r") as f:
551
+ for line in f:
552
+ if "first:" in line and "last:" in line:
553
+ header.append(line.strip())
554
+ break
555
+ else:
556
+ header.append(line.strip())
557
+ return header
558
+
559
+ @staticmethod
560
+ def _read_binary_header(file):
561
+ """Read the header and number of lines in the header of a binary obs_seq file from Fortran output"""
562
+ header = []
563
+ linecount = 0
564
+ obs_types_definitions = -1000
565
+ num_obs = 0
566
+ max_num_obs = 0
567
+ # need to get:
568
+ # number of obs_type_definitions
569
+ # number of copies
570
+ # number of qcs
571
+ with open(file, "rb") as f:
572
+ while True:
573
+ # Read the record length
574
+ record_length = ObsSequence._read_record_length(f)
575
+ if record_length is None:
576
+ break
577
+ record = f.read(record_length)
578
+ if not record: # end of file
579
+ break
580
+
581
+ # Read the trailing record length (should match the leading one)
582
+ ObsSequence._check_trailing_record_length(f, record_length)
583
+
584
+ linecount += 1
585
+
586
+ if linecount == 3:
587
+ obs_types_definitions = struct.unpack("i", record)[0]
588
+ continue
589
+
590
+ if linecount == 4 + obs_types_definitions:
591
+ num_copies, num_qcs, num_obs, max_num_obs = struct.unpack(
592
+ "iiii", record
593
+ )
594
+ break
595
+
596
+ # Go back to the beginning of the file
597
+ f.seek(0)
598
+
599
+ for _ in range(2):
600
+ record_length = ObsSequence._read_record_length(f)
601
+ if record_length is None:
602
+ break
603
+
604
+ record = f.read(record_length)
605
+ if not record: # end of file
606
+ break
607
+
608
+ ObsSequence._check_trailing_record_length(f, record_length)
609
+ header.append(record.decode("utf-8").strip())
610
+
611
+ header.append(str(obs_types_definitions))
612
+
613
+ # obs_types_definitions
614
+ for i in range(3, 4 + obs_types_definitions):
615
+ # Read the record length
616
+ record_length = ObsSequence._read_record_length(f)
617
+ if record_length is None:
618
+ break
619
+
620
+ # Read the actual record
621
+ record = f.read(record_length)
622
+ if not record: # end of file
623
+ break
624
+
625
+ ObsSequence._check_trailing_record_length(f, record_length)
626
+
627
+ if i == 3:
628
+ continue # num obs_types_definitions
629
+ # Read an integer and a string from the record
630
+ integer_value = struct.unpack("i", record[:4])[0]
631
+ string_value = record[4:].decode("utf-8").strip()
632
+ header.append(f"{integer_value} {string_value}")
633
+
634
+ header.append(f"num_copies: {num_copies} num_qc: {num_qcs}")
635
+ header.append(f"num_obs: {num_obs} max_num_obs: {max_num_obs}")
636
+
637
+ # copie names
638
+ for i in range(
639
+ 5 + obs_types_definitions,
640
+ 5 + obs_types_definitions + num_copies + num_qcs + 1,
641
+ ):
642
+ # Read the record length
643
+ record_length = ObsSequence._read_record_length(f)
644
+ if record_length is None:
645
+ break
646
+
647
+ # Read the actual record
648
+ record = f.read(record_length)
649
+ if not record:
650
+ break
651
+
652
+ ObsSequence._check_trailing_record_length(f, record_length)
653
+
654
+ if i == 5 + obs_types_definitions:
655
+ continue
656
+
657
+ # Read the whole record as a string
658
+ string_value = record.decode("utf-8").strip()
659
+ header.append(string_value)
660
+
661
+ # first and last obs
662
+ # Read the record length
663
+ record_length = ObsSequence._read_record_length(f)
664
+
665
+ # Read the actual record
666
+ record = f.read(record_length)
667
+
668
+ ObsSequence._check_trailing_record_length(f, record_length)
669
+
670
+ # Read the whole record as two integers
671
+ first, last = struct.unpack("ii", record)
672
+ header.append(f"first: {first} last: {last}")
673
+
674
+ return header
675
+
676
+ @staticmethod
677
+ def _collect_obs_types(header):
678
+ """Create a dictionary for the observation types in the obs_seq header"""
679
+ num_obs_types = int(header[2])
680
+ # The first line containing obs types is the 4th line in an obs_seq file.
681
+ types = {int(x.split()[0]): x.split()[1] for x in header[3 : num_obs_types + 3]}
682
+ return types
683
+
684
+ @staticmethod
685
+ def _collect_copie_names(header):
686
+ """
687
+ Extracts the names of the copies from the header of an obs_seq file.
688
+
689
+ Args:
690
+ header (list): A list of strings representing the lines in the header of the obs_seq file.
691
+
692
+ Returns:
693
+ tuple: A tuple containing two elements:
694
+ - copie_names (list): A list of strings representing the copy names with underscores for spaces.
695
+ - len(copie_names) (int): The number of copy names.
696
+ """
697
+ for i, line in enumerate(header):
698
+ if "num_obs:" in line and "max_num_obs:" in line:
699
+ first_copie = i + 1
700
+ break
701
+ copie_names = [
702
+ "_".join(x.split()) for x in header[first_copie:-1]
703
+ ] # first and last is last line of header
704
+ return copie_names, len(copie_names)
705
+
706
+ @staticmethod
707
+ def _num_qc_non_qc(header):
708
+ """Find the number of qc and non-qc copies in the header"""
709
+ for line in header:
710
+ if "num_copies:" in line and "num_qc:" in line:
711
+ num_non_qc = int(line.split()[1])
712
+ num_qc = int(line.split()[3])
713
+ return num_non_qc, num_qc
714
+
715
+ @staticmethod
716
+ def _obs_reader(file, n):
717
+ """Reads the ascii obs sequence file and returns a generator of the obs"""
718
+ previous_line = ""
719
+ with open(file, "r") as f:
720
+ for line in f:
721
+ if "OBS" in line or "OBS" in previous_line:
722
+ if "OBS" in line:
723
+ obs = []
724
+ obs.append(line.strip())
725
+ for i in range(
726
+ n + 100
727
+ ): # number of copies + 100. Needs to be bigger than any metadata
728
+ try:
729
+ next_line = next(f)
730
+ except StopIteration:
731
+ yield obs
733
+ return
734
+ if "OBS" in next_line:
735
+ previous_line = next_line
736
+ break
737
+ else:
738
+ obs.append(next_line.strip())
739
+ yield obs
740
+ elif (
741
+ "OBS" in previous_line
742
+ ): # previous_line is tracked because f.tell() cannot be mixed with next()
743
+ obs = []
744
+ obs.append(previous_line.strip())
745
+ obs.append(line.strip())
746
+ for i in range(
747
+ n + 100
748
+ ): # number of copies + 100. Needs to be bigger than any metadata
749
+ try:
750
+ next_line = next(f)
751
+ except StopIteration:
752
+ yield obs
754
+ return
755
+ if "OBS" in next_line:
756
+ previous_line = next_line
757
+ break
758
+ else:
759
+ obs.append(next_line.strip())
760
+ previous_line = next_line
761
+ yield obs
762
+
763
+ @staticmethod
764
+ def _check_trailing_record_length(file, expected_length):
765
+ """Reads and checks the trailing record length from the binary file written by Fortran.
766
+
767
+ Args:
768
+ file (file): The file object.
769
+ expected_length (int): The expected length of the trailing record.
770
+
771
+ Assuming 4 bytes:
772
+ | Record Length (4 bytes) | Data (N bytes) | Trailing Record Length (4 bytes) |
773
+ """
774
+ trailing_record_length_bytes = file.read(4)
775
+ trailing_record_length = struct.unpack("i", trailing_record_length_bytes)[0]
776
+ if expected_length != trailing_record_length:
777
+ raise ValueError("Record length mismatch in Fortran binary file")
778
+
779
+ @staticmethod
780
+ def _read_record_length(file):
781
+ """Reads and unpacks the record length from the file."""
782
+ record_length_bytes = file.read(4)
783
+ if not record_length_bytes:
784
+ return None # End of file
785
+ return struct.unpack("i", record_length_bytes)[0]
786
+
787
+ def _obs_binary_reader(self, file, n):
788
+ """Reads the obs sequence binary file and returns a generator of the obs"""
789
+ header_length = len(self.header)
790
+ with open(file, "rb") as f:
791
+ # Skip the first len(obs_seq.header) lines
792
+ for _ in range(header_length - 1):
793
+ # Read the record length
794
+ record_length = ObsSequence._read_record_length(f)
795
+ if record_length is None: # End of file
796
+ break
797
+
798
+ # Skip the actual record
799
+ f.seek(record_length, 1)
800
+
801
+ # Skip the trailing record length
802
+ f.seek(4, 1)
803
+
804
+ obs_num = 0
805
+ while True:
806
+ obs = []
807
+ obs_num += 1
808
+ obs.append(f"OBS {obs_num}")
809
+ for _ in range(n): # number of copies
810
+ # Read the record length
811
+ record_length = ObsSequence._read_record_length(f)
812
+ if record_length is None:
813
+ break
814
+ # Read the actual record (copie)
815
+ record = f.read(record_length)
816
+ obs.append(struct.unpack("d", record)[0])
817
+
818
+ # Read the trailing record length (should match the leading one)
819
+ ObsSequence._check_trailing_record_length(f, record_length)
820
+
821
+ # linked list info
822
+ record_length = ObsSequence._read_record_length(f)
823
+ if record_length is None:
824
+ break
825
+
826
+ record = f.read(record_length)
827
+ int1, int2, int3 = struct.unpack("iii", record[:12])
828
+ linked_list_string = f"{int1:<12} {int2:<10} {int3:<12}"
829
+ obs.append(linked_list_string)
830
+
831
+ ObsSequence._check_trailing_record_length(f, record_length)
832
+
833
+ # location (note no location header "loc3d" or "loc1d" for binary files)
834
+ obs.append("loc3d")
835
+ record_length = ObsSequence._read_record_length(f)
836
+ record = f.read(record_length)
837
+ x, y, z, vert = struct.unpack("dddi", record[:28])
838
+ location_string = f"{x} {y} {z} {vert}"
839
+ obs.append(location_string)
840
+
841
+ ObsSequence._check_trailing_record_length(f, record_length)
842
+
843
+ # kind (type of observation) value
844
+ obs.append("kind")
845
+ record_length = ObsSequence._read_record_length(f)
846
+ record = f.read(record_length)
847
+ kind = f"{struct.unpack('i', record)[0]}"
848
+ obs.append(kind)
849
+
850
+ ObsSequence._check_trailing_record_length(f, record_length)
851
+
852
+ # Skip metadata (obs_def) and go directly to the time record
853
+ while True:
854
+ pos = f.tell()
855
+ record_length = ObsSequence._read_record_length(f)
856
+ if record_length is None:
857
+ break # End of file
858
+
859
+ record = f.read(record_length)
860
+ # Check if this record is likely the "time" record (8 bytes, can be unpacked as two ints)
861
+ if record_length == 8:
862
+ try:
863
+ seconds, days = struct.unpack("ii", record)
864
+ # If unpack succeeds, this is the time record
865
+ f.seek(pos) # Seek back so the main loop can process it
866
+ break
867
+ except struct.error:
868
+ pass # Not the time record, keep skipping
869
+
870
+ ObsSequence._check_trailing_record_length(f, record_length)
871
+
872
+ # time (seconds, days)
873
+ record_length = ObsSequence._read_record_length(f)
874
+ record = f.read(record_length)
875
+ try: # in case this record is not the time record because of metadata irregularities
876
+ seconds, days = struct.unpack("ii", record)
877
+ except struct.error as e:
878
+ print(
879
+ f"Reading observation {obs_num}... record length: {record_length} kind {kind}"
880
+ )
881
+ print(f"")
882
+ print(f"Error unpacking seconds and days: {e}")
883
+ raise
884
+ time_string = f"{seconds} {days}"
885
+ obs.append(time_string)
886
+
887
+ ObsSequence._check_trailing_record_length(f, record_length)
888
+
889
+ # obs error variance
890
+ record_length = ObsSequence._read_record_length(f)
891
+ record = f.read(record_length)
892
+ obs.append(struct.unpack("d", record)[0])
893
+
894
+ ObsSequence._check_trailing_record_length(f, record_length)
895
+
896
+ yield obs
897
+
898
+ def composite_types(self, composite_types="use_default", raise_on_duplicate=False):
899
+ """
900
+ Set up and construct composite observation types for the DataFrame.
901
+
902
+ This function sets up composite observation types based on a provided YAML configuration or
903
+ a default configuration. It constructs new composite rows by combining specified
904
+ components and adds them to the DataFrame in place.
905
+
906
+ Args:
907
+ composite_types (str, optional): The YAML configuration for composite types.
908
+ If 'use_default', the default configuration is used. Otherwise, a custom YAML
909
+ configuration can be provided.
910
+ raise_on_duplicate (bool, optional): If True, raises an exception if there are
911
+ duplicates in the components. Otherwise (the default, False), duplicates are treated as though
912
+ they are distinct observations.
913
+
914
+ Returns:
915
+ pd.DataFrame: The updated DataFrame with the new composite rows added.
916
+
917
+ Raises:
918
+ Exception: If there are repeat values in the components and raise_on_duplicate = True
919
+ """
920
+
921
+ if composite_types == "use_default":
922
+ composite_yaml = self.default_composite_types
923
+ else:
924
+ composite_yaml = composite_types
925
+ self.composite_types_dict = _load_yaml_to_dict(composite_yaml)
926
+
927
+ components = []
928
+ for value in self.composite_types_dict.values():
929
+ components.extend(value["components"])
930
+
931
+ if len(components) != len(set(components)):
932
+ raise Exception("There are repeat values in components.")
933
+
934
+ # data frame for the composite types
935
+ df_comp = self.df[
936
+ self.df["type"]
937
+ .str.upper()
938
+ .isin([component.upper() for component in components])
939
+ ]
940
+
941
+ df = pd.DataFrame()
942
+ for key in self.composite_types_dict:
943
+ df_new = _construct_composit(
944
+ df_comp,
945
+ key,
946
+ self.composite_types_dict[key]["components"],
947
+ raise_on_duplicate,
948
+ )
949
+ df = pd.concat([df, df_new], axis=0)
950
+
951
+ # add the composite types to the DataFrame
952
+ self.df = pd.concat([self.df, df], axis=0)
953
+ return
954
+
955
+ @classmethod
956
+ def join(cls, obs_sequences, copies=None):
957
+ """
958
+ Join a list of observation sequences together.
959
+
960
+ This method combines the headers and observations from a list of ObsSequence objects
961
+ into a single ObsSequence object.
962
+
963
+ Args:
964
+ obs_sequences (list of ObsSequence): The list of observation sequence objects to join.
965
+ copies (list of str, optional): A list of copy names to include in the combined data.
966
+ If not provided, all copies are included.
967
+
968
+ Returns:
969
+ A new ObsSequence object containing the combined data.
970
+
971
+ Examples:
972
+ .. code-block:: python
973
+
974
+ obs_seq1 = ObsSequence(file='obs_seq1.final')
975
+ obs_seq2 = ObsSequence(file='obs_seq2.final')
976
+ obs_seq3 = ObsSequence(file='obs_seq3.final')
977
+ combined = ObsSequence.join([obs_seq1, obs_seq2, obs_seq3])
978
+ """
979
+ if not obs_sequences:
980
+ raise ValueError("The list of observation sequences is empty.")
981
+
982
+ # Create a new ObsSequence object with the combined data
983
+ combo = cls(file=None)
984
+
985
+ # Check if all obs_sequences have compatible attributes
986
+ first_loc_mod = obs_sequences[0].loc_mod
987
+ first_has_assimilation_info = obs_sequences[0].has_assimilation_info()
988
+ first_has_posterior = obs_sequences[0].has_posterior()
989
+ for obs_seq in obs_sequences:
990
+ if obs_seq.loc_mod != first_loc_mod:
991
+ raise ValueError(
992
+ "All observation sequences must have the same loc_mod."
993
+ )
994
+ if obs_seq.has_assimilation_info() != first_has_assimilation_info:
995
+ raise ValueError(
996
+ "All observation sequences must have assimilation info."
997
+ )
998
+ if obs_seq.has_posterior() != first_has_posterior:
999
+ raise ValueError(
1000
+ "All observation sequences must have the posterior info."
1001
+ )
1002
+ # HK @todo prior only
1003
+ combo.loc_mod = first_loc_mod
1004
+
1005
+ # check the copies are compatible (list of copies to combine?)
1006
+ # subset of copies if needed # @todo HK 1d or 3d
1007
+ if copies:
1008
+ start_required_columns = ["obs_num", "observation"]
1009
+ end_required_columns = [
1010
+ "linked_list",
1011
+ "longitude",
1012
+ "latitude",
1013
+ "vertical",
1014
+ "vert_unit",
1015
+ "type",
1016
+ "metadata",
1017
+ "external_FO",
1018
+ "seconds",
1019
+ "days",
1020
+ "time",
1021
+ "obs_err_var",
1022
+ ]
1023
+ required_columns = start_required_columns + end_required_columns
1024
+
1025
+ requested_columns = (
1026
+ start_required_columns
1027
+ + [item for item in copies if item not in required_columns]
1028
+ + end_required_columns
1029
+ )
1030
+
1031
+ for obs_seq in obs_sequences:
1032
+ if not set(requested_columns).issubset(obs_seq.df.columns):
1033
+ raise ValueError(
1034
+ "All observation sequences must have the selected copies."
1035
+ )
1036
+
1037
+ # go through columns and create header
1038
+ remove_list = [
1039
+ "obs_num",
1040
+ "linked_list",
1041
+ "latitude",
1042
+ "longitude",
1043
+ "vertical",
1044
+ "vert_unit",
1045
+ "type",
1046
+ "metadata",
1047
+ "external_FO",
1048
+ "time",
1049
+ "seconds",
1050
+ "days",
1051
+ "obs_err_var",
1052
+ ]
1053
+ # using lists to retain copy order, non_qcs followed by qcs
1054
+ combo.copie_names = [
1055
+ item for item in requested_columns if item not in remove_list
1056
+ ]
1057
+ combo.non_qc_copie_names = [
1058
+ item
1059
+ for item in combo.copie_names
1060
+ if item in obs_sequences[0].non_qc_copie_names
1061
+ ]
1062
+ combo.qc_copie_names = [
1063
+ item
1064
+ for item in combo.copie_names
1065
+ if item in obs_sequences[0].qc_copie_names
1066
+ ]
1067
+
1068
+ else:
1069
+ for obs_seq in obs_sequences:
1070
+ if not obs_sequences[0].df.columns.isin(obs_seq.df.columns).all():
1071
+ raise ValueError(
1072
+ "All observation sequences must have the same copies."
1073
+ )
1074
+ combo.copie_names = obs_sequences[0].copie_names
1075
+ combo.non_qc_copie_names = obs_sequences[0].non_qc_copie_names
1076
+ combo.qc_copie_names = obs_sequences[0].qc_copie_names
1077
+ combo.n_copies = len(combo.copie_names)
1078
+
1079
+ # todo HK @todo combine synonyms for obs?
1080
+
1081
+ # Initialize combined data
1082
+ combo.df = pd.DataFrame()
1083
+
1084
+ # Iterate over the list of observation sequences and combine their data
1085
+ for obs_seq in obs_sequences:
1086
+ if copies:
1087
+ combo.df = pd.concat(
1088
+ [combo.df, obs_seq.df[requested_columns]], ignore_index=True
1089
+ )
1090
+ else:
1091
+ combo.df = pd.concat([combo.df, obs_seq.df], ignore_index=True)
1092
+
1093
+ # update ObsSequence attributes from the combined DataFrame
1094
+ combo.update_attributes_from_df()
1095
+
1096
+ return combo
1097
+
1098
+ @staticmethod
1099
+ def _update_linked_list(df):
1100
+ """
1101
+ Sorts the DataFrame by 'time', resets the index, and adds/updates 'linked_list'
1102
+ and 'obs_num' columns in place.
1103
+ Modifies the input DataFrame directly.
1104
+ """
1105
+ df.sort_values(by="time", inplace=True, kind="stable")
1106
+ df.reset_index(drop=True, inplace=True)
1107
+ df["linked_list"] = ObsSequence._generate_linked_list_pattern(len(df))
1108
+ df["obs_num"] = df.index + 1
1109
+ return None
1110
+
1111
+ def has_assimilation_info(self):
1112
+ """
1113
+ Check if the DataFrame has prior information.
1114
+
1115
+ Returns:
1116
+ bool: True if both 'prior_ensemble_mean' and 'prior_ensemble_spread' columns are present, False otherwise.
1117
+ """
1118
+ return "prior_ensemble_mean".casefold() in map(
1119
+ str.casefold, self.df.columns
1120
+ ) and "prior_ensemble_spread".casefold() in map(str.casefold, self.df.columns)
1121
+
1122
+ def has_posterior(self):
1123
+ """
1124
+ Check if the DataFrame has posterior information.
1125
+
1126
+ Returns:
1127
+ bool: True if both 'posterior_ensemble_mean' and 'posterior_ensemble_spread' columns are present, False otherwise.
1128
+ """
1129
+ return "posterior_ensemble_mean".casefold() in map(
1130
+ str.casefold, self.df.columns
1131
+ ) and "posterior_ensemble_spread".casefold() in map(
1132
+ str.casefold, self.df.columns
1133
+ )
1134
+
1135
+ def create_header(self, n):
1136
+ """Create a header for the obs_seq file from the ObsSequence object."""
1137
+ assert (
1138
+ self.n_copies == self.n_non_qc + self.n_qc
1139
+ ), "n_copies must be equal to n_non_qc + n_qc"
1140
+
1141
+ self.header = []
1142
+ self.header.append(f"obs_sequence")
1143
+ self.header.append("obs_type_definitions")
1144
+ self.header.append(f"{len(self.types)}")
1145
+ for key, value in self.types.items():
1146
+ self.header.append(f"{key} {value}")
1147
+ self.header.append(f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}")
1148
+ self.header.append(f"num_obs: {n} max_num_obs: {n}")
1149
+ for copie in self.copie_names:
1150
+ self.header.append(copie)
1151
+ self.header.append(f"first: 1 last: {n}")
1152
+
1153
+ @staticmethod
1154
+ def _replace_qc2_nan(df):
1155
+ """
1156
+ Replace MISSING_R8 values with NaNs in posterior columns for observations where
1157
+ DART_quality_control = 2 (posterior forward observation operators failed)
1158
+
1159
+ This causes these observations to be ignored in the calculations of posterior statistics
1160
+ """
1161
+ df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_mean"] = np.nan
1162
+ df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_spread"] = np.nan
1163
+ num_post_members = len(
1164
+ df.columns[df.columns.str.startswith("posterior_ensemble_member_")]
1165
+ )
1166
+ for i in range(1, num_post_members + 1):
1167
+ df.loc[
1168
+ df["DART_quality_control"] == 2.0,
1169
+ "posterior_ensemble_member_" + str(i),
1170
+ ] = np.nan
1171
+
1172
+ @staticmethod
1173
+ def _revert_qc2_nan(df):
1174
+ """
1175
+ Revert NaNs back to MISSING_R8s for observations where DART_quality_control = 2
1176
+ (posterior forward observation operators failed)
1177
+ """
1178
+ df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_mean"] = (
1179
+ -888888.000000
1180
+ )
1181
+ df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_spread"] = (
1182
+ -888888.000000
1183
+ )
1184
+ num_post_members = len(
1185
+ df.columns[df.columns.str.startswith("posterior_ensemble_member_")]
1186
+ )
1187
+ for i in range(1, num_post_members + 1):
1188
+ df.loc[
1189
+ df["DART_quality_control"] == 2.0, "posterior_ensemble_member_" + str(i)
1190
+ ] = -888888.000000
1191
+
1192
+ def update_attributes_from_df(self):
1193
+ """
1194
+ Update all internal data (fields/properties) of the ObsSequence object that
1195
+ depend on the DataFrame (self.df).
1196
+ Call this after self.df is replaced or its structure changes.
1197
+
1198
+ Important:
1199
+
1200
+ Assumes copies are all columns between 'obs_num' and 'linked_list' (if present)
1201
+
1202
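+ Example (a sketch: filter rows, then refresh the derived attributes;
+ the observation type name is hypothetical):
+ 
+ .. code-block:: python
+ 
+ obs_seq.df = obs_seq.df[obs_seq.df["type"] == "RADIOSONDE_TEMPERATURE"]
+ obs_seq.update_attributes_from_df()
+ 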
+ """
1203
+ # Update columns
1204
+ self.columns = list(self.df.columns)
1205
+
1206
+ # Update all_obs (list of lists, each row) @todo HK do we need this?
1207
+ self.all_obs = None
1208
+
1209
+ # Update copie_names, non_qc_copie_names, qc_copie_names, n_copies, n_non_qc, n_qc
1210
+ # Try to infer from columns if possible, else leave as is
1211
+ # Assume copies are all columns between 'obs_num' and 'linked_list' (if present)
1212
+ if "obs_num" in self.df.columns and "linked_list" in self.df.columns:
1213
+ obs_num_idx = self.df.columns.get_loc("obs_num")
1214
+ linked_list_idx = self.df.columns.get_loc("linked_list")
1215
+ self.copie_names = list(self.df.columns[obs_num_idx + 1 : linked_list_idx])
1216
+ else:
1217
+ # Fallback: use previous value or empty
1218
+ self.copie_names = getattr(self, "copie_names", [])
1219
+ self.n_copies = len(self.copie_names)
1220
+
1221
+ # Try to infer non_qc and qc copies from previous names if possible
1222
+ # Find qc copies first
1223
+ self.qc_copie_names = [c for c in self.copie_names if c in self.qc_copie_names]
1224
+ if self.qc_copie_names == []: # If no qc copies found, assume all are non-qc
1225
+ self.non_qc_copie_names = self.copie_names
1226
+ else: # pull out non-qc copies from the copie_names
1227
+ self.non_qc_copie_names = [
1228
+ c for c in self.copie_names if c not in self.qc_copie_names
1229
+ ]
1230
+ self.n_qc = len(self.qc_copie_names)
1231
+ self.n_non_qc = len(self.non_qc_copie_names)
1232
+
1233
+ # Update header and types and reverse_types
1234
+ self.create_header_from_dataframe()
1235
+
1236
+ # Update seq (generator should be empty or None if not from file)
1237
+ self.seq = []
1238
+ # Update loc_mod
1239
+ if "vertical" in self.df.columns:
1240
+ self.loc_mod = "loc3d"
1241
+ else:
1242
+ self.loc_mod = "loc1d"
1243
+
1244
+ # update linked list for obs and obs_nums
1245
+ ObsSequence._update_linked_list(self.df)
1246
+
1247
+
1248
+ def _load_yaml_to_dict(file_path):
1249
+ """
1250
+ Load a YAML file and convert it to a dictionary.
1251
+
1252
+ Args:
1253
+ file_path (str): The path to the YAML file.
1254
+
1255
+ Returns:
1256
+ dict: The YAML file content as a dictionary.
1257
+ """
1258
+ try:
1259
+ with open(file_path, "r") as file:
1260
+ return yaml.safe_load(file)
1261
+ except Exception as e:
1262
+ print(f"Error loading YAML file: {e}")
1263
+ raise
1264
+
1265
+
1266
+ def _convert_dart_time(seconds, days):
1267
+ """covert from seconds, days after 1601 to datetime object
1268
+
1269
+ Note:
1270
+ - base year for Gregorian calendar is 1601
1271
+ - dart time is seconds, days since 1601
1272
+ """
1273
+ time = dt.datetime(1601, 1, 1) + dt.timedelta(days=days, seconds=seconds)
1274
+ return time
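+ 
+ # For example: _convert_dart_time(43200, 1) == datetime(1601, 1, 2, 12, 0),
+ # i.e. day 1 plus 43200 seconds (12 hours) counted from 1601-01-01.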
1275
+
1276
+
1277
+ def _construct_composit(df_comp, composite, components, raise_on_duplicate):
1278
+ """
1279
+ Creates a new DataFrame by combining pairs of rows from two specified component
1280
+ types in an observation DataFrame. It matches rows based on location and time,
1281
+ and then combines certain columns using the square root of the sum of squares
1282
+ of the components.
1283
+
1284
+ Args:
1285
+ df_comp (pd.DataFrame): The DataFrame containing the component rows to be combined.
1286
+ composite (str): The type name for the new composite rows.
1287
+ components (list of str): A list containing the type names of the two components to be combined.
1288
+ raise_on_duplicate (bool): If True, raises an exception if there are duplicates in the components;
1290
+ otherwise duplicates are treated as though they are distinct observations.
1290
+
1291
+
1292
+ Returns:
1293
+ merged_df (pd.DataFrame): A DataFrame containing the new composite rows.
1294
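+ 
+ Example (a sketch; "u_wind" and "v_wind" are hypothetical component types):
+ 
+ .. code-block:: python
+ 
+ # combine u and v components into a horizontal wind composite
+ df_wind = _construct_composit(df_comp, "wind", ["u_wind", "v_wind"], False)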
+ """
1295
+ # select rows for the two components
1296
+ if len(components) != 2:
1297
+ raise ValueError("components must be a list of two component types.")
1298
+ selected_rows = df_comp[df_comp["type"] == components[0].upper()]
1299
+ selected_rows_v = df_comp[df_comp["type"] == components[1].upper()]
1300
+ selected_rows = selected_rows.copy()
1301
+ selected_rows_v = selected_rows_v.copy()
1302
+
1303
+ prior_columns_to_combine = df_comp.filter(regex="prior_ensemble").columns.tolist()
1304
+ posterior_columns_to_combine = df_comp.filter(
1305
+ regex="posterior_ensemble"
1306
+ ).columns.tolist()
1307
+ columns_to_combine = (
1308
+ prior_columns_to_combine
1309
+ + posterior_columns_to_combine
1310
+ + ["observation", "obs_err_var"]
1311
+ )
1312
+ merge_columns = ["latitude", "longitude", "vertical", "time"] # @todo HK 1d or 3d
1313
+ same_obs_columns = merge_columns + [
1314
+ "observation",
1315
+ "obs_err_var",
1316
+ ] # same observation is duplicated
1317
+
1318
+ if (
1319
+ selected_rows[same_obs_columns].duplicated().sum() > 0
1320
+ or selected_rows_v[same_obs_columns].duplicated().sum() > 0
1321
+ ):
1322
+
1323
+ if raise_on_duplicate:
1324
+ print(
1325
+ f"{selected_rows[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
1326
+ )
1327
+ print(f"{selected_rows[same_obs_columns]}")
1328
+ print(
1329
+ f"{selected_rows_v[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
1330
+ )
1331
+ print(f"{selected_rows_v[same_obs_columns]}")
1332
+ raise Exception("There are duplicates in the components.")
1333
+
1334
+ else:
1335
+ selected_rows["dup_num"] = selected_rows.groupby(
1336
+ same_obs_columns
1337
+ ).cumcount()
1338
+ selected_rows_v["dup_num"] = selected_rows_v.groupby(
1339
+ same_obs_columns
1340
+ ).cumcount()
1341
+
1342
+ # Merge the two DataFrames on location and time columns
1343
+ merged_df = pd.merge(
1344
+ selected_rows, selected_rows_v, on=merge_columns, suffixes=("", "_v")
1345
+ )
1346
+
1347
+ # Apply the square root of the sum of squares method to the relevant columns
1348
+ for col in columns_to_combine:
1349
+ merged_df[col] = np.sqrt(merged_df[col] ** 2 + merged_df[f"{col}_v"] ** 2)
1350
+
1351
+ # Create the new composite rows
1352
+ merged_df["type"] = composite.upper()
1353
+ merged_df = merged_df.drop(
1354
+ columns=[col for col in merged_df.columns if col.endswith("_v")]
1355
+ )
1356
+
1357
+ if "dup_num" in merged_df.columns:
1358
+ merged_df = merged_df.drop(columns=["dup_num"])
1359
+
1360
+ return merged_df