pydartdiags 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of pydartdiags might be problematic.

@@ -9,7 +9,7 @@ import struct
 
 def requires_assimilation_info(func):
     def wrapper(self, *args, **kwargs):
-        if self.has_assimilation_info:
+        if self.has_assimilation_info():
             return func(self, *args, **kwargs)
         else:
             raise ValueError(
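Note on this change: has_assimilation_info is now a method (added near the end of this diff), so the guard must call it. Without the parentheses the condition would test the truthiness of the bound method object itself, which is always True, and the decorator could never raise. A minimal sketch of the pitfall:

.. code-block:: python

    class Demo:
        def has_assimilation_info(self):
            return False

    d = Demo()
    bool(d.has_assimilation_info)    # True: a bound method object is always truthy
    bool(d.has_assimilation_info())  # False: the value the method actually returns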
@@ -19,27 +19,46 @@ def requires_assimilation_info(func):
     return wrapper
 
 
-def requires_posterior_info(func):
-    def wrapper(self, *args, **kwargs):
-        if self.has_posterior:
-            return func(self, *args, **kwargs)
-        else:
-            raise ValueError("Posterior information is required to call this function.")
-
-    return wrapper
+class ObsSequence:
+    """
+    Initialize an ObsSequence object from an ASCII or binary observation sequence file,
+    or create an empty ObsSequence object from scratch.
 
+    1D observations are given a datetime of days, seconds since 2000-01-01 00:00:00
 
-class obs_sequence:
-    """
-    Initialize an obs_sequence object from an ASCII or binary observation sequence file,
-    or create an empty obs_sequence object from scratch.
+    3D observations are given a datetime of days, seconds since 1601-01-01 00:00:00 (DART Gregorian calendar)
 
     Args:
         file (str): The input observation sequence ASCII or binary file.
-            If None, an empty obs_sequence object is created from scratch.
+            If None, an empty ObsSequence object is created from scratch.
+        synonyms (list, optional): List of additional synonyms for the observation column in the DataFrame.
+            The default list is
+
+            .. code-block:: python
+
+                ['NCEP BUFR observation',
+                'AIRS observation',
+                'GTSPP observation',
+                'SST observation',
+                'observations',
+                'WOD observation']
+
+            You can add more synonyms by providing a list of strings when
+            creating the ObsSequence object.
+
+            .. code-block:: python
+
+                ObsSequence(file, synonyms=['synonym1', 'synonym2'])
+
+    Raises:
+        ValueError: If neither 'loc3d' nor 'loc1d' could be found in the observation sequence.
+
+    Examples:
+
+        .. code-block:: python
+
+            obs_seq = ObsSequence(file='obs_seq.final')
 
-    Returns:
-        An obs_sequence object
 
     Attributes:
         df (pandas.DataFrame): The DataFrame containing the observation sequence data.
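The class rename from obs_sequence to ObsSequence (PEP 8 CapWords) means 0.5.0 call sites need a one-line change. A hedged migration sketch, assuming the import path shown in the package documentation; the extra synonym string is illustrative only:

.. code-block:: python

    from pydartdiags.obs_sequence import obs_sequence as obsq

    # 0.5.0: obs_seq = obsq.obs_sequence(file='obs_seq.final')
    obs_seq = obsq.ObsSequence(file='obs_seq.final')

    # extra synonyms for the observation column, per the docstring above
    obs_seq2 = obsq.ObsSequence('obs_seq.final', synonyms=['radiosonde observation'])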
@@ -64,36 +83,18 @@ class obs_sequence:
             - scale height: 'VERTISSCALEHEIGHT' (unitless)
         loc_mod (str): The location model, either 'loc3d' or 'loc1d'.
             For 3D sphere models: latitude and longitude are in degrees in the DataFrame.
-        types (dict): Dictionary of types of observations the observation sequence,
+        types (dict): Dictionary of types of observations in the observation sequence,
             e.g. {23: 'ACARS_TEMPERATURE'},
         reverse_types (dict): Dictionary of types with keys and values reversed, e.g
             {'ACARS_TEMPERATURE': 23}
         synonyms_for_obs (list): List of synonyms for the observation column in the DataFrame.
-            The defualt list is
 
-            .. code-block:: python
 
-            [ 'NCEP BUFR observation',
-            'AIRS observation',
-            'GTSPP observation',
-            'SST observation',
-            'observations',
-            'WOD observation']
-
-            You can add more synonyms by providing a list of strings when
-            creating the obs_sequence object.
-
-            .. code-block:: python
-
-            obs_sequence(file, synonyms=['synonym1', 'synonym2']).df
-
-        has_assimilation_info (bool): Indicates if assimilation information is present.
-        has_posterior (bool): Indicates if posterior information is present.
         seq (generator): Generator of observations from the observation sequence file.
         all_obs (list): List of all observations, each observation is a list.
-            Valid when the obs_sequence is created from a file.
-            Set to None when the obs_sequence is created from scratch or multiple
-            obs_sequences are joined.
+            Valid when the ObsSequence is created from a file.
+            Set to None when the ObsSequence is created from scratch or multiple
+            ObsSequences are joined.
     """
 
     vert = {
@@ -108,29 +109,8 @@ class obs_sequence:
     reversed_vert = {value: key for key, value in vert.items()}
 
     def __init__(self, file, synonyms=None):
-        """
-        Create an obs_sequence object from an ASCII or binary observation sequence file,
-        or create an empty obs_sequence object from scratch.
-
-        Args:
-            file (str): The input observation sequence ASCII or binary file.
-                If None, an empty obs_sequence object is created from scratch.
-            synonyms (list, optional): List of synonyms for the observation column in the DataFrame.
-
-        Returns:
-            an obs_sequence object
-
-        Examples:
-
-        .. code-block:: python
-
-            obs_seq = obs_sequence(file='obs_seq.final')
-
-        """
 
         self.loc_mod = "None"
-        self.has_assimilation_info = False
-        self.has_posterior = False
         self.file = file
         self.synonyms_for_obs = [
             "NCEP BUFR observation",
@@ -146,6 +126,9 @@ class obs_sequence:
         else:
             self.synonyms_for_obs.append(synonyms)
 
+        module_dir = os.path.dirname(__file__)
+        self.default_composite_types = os.path.join(module_dir, "composite_types.yaml")
+
         if file is None:
             # Early exit - for testing purposes or creating obs_seq objects from scratch
             self.df = pd.DataFrame()
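Moving the default_composite_types assignment ahead of this early exit (the next hunk removes it from its old position) means objects created from scratch also carry the bundled composite_types.yaml path. A sketch of the from-scratch path, assuming the import path used above; the column data is illustrative:

.. code-block:: python

    import pandas as pd
    from pydartdiags.obs_sequence import obs_sequence as obsq

    empty = obsq.ObsSequence(file=None)   # early exit: empty DataFrame, nothing parsed
    print(empty.default_composite_types)  # now set even when no file was read
    empty.df = pd.DataFrame({"observation": [271.3], "type": ["ACARS_TEMPERATURE"]})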
@@ -161,9 +144,6 @@ class obs_sequence:
             self.all_obs = []
             return
 
-        module_dir = os.path.dirname(__file__)
-        self.default_composite_types = os.path.join(module_dir, "composite_types.yaml")
-
         if self.is_binary(file):
             self.header = self.read_binary_header(file)
         else:
@@ -204,12 +184,6 @@ class obs_sequence:
         }
         self.df = self.df.rename(columns=rename_dict)
 
-        # check if the assimilation info is present
-        if "prior_ensemble_mean".casefold() in map(str.casefold, self.columns):
-            self.has_assimilation_info = True
-        if "posterior_ensemble_mean".casefold() in map(str.casefold, self.columns):
-            self.has_posterior = True
-
     def create_all_obs(self):
         """steps through the generator to create a
         list of all observations in the sequence
@@ -232,7 +206,7 @@ class obs_sequence:
             data.append(float(location[0]))  # location x
             data.append(float(location[1]))  # location y
             data.append(float(location[2]))  # location z
-            data.append(obs_sequence.vert[int(location[3])])
+            data.append(ObsSequence.vert[int(location[3])])
             self.loc_mod = "loc3d"
         except ValueError:
             try:
@@ -261,9 +235,13 @@ class obs_sequence:
         time = obs[-2].split()
         data.append(int(time[0]))  # seconds
         data.append(int(time[1]))  # days
-        data.append(
-            convert_dart_time(int(time[0]), int(time[1]))
-        )  # datetime # HK todo what is approprate for 1d models?
+        if self.loc_mod == "loc3d":
+            data.append(convert_dart_time(int(time[0]), int(time[1])))
+        else:  # HK todo what is appropriate for 1d models?
+            data.append(
+                dt.datetime(2000, 1, 1)
+                + dt.timedelta(seconds=int(time[0]), days=int(time[1]))
+            )
         data.append(float(obs[-1]))  # obs error variance ?convert to sd?
 
         return data
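The time handling now depends on the location model: loc3d observations go through convert_dart_time, which counts days and seconds from the DART Gregorian origin of 1601-01-01, while loc1d observations are anchored at 2000-01-01, as the new class docstring states. A sketch of the two conversions (the 1601 arithmetic is an assumption based on that docstring; convert_dart_time itself is defined at module level further down):

.. code-block:: python

    import datetime as dt

    seconds, days = 43200, 151240   # an illustrative DART (seconds, days) stamp

    # loc3d: DART Gregorian calendar, origin 1601-01-01
    t3d = dt.datetime(1601, 1, 1) + dt.timedelta(days=days, seconds=seconds)

    # loc1d: epoch 2000-01-01, matching the else-branch above
    t1d = dt.datetime(2000, 1, 1) + dt.timedelta(days=days, seconds=seconds)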
@@ -355,20 +333,13 @@ class obs_sequence:
             obsq.write_obs_seq('obs_seq.new')
 
         """
-        with open(file, "w") as f:
 
-            # If a DataFrame is provided, update the header with the number of observations
-            num_rows = len(self.df)
-            replacement_string = f"num_obs: {num_rows:>10} max_num_obs: {num_rows:>10}"
-            new_header = [
-                replacement_string if "num_obs" in element else element
-                for element in self.header
-            ]
+        self.create_header_from_dataframe()
 
-            for line in new_header[:-1]:
+        with open(file, "w") as f:
+
+            for line in self.header:
                 f.write(str(line) + "\n")
-            first = 1
-            f.write(f"first: {first:>12} last: {num_rows:>12}\n")
 
             # TODO HK is there something better than copying the whole thing here?
             df_copy = self.df.copy()  # copy since you want to change for writing.
@@ -376,15 +347,24 @@ class obs_sequence:
             if self.loc_mod == "loc3d":
                 df_copy["longitude"] = np.deg2rad(self.df["longitude"]).round(16)
                 df_copy["latitude"] = np.deg2rad(self.df["latitude"]).round(16)
-            if "bias" in df_copy.columns:
-                df_copy = df_copy.drop(columns=["bias", "sq_err"])
+            if "prior_bias" in df_copy.columns:
+                df_copy = df_copy.drop(
+                    columns=["prior_bias", "prior_sq_err", "prior_totalvar"]
+                )
+            if "posterior_bias" in df_copy.columns:
+                df_copy = df_copy.drop(
+                    columns=["posterior_bias", "posterior_sq_err", "posterior_totalvar"]
+                )
+            if "midpoint" in df_copy.columns:
+                df_copy = df_copy.drop(columns=["midpoint", "vlevels"])
 
             # linked list for reading by dart programs
             df_copy = df_copy.sort_values(
                 by=["time"], kind="stable"
             )  # sort the DataFrame by time
-            df_copy["obs_num"] = self.df.index + 1  # obs_num in time order
-            df_copy["linked_list"] = obs_sequence.generate_linked_list_pattern(
+            df_copy.reset_index(drop=True, inplace=True)
+            df_copy["obs_num"] = df_copy.index + 1  # obs_num in time order
+            df_copy["linked_list"] = ObsSequence.generate_linked_list_pattern(
                 len(df_copy)
             )  # linked list pattern
 
@@ -395,6 +375,97 @@ class obs_sequence:
 
             df_copy.apply(write_row, axis=1)
 
+    @staticmethod
+    def update_types_dicts(df, reverse_types):
+        """
+        Ensure all unique observation types are in the reverse_types dictionary and create
+        the types dictionary.
+
+        Args:
+            df (pd.DataFrame): The DataFrame containing the observation sequence data.
+            reverse_types (dict): The dictionary mapping observation types to their corresponding integer values.
+
+        Returns:
+            dict: The updated reverse_types dictionary.
+            dict: The types dictionary with keys sorted in numerical order.
+        """
+        # Create a dictionary of observation types from the dataframe
+        unique_types = df["type"].unique()
+
+        # Ensure all unique types are in reverse_types
+        for obs_type in unique_types:
+            if obs_type not in reverse_types:
+                new_id = int(max(reverse_types.values(), default=0)) + 1
+                reverse_types[obs_type] = str(new_id)
+
+        not_sorted_types = {
+            reverse_types[obs_type]: obs_type for obs_type in unique_types
+        }
+        types = {
+            k: not_sorted_types[k] for k in sorted(not_sorted_types)
+        }  # to get keys in numerical order
+
+        return reverse_types, types
+
+    def create_header_from_dataframe(self):
+        """
+        Create a header for the observation sequence based on the data in the DataFrame.
+
+        It creates a dictionary of unique observation types, counts the
+        number of observations, and constructs the header with necessary information.
+
+        Example:
+            self.create_header_from_dataframe()
+
+        """
+
+        self.reverse_types, self.types = self.update_types_dicts(
+            self.df, self.reverse_types
+        )
+
+        num_obs = len(self.df)
+
+        self.header = []
+        self.header.append("obs_sequence")
+        self.header.append("obs_type_definitions")
+        self.header.append(f"{len(self.types)}")
+        for key, value in self.types.items():
+            self.header.append(f"{key} {value}")
+        self.header.append(
+            f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}"
+        )  # @todo HK not keeping track if num_qc changes
+        self.header.append(f"num_obs: {num_obs:>10} max_num_obs: {num_obs:>10}")
+        stats_cols = [
+            "prior_bias",
+            "prior_sq_err",
+            "prior_totalvar",
+            "posterior_bias",
+            "posterior_sq_err",
+            "posterior_totalvar",
+        ]
+        level_cols = ["vlevels", "midpoint"]
+        non_copie_cols = [
+            "obs_num",
+            "linked_list",
+            "longitude",
+            "latitude",
+            "vertical",
+            "vert_unit",
+            "type",
+            "metadata",
+            "external_FO",
+            "seconds",
+            "days",
+            "time",
+            "obs_err_var",
+            "location",
+        ]
+        for copie in self.df.columns:
+            if copie not in stats_cols + non_copie_cols + level_cols:
+                self.header.append(copie.replace("_", " "))
+        first = 1
+        self.header.append(f"first: {first:>12} last: {num_obs:>12}")
+
     def column_headers(self):
         """define the columns for the dataframe"""
         heading = []
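The new update_types_dicts keeps the type tables consistent with the DataFrame: any observation type missing from reverse_types is assigned max(existing ids) + 1, and the id-to-name types dict is rebuilt with keys in numerical order. A small standalone sketch, assuming the same import path as above:

.. code-block:: python

    import pandas as pd
    from pydartdiags.obs_sequence import obs_sequence as obsq

    df = pd.DataFrame({"type": ["ACARS_TEMPERATURE", "NEW_OBS_TYPE"]})
    reverse_types = {"ACARS_TEMPERATURE": "23"}

    reverse_types, types = obsq.ObsSequence.update_types_dicts(df, reverse_types)
    # reverse_types -> {'ACARS_TEMPERATURE': '23', 'NEW_OBS_TYPE': '24'}
    # types         -> {'23': 'ACARS_TEMPERATURE', '24': 'NEW_OBS_TYPE'}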
@@ -440,14 +511,18 @@ class obs_sequence:
         return self.df[self.df["DART_quality_control"] == dart_qc]
 
     @requires_assimilation_info
-    def select_failed_qcs(self):
+    def select_used_qcs(self):
         """
-        Select rows from the DataFrame where the DART quality control flag is greater than 0.
+        Select rows from the DataFrame where the observation was used.
+        Includes observations for which the posterior forward observation operators failed.
 
         Returns:
-            pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag greater than 0.
+            pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag 0 or 2.
         """
-        return self.df[self.df["DART_quality_control"] > 0]
+        return self.df[
+            (self.df["DART_quality_control"] == 0)
+            | (self.df["DART_quality_control"] == 2)
+        ]
 
     @requires_assimilation_info
     def possible_vs_used(self):
@@ -456,7 +531,7 @@ class obs_sequence:
 
         This function takes a DataFrame containing observation data, including a 'type' column for the observation
         type and an 'observation' column. The number of used observations ('used'), is the total number
-        minus the observations that failed quality control checks (as determined by the `select_failed_qcs` function).
+        of assimilated observations (as determined by the `select_used_qcs` function).
         The result is a DataFrame with each observation type, the count of possible observations, and the count of
         used observations.
 
@@ -468,8 +543,8 @@ class obs_sequence:
         possible = self.df.groupby("type")["observation"].count()
         possible.rename("possible", inplace=True)
 
-        failed_qcs = self.select_failed_qcs().groupby("type")["observation"].count()
-        used = possible - failed_qcs.reindex(possible.index, fill_value=0)
+        used_qcs = self.select_used_qcs().groupby("type")["observation"].count()
+        used = used_qcs.reindex(possible.index, fill_value=0)
         used.rename("used", inplace=True)
 
         return pd.concat([possible, used], axis=1).reset_index()
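This changes the counting, not just the names: 0.5.0 computed used as possible minus every observation with DART QC > 0, which excluded QC 2 (posterior forward operator failed); 0.6.0 counts QC 0 and 2 directly through select_used_qcs. A toy comparison:

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({
        "type": ["ACARS_TEMPERATURE"] * 4,
        "observation": [1.0, 2.0, 3.0, 4.0],
        "DART_quality_control": [0, 2, 4, 7],
    })

    used_old = len(df) - (df["DART_quality_control"] > 0).sum()  # 1
    used_new = df["DART_quality_control"].isin([0, 2]).sum()     # 2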
@@ -511,7 +586,7 @@ class obs_sequence:
         with open(file, "rb") as f:
             while True:
                 # Read the record length
-                record_length = obs_sequence.read_record_length(f)
+                record_length = ObsSequence.read_record_length(f)
                 if record_length is None:
                     break
                 record = f.read(record_length)
@@ -519,7 +594,7 @@ class obs_sequence:
                     break
 
                 # Read the trailing record length (should match the leading one)
-                obs_sequence.check_trailing_record_length(f, record_length)
+                ObsSequence.check_trailing_record_length(f, record_length)
 
                 linecount += 1
 
@@ -537,7 +612,7 @@ class obs_sequence:
         f.seek(0)
 
         for _ in range(2):
-            record_length = obs_sequence.read_record_length(f)
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:
                 break
 
@@ -545,7 +620,7 @@ class obs_sequence:
             if not record:  # end of file
                 break
 
-            obs_sequence.check_trailing_record_length(f, record_length)
+            ObsSequence.check_trailing_record_length(f, record_length)
             header.append(record.decode("utf-8").strip())
 
         header.append(str(obs_types_definitions))
@@ -553,7 +628,7 @@ class obs_sequence:
         # obs_types_definitions
         for _ in range(3, 4 + obs_types_definitions):
             # Read the record length
-            record_length = obs_sequence.read_record_length(f)
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:
                 break
 
@@ -562,7 +637,7 @@ class obs_sequence:
             if not record:  # end of file
                 break
 
-            obs_sequence.check_trailing_record_length(f, record_length)
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             if _ == 3:
                 continue  # num obs_types_definitions
@@ -580,7 +655,7 @@ class obs_sequence:
             5 + obs_types_definitions + num_copies + num_qcs + 1,
         ):
             # Read the record length
-            record_length = obs_sequence.read_record_length(f)
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:
                 break
 
@@ -589,7 +664,7 @@ class obs_sequence:
             if not record:
                 break
 
-            obs_sequence.check_trailing_record_length(f, record_length)
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             if _ == 5 + obs_types_definitions:
                 continue
@@ -600,12 +675,12 @@ class obs_sequence:
 
         # first and last obs
         # Read the record length
-        record_length = obs_sequence.read_record_length(f)
+        record_length = ObsSequence.read_record_length(f)
 
         # Read the actual record
         record = f.read(record_length)
 
-        obs_sequence.check_trailing_record_length(f, record_length)
+        ObsSequence.check_trailing_record_length(f, record_length)
 
         # Read the whole record as a two integers
         first, last = struct.unpack("ii", record)[:8]
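All of these read_record_length / check_trailing_record_length call sites walk Fortran unformatted sequential records: a 4-byte length, the payload, then the same 4-byte length repeated. A plausible reconstruction of that framing, not the package's exact helpers:

.. code-block:: python

    import struct

    def read_fortran_record(f):
        """Read one Fortran unformatted record: length, payload, length."""
        head = f.read(4)
        if len(head) < 4:
            return None                    # end of file
        (n,) = struct.unpack("i", head)
        payload = f.read(n)
        (tail,) = struct.unpack("i", f.read(4))
        assert tail == n, "trailing record length must match the leading one"
        return payload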
@@ -730,7 +805,7 @@ class obs_sequence:
         # Skip the first len(obs_seq.header) lines
         for _ in range(header_length - 1):
             # Read the record length
-            record_length = obs_sequence.read_record_length(f)
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:  # End of file
                 break
 
@@ -747,7 +822,7 @@ class obs_sequence:
             obs.append(f"OBS {obs_num}")
             for _ in range(n):  # number of copies
                 # Read the record length
-                record_length = obs_sequence.read_record_length(f)
+                record_length = ObsSequence.read_record_length(f)
                 if record_length is None:
                     break
                 # Read the actual record (copie)
@@ -755,10 +830,10 @@ class obs_sequence:
                 obs.append(struct.unpack("d", record)[0])
 
                 # Read the trailing record length (should match the leading one)
-                obs_sequence.check_trailing_record_length(f, record_length)
+                ObsSequence.check_trailing_record_length(f, record_length)
 
             # linked list info
-            record_length = obs_sequence.read_record_length(f)
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:
                 break
 
@@ -767,17 +842,17 @@ class obs_sequence:
             linked_list_string = f"{int1:<12} {int2:<10} {int3:<12}"
             obs.append(linked_list_string)
 
-            obs_sequence.check_trailing_record_length(f, record_length)
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             # location (note no location header "loc3d" or "loc1d" for binary files)
             obs.append("loc3d")
-            record_length = obs_sequence.read_record_length(f)
+            record_length = ObsSequence.read_record_length(f)
             record = f.read(record_length)
             x, y, z, vert = struct.unpack("dddi", record[:28])
             location_string = f"{x} {y} {z} {vert}"
             obs.append(location_string)
 
-            obs_sequence.check_trailing_record_length(f, record_length)
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             # kind (type of observation) value
             obs.append("kind")
@@ -787,23 +862,23 @@ class obs_sequence:
             kind = f"{struct.unpack('i', record)[0]}"
             obs.append(kind)
 
-            obs_sequence.check_trailing_record_length(f, record_length)
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             # time (seconds, days)
-            record_length = obs_sequence.read_record_length(f)
+            record_length = ObsSequence.read_record_length(f)
             record = f.read(record_length)
             seconds, days = struct.unpack("ii", record)[:8]
             time_string = f"{seconds} {days}"
             obs.append(time_string)
 
-            obs_sequence.check_trailing_record_length(f, record_length)
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             # obs error variance
-            record_length = obs_sequence.read_record_length(f)
+            record_length = ObsSequence.read_record_length(f)
             record = f.read(record_length)
             obs.append(struct.unpack("d", record)[0])
 
-            obs_sequence.check_trailing_record_length(f, record_length)
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             yield obs
 
@@ -816,7 +891,8 @@ class obs_sequence:
         components and adds them to the DataFrame.
 
         Args:
-            composite_types (str, optional): The YAML configuration for composite types. If 'use_default', the default configuration is used. Otherwise, a custom YAML configuration can be provided.
+            composite_types (str, optional): The YAML configuration for composite types.
+                If 'use_default', the default configuration is used. Otherwise, a custom YAML configuration can be provided.
 
         Returns:
             pd.DataFrame: The updated DataFrame with the new composite rows added.
@@ -838,69 +914,68 @@ class obs_sequence:
         if len(components) != len(set(components)):
             raise Exception("There are repeat values in components.")
 
+        # data frame for the composite types
         df_comp = self.df[
             self.df["type"]
             .str.upper()
             .isin([component.upper() for component in components])
         ]
-        df_no_comp = self.df[
-            ~self.df["type"]
-            .str.upper()
-            .isin([component.upper() for component in components])
-        ]
 
+        df = pd.DataFrame()
         for key in self.composite_types_dict:
             df_new = construct_composit(
                 df_comp, key, self.composite_types_dict[key]["components"]
             )
-            df_no_comp = pd.concat([df_no_comp, df_new], axis=0)
+            df = pd.concat([df, df_new], axis=0)
 
-        return df_no_comp
+        # add the composite types to the DataFrame
+        self.df = pd.concat([self.df, df], axis=0)
+        return
 
     @classmethod
     def join(cls, obs_sequences, copies=None):
         """
         Join a list of observation sequences together.
 
-        This method combines the headers and observations from a list of obs_sequence objects
-        into a single obs_sequence object.
+        This method combines the headers and observations from a list of ObsSequence objects
+        into a single ObsSequence object.
 
         Args:
-            obs_sequences (list of obs_sequences): The list of observation sequences objects to join.
+            obs_sequences (list of ObsSequences): The list of observation sequences objects to join.
             copies (list of str, optional): A list of copy names to include in the combined data.
                 If not provided, all copies are included.
 
         Returns:
-            A new obs_sequence object containing the combined data.
+            A new ObsSequence object containing the combined data.
 
         Example:
             .. code-block:: python
 
-                obs_seq1 = obs_sequence(file='obs_seq1.final')
-                obs_seq2 = obs_sequence(file='obs_seq2.final')
-                obs_seq3 = obs_sequence(file='obs_seq3.final')
-                combined = obs_sequence.join([obs_seq1, obs_seq2, obs_seq3])
+                obs_seq1 = ObsSequence(file='obs_seq1.final')
+                obs_seq2 = ObsSequence(file='obs_seq2.final')
+                obs_seq3 = ObsSequence(file='obs_seq3.final')
+                combined = ObsSequence.join([obs_seq1, obs_seq2, obs_seq3])
         """
         if not obs_sequences:
             raise ValueError("The list of observation sequences is empty.")
 
-        # Create a new obs_sequnece object with the combined data
+        # Create a new ObsSequence object with the combined data
         combo = cls(file=None)
 
         # Check if all obs_sequences have compatible attributes
         first_loc_mod = obs_sequences[0].loc_mod
-        first_has_assimilation_info = obs_sequences[0].has_assimilation_info
-        first_has_posterior = obs_sequences[0].has_posterior
+        first_has_assimilation_info = obs_sequences[0].has_assimilation_info()
+        first_has_posterior = obs_sequences[0].has_posterior()
         for obs_seq in obs_sequences:
             if obs_seq.loc_mod != first_loc_mod:
                 raise ValueError(
                     "All observation sequences must have the same loc_mod."
                 )
-            if obs_seq.has_assimilation_info != first_has_assimilation_info:
+            if obs_seq.has_assimilation_info() != first_has_assimilation_info:
                 raise ValueError(
                     "All observation sequences must have assimilation info."
                 )
-            if obs_seq.has_posterior != first_has_posterior:
+            if obs_seq.has_posterior() != first_has_posterior:
                 raise ValueError(
                     "All observation sequences must have the posterior info."
                 )
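Two caller-visible changes land in this hunk. The composite routine no longer returns a filtered copy: in 0.5.0 it returned a DataFrame with the component rows replaced by composites, while in 0.6.0 it appends the derived composite rows to self.df in place and returns None, keeping the original component rows. And join now probes assimilation and posterior information through the new methods. A hedged sketch of the 0.6.0 composite flow; the public method name composite_types is an assumption based on the argument shown in the docstring hunk above:

.. code-block:: python

    n_before = len(obs_seq.df)
    obs_seq.composite_types()              # 0.6.0: appends composite rows to obs_seq.df, returns None
    n_added = len(obs_seq.df) - n_before   # number of derived composite observations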
@@ -908,7 +983,7 @@ class obs_sequence:
         combo.loc_mod = first_loc_mod
 
         # check the copies are compatible (list of copies to combine?)
-        # subset of copies if needed
+        # subset of copies if needed  # @todo HK 1d or 3d
         if copies:
             start_required_columns = ["obs_num", "observation"]
             end_required_columns = [
@@ -1009,30 +1084,40 @@ class obs_sequence:
 
         # create linked list for obs
         combo.df = combined_df.sort_values(by="time").reset_index(drop=True)
-        combo.df["linked_list"] = obs_sequence.generate_linked_list_pattern(
+        combo.df["linked_list"] = ObsSequence.generate_linked_list_pattern(
            len(combo.df)
        )
        combo.df["obs_num"] = combined_df.index + 1
        combo.create_header(len(combo.df))
 
-        # set assimilation info (mean and spread) (prior and posterior)
-        combo.has_assimilation_info = "prior_ensemble_mean".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-        combo.has_assimilation_info = "prior_ensemble_spread".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-        combo.has_posterior = "posterior_ensemble_mean".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-        combo.has_posterior = "posterior_ensemble_spread".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-
        return combo
 
+    def has_assimilation_info(self):
+        """
+        Check if the DataFrame has prior information.
+
+        Returns:
+            bool: True if both 'prior_ensemble_mean' and 'prior_ensemble_spread' columns are present, False otherwise.
+        """
+        return "prior_ensemble_mean".casefold() in map(
+            str.casefold, self.df.columns
+        ) and "prior_ensemble_spread".casefold() in map(str.casefold, self.df.columns)
+
+    def has_posterior(self):
+        """
+        Check if the DataFrame has posterior information.
+
+        Returns:
+            bool: True if both 'posterior_ensemble_mean' and 'posterior_ensemble_spread' columns are present, False otherwise.
+        """
+        return "posterior_ensemble_mean".casefold() in map(
+            str.casefold, self.df.columns
+        ) and "posterior_ensemble_spread".casefold() in map(
+            str.casefold, self.df.columns
+        )
+
     def create_header(self, n):
-        """Create a header for the obs_seq file from the obs_sequence object."""
+        """Create a header for the obs_seq file from the ObsSequence object."""
         assert (
             self.n_copies == self.n_non_qc + self.n_qc
         ), "n_copies must be equal to n_non_qc + n_qc"
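The former has_assimilation_info / has_posterior boolean attributes are now predicates derived from the DataFrame on every call, so they stay correct when copies are added or dropped after construction. The check is case-insensitive via casefold and requires both the mean and spread columns:

.. code-block:: python

    # The predicate pattern used above: case-insensitive column membership.
    cols = ["observation", "Prior_ensemble_mean", "prior_ensemble_spread"]
    has_prior = (
        "prior_ensemble_mean".casefold() in map(str.casefold, cols)
        and "prior_ensemble_spread".casefold() in map(str.casefold, cols)
    )  # True: both columns present, regardless of case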
@@ -1065,7 +1150,7 @@ def load_yaml_to_dict(file_path):
             return yaml.safe_load(file)
     except Exception as e:
         print(f"Error loading YAML file: {e}")
-        return None
+        raise
 
 
 def convert_dart_time(seconds, days):
@@ -1093,17 +1178,39 @@ def construct_composit(df_comp, composite, components):
         components (list of str): A list containing the type names of the two components to be combined.
 
     Returns:
-        merged_df (pd.DataFrame): The updated DataFrame with the new composite rows added.
+        merged_df (pd.DataFrame): A DataFrame containing the new composite rows.
     """
     selected_rows = df_comp[df_comp["type"] == components[0].upper()]
     selected_rows_v = df_comp[df_comp["type"] == components[1].upper()]
 
-    columns_to_combine = df_comp.filter(regex="ensemble").columns.tolist()
-    columns_to_combine.append("observation")  # TODO HK: bias, sq_err, obs_err_var
+    prior_columns_to_combine = df_comp.filter(regex="prior_ensemble").columns.tolist()
+    posterior_columns_to_combine = df_comp.filter(
+        regex="posterior_ensemble"
+    ).columns.tolist()
+    columns_to_combine = (
+        prior_columns_to_combine
+        + posterior_columns_to_combine
+        + ["observation", "obs_err_var"]
+    )
     merge_columns = ["latitude", "longitude", "vertical", "time"]
-
-    print("duplicates in u: ", selected_rows[merge_columns].duplicated().sum())
-    print("duplicates in v: ", selected_rows_v[merge_columns].duplicated().sum())
+    same_obs_columns = merge_columns + [
+        "observation",
+        "obs_err_var",
+    ]  # same observation is duplicated
+
+    if (
+        selected_rows[same_obs_columns].duplicated().sum() > 0
+        or selected_rows_v[same_obs_columns].duplicated().sum() > 0
+    ):
+        print(
+            f"{selected_rows[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
+        )
+        print(f"{selected_rows[same_obs_columns]}")
+        print(
+            f"{selected_rows_v[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
+        )
+        print(f"{selected_rows_v[same_obs_columns]}")
+        raise Exception("There are duplicates in the components.")
 
     # Merge the two DataFrames on location and time columns
     merged_df = pd.merge(