pydartdiags 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydartdiags/obs_sequence/obs_sequence.py +310 -158
- pydartdiags/stats/stats.py +93 -15
- {pydartdiags-0.5.1.dist-info → pydartdiags-0.6.1.dist-info}/METADATA +2 -2
- {pydartdiags-0.5.1.dist-info → pydartdiags-0.6.1.dist-info}/RECORD +7 -7
- {pydartdiags-0.5.1.dist-info → pydartdiags-0.6.1.dist-info}/WHEEL +1 -1
- {pydartdiags-0.5.1.dist-info → pydartdiags-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {pydartdiags-0.5.1.dist-info → pydartdiags-0.6.1.dist-info}/top_level.txt +0 -0
pydartdiags/obs_sequence/obs_sequence.py CHANGED

@@ -19,17 +19,46 @@ def requires_assimilation_info(func):
         return wrapper
 
 
-class obs_sequence:
+class ObsSequence:
     """
-    Initialize an obs_sequence object from an ASCII or binary observation sequence file,
-    or create an empty obs_sequence object from scratch.
+    Initialize an ObsSequence object from an ASCII or binary observation sequence file,
+    or create an empty ObsSequence object from scratch.
+
+    1D observations are given a datetime of days, seconds since 2000-01-01 00:00:00
+
+    3D observations are given a datetime of days, seconds since 1601-01-01 00:00:00 (DART Gregorian calendar)
 
     Args:
         file (str): The input observation sequence ASCII or binary file.
-
+            If None, an empty ObsSequence object is created from scratch.
+        synonyms (list, optional): List of additional synonyms for the observation column in the DataFrame.
+            The default list is
+
+            .. code-block:: python
+
+                ['NCEP BUFR observation',
+                 'AIRS observation',
+                 'GTSPP observation',
+                 'SST observation',
+                 'observations',
+                 'WOD observation']
+
+            You can add more synonyms by providing a list of strings when
+            creating the ObsSequence object.
+
+            .. code-block:: python
+
+                ObsSequence(file, synonyms=['synonym1', 'synonym2'])
+
+    Raises:
+        ValueError: If neither 'loc3d' nor 'loc1d' could be found in the observation sequence.
+
+    Examples:
+
+        .. code-block:: python
+
+            obs_seq = ObsSequence(file='obs_seq.final')
 
-    Returns:
-        An obs_sequence object
 
     Attributes:
         df (pandas.DataFrame): The DataFrame containing the observation sequence data.
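Reviewer note: the renamed class keeps the same construction API. A minimal usage sketch based on the docstring above; the file names are placeholders, and the import path follows the module layout in this wheel's RECORD.

.. code-block:: python

    from pydartdiags.obs_sequence.obs_sequence import ObsSequence

    obs_seq = ObsSequence(file='obs_seq.final')   # parse an ASCII or binary obs_seq file
    empty = ObsSequence(file=None)                # build an empty sequence from scratch
    custom = ObsSequence('obs_seq.final',
                         synonyms=['MY observation'])  # extra names for the observation column
    print(obs_seq.df.head())                      # observations land in a pandas DataFrame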
@@ -54,34 +83,18 @@ class obs_sequence:
         - scale height: 'VERTISSCALEHEIGHT' (unitless)
         loc_mod (str): The location model, either 'loc3d' or 'loc1d'.
             For 3D sphere models: latitude and longitude are in degrees in the DataFrame.
-        types (dict): Dictionary of types of observations the observation sequence,
+        types (dict): Dictionary of types of observations in the observation sequence,
             e.g. {23: 'ACARS_TEMPERATURE'},
         reverse_types (dict): Dictionary of types with keys and values reversed, e.g
             {'ACARS_TEMPERATURE': 23}
         synonyms_for_obs (list): List of synonyms for the observation column in the DataFrame.
-            The default list is
-
-            .. code-block:: python
-
-                [ 'NCEP BUFR observation',
-                  'AIRS observation',
-                  'GTSPP observation',
-                  'SST observation',
-                  'observations',
-                  'WOD observation']
 
-            You can add more synonyms by providing a list of strings when
-            creating the obs_sequence object.
-
-            .. code-block:: python
-
-                obs_sequence(file, synonyms=['synonym1', 'synonym2']).df
 
         seq (generator): Generator of observations from the observation sequence file.
         all_obs (list): List of all observations, each observation is a list.
-            Valid when the obs_sequence is created from a file.
-            Set to None when the obs_sequence is created from scratch or multiple
-            obs_sequences are joined.
+            Valid when the ObsSequence is created from a file.
+            Set to None when the ObsSequence is created from scratch or multiple
+            ObsSequences are joined.
     """
 
     vert = {
@@ -96,27 +109,6 @@ class obs_sequence:
     reversed_vert = {value: key for key, value in vert.items()}
 
     def __init__(self, file, synonyms=None):
-        """
-        Create an obs_sequence object from an ASCII or binary observation sequence file,
-        or create an empty obs_sequence object from scratch.
-
-        Args:
-            file (str): The input observation sequence ASCII or binary file.
-                If None, an empty obs_sequence object is created from scratch.
-            synonyms (list, optional): List of synonyms for the observation column in the DataFrame.
-
-        Returns:
-            an obs_sequence object
-            1D observations are given a datetime of days, seconds since 2000-01-01 00:00:00
-            3D observations are given a datetime of days, seconds since 1601-01-01 00:00:00 (DART Gregorian calendar)
-
-        Examples:
-
-            .. code-block:: python
-
-                obs_seq = obs_sequence(file='obs_seq.final')
-
-        """
 
         self.loc_mod = "None"
         self.file = file
@@ -192,6 +184,14 @@ class obs_sequence:
             }
             self.df = self.df.rename(columns=rename_dict)
 
+            if self.is_binary(file):
+                # binary files do not have "OBS X" in, so set linked list from df.
+                self.update_attributes_from_df()
+
+            # Replace MISSING_R8s with NaNs in posterior stats where DART_quality_control = 2
+            if self.has_posterior():
+                ObsSequence.replace_qc2_nan(self.df)
+
     def create_all_obs(self):
         """steps through the generator to create a
         list of all observations in the sequence
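Reviewer note: the motivation for the new replace_qc2_nan step is that DART's MISSING_R8 sentinel (-888888.0, per the revert_qc2_nan hunk later in this file) silently skews pandas aggregations, while NaN is skipped. A self-contained demonstration:

.. code-block:: python

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "DART_quality_control": [0.0, 2.0, 0.0],
        "posterior_ensemble_mean": [1.2, -888888.0, 0.8],
    })
    print(df["posterior_ensemble_mean"].mean())  # -296295.33..., skewed by the sentinel

    # the same masking replace_qc2_nan applies to QC=2 rows
    df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_mean"] = np.nan
    print(df["posterior_ensemble_mean"].mean())  # 1.0, pandas skips NaN by default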
@@ -205,7 +205,7 @@ class obs_sequence:
     def obs_to_list(self, obs):
         """put single observation into a list"""
         data = []
-        data.append(obs[0].split()[1])  # obs_num
+        data.append(int(obs[0].split()[1]))  # obs_num
         data.extend(list(map(float, obs[1 : self.n_copies + 1])))  # all the copies
         data.append(obs[self.n_copies + 1])  # linked list info
         try:  # HK todo only have to check loc3d or loc1d for the first observation, the whole file is the same
@@ -214,7 +214,7 @@ class obs_sequence:
             data.append(float(location[0]))  # location x
             data.append(float(location[1]))  # location y
             data.append(float(location[2]))  # location z
-            data.append(obs_sequence.vert[int(location[3])])
+            data.append(ObsSequence.vert[int(location[3])])
             self.loc_mod = "loc3d"
         except ValueError:
             try:
@@ -227,9 +227,9 @@ class obs_sequence:
                     "Neither 'loc3d' nor 'loc1d' could be found in the observation sequence."
                 )
         typeI = obs.index("kind")  # type of observation
-        type_value = obs[typeI + 1]
-        if int(type_value) < 0:
-            data.append(int(type_value))
+        type_value = int(obs[typeI + 1])
+        if type_value < 0:
+            data.append(type_value)
         else:
             data.append(self.types[type_value])  # observation type
@@ -291,14 +291,22 @@ class obs_sequence:
                 + str(self.reversed_vert[data[self.n_copies + 5]])
             )  # location x, y, z, vert
             obs.append("kind")  # this is type of observation
-            obs.append(self.reverse_types[data[self.n_copies + 6]])  # observation type
+            obs_type = data[self.n_copies + 6]
+            if isinstance(obs_type, str):
+                obs.append(self.reverse_types[obs_type])  # observation type
+            else:
+                obs.append(obs_type)  # Identity obs negative integer
             # Convert metadata to a string and append !HK @todo you are not converting to string
             obs.extend(data[self.n_copies + 7])  # metadata
             obs.extend(data[self.n_copies + 8])  # external forward operator
         elif self.loc_mod == "loc1d":
             obs.append(data[self.n_copies + 2])  # 1d location
             obs.append("kind")  # this is type of observation
-            obs.append(self.reverse_types[data[self.n_copies + 3]])  # observation type
+            obs_type = data[self.n_copies + 3]
+            if isinstance(obs_type, str):
+                obs.append(self.reverse_types[obs_type])  # observation type
+            else:
+                obs.append(obs_type)  # Identity obs negative integer
             obs.extend(data[self.n_copies + 4])  # metadata
             obs.extend(data[self.n_copies + 5])  # external forward operator
             obs.append(" ".join(map(str, data[-4:-2])))  # seconds, days
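Reviewer note: DART identity observations are encoded as negative integers rather than named types, which is why the round-trip now branches on isinstance. A toy illustration of the lookup added above:

.. code-block:: python

    reverse_types = {"ACARS_TEMPERATURE": 23}

    for obs_type in ["ACARS_TEMPERATURE", -5]:
        if isinstance(obs_type, str):
            print(reverse_types[obs_type])  # named type -> numeric id (prints 23)
        else:
            print(obs_type)                 # identity obs passes through unchanged (prints -5)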
@@ -324,14 +332,17 @@ class obs_sequence:
 
         This function writes the observation sequence stored in the obs_seq.DataFrame to a specified file.
         It updates the header with the number of observations, converts coordinates back to radians
-        if necessary, drops unnecessary columns, sorts the DataFrame by time, and generates a linked
-        list pattern for reading by DART programs.
+        if necessary, reverts NaNs back to MISSING_R8 for observations with QC=2, drops unnecessary
+        columns, sorts the DataFrame by time, and generates a linked list pattern for reading by DART
+        programs.
 
         Args:
             file (str): The path to the file where the observation sequence will be written.
 
         Notes:
             - Longitude and latitude are converted back to radians if the location model is 'loc3d'.
+            - The replacement of MISSING_R8 values with NaNs for any obs that failed the posterior
+              forward observation operators (QC2) is reverted.
             - The 'bias' and 'sq_err' columns are dropped if they exist in the DataFrame.
             - The DataFrame is sorted by the 'time' column.
             - An 'obs_num' column is added to the DataFrame to number the observations in time order.
@@ -342,7 +353,8 @@ class obs_sequence:
 
         """
 
-
+        # Update attributes, header, and linked list from dataframe
+        self.update_attributes_from_df()
 
         with open(file, "w") as f:
 
@@ -366,15 +378,9 @@ class obs_sequence:
         if "midpoint" in df_copy.columns:
             df_copy = df_copy.drop(columns=["midpoint", "vlevels"])
 
-        #
-
-
-        )  # sort the DataFrame by time
-        df_copy.reset_index(drop=True, inplace=True)
-        df_copy["obs_num"] = df_copy.index + 1  # obs_num in time order
-        df_copy["linked_list"] = obs_sequence.generate_linked_list_pattern(
-            len(df_copy)
-        )  # linked list pattern
+        # Revert NaNs back to MISSING_R8s
+        if self.has_posterior():
+            ObsSequence.revert_qc2_nan(df_copy)
 
         def write_row(row):
             ob_write = self.list_to_obs(row.tolist())
|
|
|
398
404
|
dict: The types dictionary with keys sorted in numerical order.
|
|
399
405
|
"""
|
|
400
406
|
# Create a dictionary of observation types from the dataframe
|
|
401
|
-
|
|
407
|
+
# Ignore Identity obs (negative integers)
|
|
408
|
+
unique_types = df.loc[
|
|
409
|
+
df["type"].apply(lambda x: isinstance(x, str)), "type"
|
|
410
|
+
].unique()
|
|
402
411
|
|
|
403
412
|
# Ensure all unique types are in reverse_types
|
|
404
413
|
for obs_type in unique_types:
|
|
405
414
|
if obs_type not in reverse_types:
|
|
406
|
-
new_id =
|
|
407
|
-
reverse_types[obs_type] =
|
|
415
|
+
new_id = max(reverse_types.values(), default=0) + 1
|
|
416
|
+
reverse_types[obs_type] = new_id
|
|
408
417
|
|
|
409
418
|
not_sorted_types = {
|
|
410
419
|
reverse_types[obs_type]: obs_type for obs_type in unique_types
|
|
@@ -439,9 +448,7 @@ class obs_sequence:
|
|
|
439
448
|
self.header.append(f"{len(self.types)}")
|
|
440
449
|
for key, value in self.types.items():
|
|
441
450
|
self.header.append(f"{key} {value}")
|
|
442
|
-
self.header.append(
|
|
443
|
-
f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}"
|
|
444
|
-
) # @todo HK not keeping track if num_qc changes
|
|
451
|
+
self.header.append(f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}")
|
|
445
452
|
self.header.append(f"num_obs: {num_obs:>10} max_num_obs: {num_obs:>10}")
|
|
446
453
|
stats_cols = [
|
|
447
454
|
"prior_bias",
|
|
@@ -594,7 +601,7 @@ class obs_sequence:
|
|
|
594
601
|
with open(file, "rb") as f:
|
|
595
602
|
while True:
|
|
596
603
|
# Read the record length
|
|
597
|
-
record_length =
|
|
604
|
+
record_length = ObsSequence.read_record_length(f)
|
|
598
605
|
if record_length is None:
|
|
599
606
|
break
|
|
600
607
|
record = f.read(record_length)
|
|
@@ -602,7 +609,7 @@ class obs_sequence:
                     break
 
                 # Read the trailing record length (should match the leading one)
-
+                ObsSequence.check_trailing_record_length(f, record_length)
 
                 linecount += 1
 
@@ -620,7 +627,7 @@ class obs_sequence:
         f.seek(0)
 
         for _ in range(2):
-            record_length =
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:
                 break
 
@@ -628,7 +635,7 @@ class obs_sequence:
             if not record:  # end of file
                 break
 
-
+            ObsSequence.check_trailing_record_length(f, record_length)
             header.append(record.decode("utf-8").strip())
 
         header.append(str(obs_types_definitions))
@@ -636,7 +643,7 @@ class obs_sequence:
         # obs_types_definitions
         for _ in range(3, 4 + obs_types_definitions):
             # Read the record length
-            record_length =
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:
                 break
 
@@ -645,7 +652,7 @@ class obs_sequence:
             if not record:  # end of file
                 break
 
-
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             if _ == 3:
                 continue  # num obs_types_definitions
@@ -663,7 +670,7 @@ class obs_sequence:
             5 + obs_types_definitions + num_copies + num_qcs + 1,
         ):
             # Read the record length
-            record_length =
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:
                 break
 
@@ -672,7 +679,7 @@ class obs_sequence:
             if not record:
                 break
 
-
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             if _ == 5 + obs_types_definitions:
                 continue
@@ -683,12 +690,12 @@ class obs_sequence:
 
         # first and last obs
         # Read the record length
-        record_length =
+        record_length = ObsSequence.read_record_length(f)
 
         # Read the actual record
         record = f.read(record_length)
 
-
+        ObsSequence.check_trailing_record_length(f, record_length)
 
         # Read the whole record as a two integers
         first, last = struct.unpack("ii", record)[:8]
@@ -700,7 +707,8 @@ class obs_sequence:
     def collect_obs_types(header):
         """Create a dictionary for the observation types in the obs_seq header"""
         num_obs_types = int(header[2])
-        types
+        # The first line containing obs types is the 4th line in an obs_seq file.
+        types = {int(x.split()[0]): x.split()[1] for x in header[3 : num_obs_types + 3]}
         return types
 
     @staticmethod
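Reviewer note: the new dict comprehension slices the id/name pairs straight out of the header. A standalone run with an illustrative header (only the indexing matters; the surrounding lines are placeholders):

.. code-block:: python

    header = [
        "obs_sequence",          # placeholder line 1
        "obs_kind_definitions",  # placeholder line 2
        "2",                     # number of obs type definitions
        "5 RADIOSONDE_TEMPERATURE",
        "23 ACARS_TEMPERATURE",
        "num_copies: 1 num_qc: 1",
    ]
    num_obs_types = int(header[2])
    types = {int(x.split()[0]): x.split()[1] for x in header[3 : num_obs_types + 3]}
    print(types)  # {5: 'RADIOSONDE_TEMPERATURE', 23: 'ACARS_TEMPERATURE'}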
@@ -813,7 +821,7 @@ class obs_sequence:
         # Skip the first len(obs_seq.header) lines
         for _ in range(header_length - 1):
             # Read the record length
-            record_length =
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:  # End of file
                 break
 
@@ -830,7 +838,7 @@ class obs_sequence:
             obs.append(f"OBS {obs_num}")
             for _ in range(n):  # number of copies
                 # Read the record length
-                record_length =
+                record_length = ObsSequence.read_record_length(f)
                 if record_length is None:
                     break
                 # Read the actual record (copie)
@@ -838,10 +846,10 @@ class obs_sequence:
                 obs.append(struct.unpack("d", record)[0])
 
                 # Read the trailing record length (should match the leading one)
-
+                ObsSequence.check_trailing_record_length(f, record_length)
 
             # linked list info
-            record_length =
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:
                 break
 
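Reviewer note: these reads follow Fortran unformatted sequential framing, where every record is sandwiched between two identical 4-byte length markers. The helpers' bodies are not part of this diff; the sketch below is an assumption consistent with how they are called (the inline struct.unpack removed in the next hunk confirms the mechanism):

.. code-block:: python

    import struct

    def read_record_length(f):
        """Read the leading 4-byte marker of a Fortran unformatted record; None at EOF."""
        record_length_bytes = f.read(4)
        if len(record_length_bytes) < 4:
            return None
        return struct.unpack("i", record_length_bytes)[0]

    def check_trailing_record_length(f, record_length):
        """Consume the trailing marker and verify it matches the leading one."""
        trailing = struct.unpack("i", f.read(4))[0]
        if trailing != record_length:
            raise ValueError("corrupt record: trailing length does not match leading length")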
@@ -850,63 +858,94 @@ class obs_sequence:
             linked_list_string = f"{int1:<12} {int2:<10} {int3:<12}"
             obs.append(linked_list_string)
 
-
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             # location (note no location header "loc3d" or "loc1d" for binary files)
             obs.append("loc3d")
-            record_length =
+            record_length = ObsSequence.read_record_length(f)
             record = f.read(record_length)
             x, y, z, vert = struct.unpack("dddi", record[:28])
             location_string = f"{x} {y} {z} {vert}"
             obs.append(location_string)
 
-
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             # kind (type of observation) value
             obs.append("kind")
-
-            record_length = struct.unpack("i", record_length_bytes)[0]
+            record_length = ObsSequence.read_record_length(f)
             record = f.read(record_length)
             kind = f"{struct.unpack('i', record)[0]}"
             obs.append(kind)
 
-
+            ObsSequence.check_trailing_record_length(f, record_length)
+
+            # Skip metadata (obs_def) and go directly to the time record
+            while True:
+                pos = f.tell()
+                record_length = ObsSequence.read_record_length(f)
+                if record_length is None:
+                    break  # End of file
+
+                record = f.read(record_length)
+                # Check if this record is likely the "time" record (8 bytes, can be unpacked as two ints)
+                if record_length == 8:
+                    try:
+                        seconds, days = struct.unpack("ii", record)
+                        # If unpack succeeds, this is the time record
+                        f.seek(pos)  # Seek back so the main loop can process it
+                        break
+                    except struct.error:
+                        pass  # Not the time record, keep skipping
+
+                ObsSequence.check_trailing_record_length(f, record_length)
 
             # time (seconds, days)
-            record_length =
+            record_length = ObsSequence.read_record_length(f)
             record = f.read(record_length)
-
+            try:  # This is incase the record is not the time record because of metadata funkyness
+                seconds, days = struct.unpack("ii", record)
+            except struct.error as e:
+                print(
+                    f"Reading observation {obs_num}... record length: {record_length} kind {kind}"
+                )
+                print(f"")
+                print(f"Error unpacking seconds and days: {e}")
+                raise
             time_string = f"{seconds} {days}"
             obs.append(time_string)
 
-
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             # obs error variance
-            record_length =
+            record_length = ObsSequence.read_record_length(f)
             record = f.read(record_length)
             obs.append(struct.unpack("d", record)[0])
 
-
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             yield obs
 
-    def composite_types(self, composite_types="use_default"):
+    def composite_types(self, composite_types="use_default", raise_on_duplicate=False):
         """
-        Set up and construct composite types for the DataFrame.
+        Set up and construct composite observation types for the DataFrame.
 
-        This function sets up composite types based on a provided YAML configuration or
+        This function sets up composite observation types based on a provided YAML configuration or
         a default configuration. It constructs new composite rows by combining specified
-        components and adds them to the DataFrame.
+        components and adds them to the DataFrame in place.
 
         Args:
             composite_types (str, optional): The YAML configuration for composite types.
-
+                If 'use_default', the default configuration is used. Otherwise, a custom YAML
+                configuration can be provided.
+            raise_on_duplicate (bool, optional): If True, raises an exception if there are
+                duplicates in the components. otherwise default False, deals with duplicates as though
+                they are distinct observations.
 
         Returns:
             pd.DataFrame: The updated DataFrame with the new composite rows added.
 
         Raises:
-            Exception: If there are repeat values in the components
+            Exception: If there are repeat values in the components and raise_on_duplicate = True
         """
 
         if composite_types == "use_default":
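Reviewer note: a usage sketch for the extended signature, following the docstring above (obs_seq is an ObsSequence as constructed earlier; the default definitions ship in composite_types.yaml):

.. code-block:: python

    df_with_composites = obs_seq.composite_types()    # default YAML composite definitions
    obs_seq.composite_types(raise_on_duplicate=True)  # raise instead of numbering duplicates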
@@ -932,7 +971,10 @@ class obs_sequence:
         df = pd.DataFrame()
         for key in self.composite_types_dict:
             df_new = construct_composit(
-                df_comp, key, self.composite_types_dict[key]["components"]
+                df_comp,
+                key,
+                self.composite_types_dict[key]["components"],
+                raise_on_duplicate,
             )
             df = pd.concat([df, df_new], axis=0)
 
@@ -945,29 +987,29 @@ class obs_sequence:
         """
         Join a list of observation sequences together.
 
-        This method combines the headers and observations from a list of obs_sequence objects
-        into a single obs_sequence object.
+        This method combines the headers and observations from a list of ObsSequence objects
+        into a single ObsSequence object.
 
         Args:
-            obs_sequences (list of
+            obs_sequences (list of ObsSequences): The list of observation sequences objects to join.
             copies (list of str, optional): A list of copy names to include in the combined data.
                 If not provided, all copies are included.
 
         Returns:
-            A new obs_sequence object containing the combined data.
+            A new ObsSequence object containing the combined data.
 
         Example:
             .. code-block:: python
 
-                obs_seq1 = obs_sequence(file='obs_seq1.final')
-                obs_seq2 = obs_sequence(file='obs_seq2.final')
-                obs_seq3 = obs_sequence(file='obs_seq3.final')
-                combined = obs_sequence.join([obs_seq1, obs_seq2, obs_seq3])
+                obs_seq1 = ObsSequence(file='obs_seq1.final')
+                obs_seq2 = ObsSequence(file='obs_seq2.final')
+                obs_seq3 = ObsSequence(file='obs_seq3.final')
+                combined = ObsSequence.join([obs_seq1, obs_seq2, obs_seq3])
         """
         if not obs_sequences:
             raise ValueError("The list of observation sequences is empty.")
 
-        # Create a new obs_sequence object with the combined data
+        # Create a new ObsSequence object with the combined data
         combo = cls(file=None)
 
         # Check if all obs_sequences have compatible attributes
@@ -1053,53 +1095,49 @@ class obs_sequence:
                 if item in obs_sequences[0].qc_copie_names
             ]
 
-            combo.n_copies = len(combo.copie_names)
-            combo.n_qc = len(combo.qc_copie_names)
-            combo.n_non_qc = len(combo.non_qc_copie_names)
-
         else:
             for obs_seq in obs_sequences:
                 if not obs_sequences[0].df.columns.isin(obs_seq.df.columns).all():
                     raise ValueError(
                         "All observation sequences must have the same copies."
                     )
-            combo.n_copies = obs_sequences[0].n_copies
-            combo.n_qc = obs_sequences[0].n_qc
-            combo.n_non_qc = obs_sequences[0].n_non_qc
             combo.copie_names = obs_sequences[0].copie_names
+            combo.non_qc_copie_names = obs_sequences[0].non_qc_copie_names
+            combo.qc_copie_names = obs_sequences[0].qc_copie_names
+            combo.n_copies = len(combo.copie_names)
 
         # todo HK @todo combine synonyms for obs?
 
         # Initialize combined data
-
-        combined_df = pd.DataFrame()
-        combo.all_obs = None  # set to none to force writing from the dataframe if write_obs_seq is called
+        combo.df = pd.DataFrame()
 
         # Iterate over the list of observation sequences and combine their data
         for obs_seq in obs_sequences:
             if copies:
-                combined_df = pd.concat(
-                    [combined_df, obs_seq.df[requested_columns]], ignore_index=True
+                combo.df = pd.concat(
+                    [combo.df, obs_seq.df[requested_columns]], ignore_index=True
                 )
             else:
-                combined_df = pd.concat([combined_df, obs_seq.df], ignore_index=True)
-
-
-
-        keys = set(combined_types)
-        combo.reverse_types = {item: i + 1 for i, item in enumerate(keys)}
-        combo.types = {v: k for k, v in combo.reverse_types.items()}
-
-        # create linked list for obs
-        combo.df = combined_df.sort_values(by="time").reset_index(drop=True)
-        combo.df["linked_list"] = obs_sequence.generate_linked_list_pattern(
-            len(combo.df)
-        )
-        combo.df["obs_num"] = combined_df.index + 1
-        combo.create_header(len(combo.df))
+                combo.df = pd.concat([combo.df, obs_seq.df], ignore_index=True)
+
+        # update ObsSequence attributes from the combined DataFrame
+        combo.update_attributes_from_df()
 
         return combo
 
+    @staticmethod
+    def update_linked_list(df):
+        """
+        Sorts the DataFrame by 'time', resets the index, and adds/updates 'linked_list'
+        and 'obs_num' columns in place.
+        Modifies the input DataFrame directly.
+        """
+        df.sort_values(by="time", inplace=True, kind="stable")
+        df.reset_index(drop=True, inplace=True)
+        df["linked_list"] = ObsSequence.generate_linked_list_pattern(len(df))
+        df["obs_num"] = df.index + 1
+        return None
+
     def has_assimilation_info(self):
         """
         Check if the DataFrame has prior information.
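Reviewer note: a sketch of the new join flow — attributes are now re-derived from the combined DataFrame instead of being stitched together by hand. File names are placeholders:

.. code-block:: python

    from pydartdiags.obs_sequence.obs_sequence import ObsSequence

    seqs = [ObsSequence(f) for f in ("obs_seq1.final", "obs_seq2.final")]
    combined = ObsSequence.join(seqs)                        # every copy from both files
    subset = ObsSequence.join(seqs, copies=["observation"])  # restrict to named copies
    # combined.all_obs is None, so any later write is driven by combined.df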
@@ -1125,7 +1163,7 @@ class obs_sequence:
         )
 
     def create_header(self, n):
-        """Create a header for the obs_seq file from the obs_sequence object."""
+        """Create a header for the obs_seq file from the ObsSequence object."""
         assert (
             self.n_copies == self.n_non_qc + self.n_qc
         ), "n_copies must be equal to n_non_qc + n_qc"
@@ -1142,6 +1180,100 @@ class obs_sequence:
             self.header.append(copie)
         self.header.append(f"first: 1 last: {n}")
 
+    @staticmethod
+    def replace_qc2_nan(df):
+        """
+        Replace MISSING_R8 values with NaNs in posterior columns for observations where
+        DART_quality_control = 2 (posterior forward observation operators failed)
+
+        This causes these observations to be ignored in the calculations of posterior statistics
+        """
+        df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_mean"] = np.nan
+        df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_spread"] = np.nan
+        num_post_members = len(
+            df.columns[df.columns.str.startswith("posterior_ensemble_member_")]
+        )
+        for i in range(1, num_post_members + 1):
+            df.loc[
+                df["DART_quality_control"] == 2.0,
+                "posterior_ensemble_member_" + str(i),
+            ] = np.nan
+
+    @staticmethod
+    def revert_qc2_nan(df):
+        """
+        Revert NaNs back to MISSING_R8s for observations where DART_quality_control = 2
+        (posterior forward observation operators failed)
+        """
+        df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_mean"] = (
+            -888888.000000
+        )
+        df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_spread"] = (
+            -888888.000000
+        )
+        num_post_members = len(
+            df.columns[df.columns.str.startswith("posterior_ensemble_member_")]
+        )
+        for i in range(1, num_post_members + 1):
+            df.loc[
+                df["DART_quality_control"] == 2.0, "posterior_ensemble_member_" + str(i)
+            ] = -888888.000000
+
+    def update_attributes_from_df(self):
+        """
+        Update all internal data (fields/properties) of the ObsSequence object that
+        depend on the DataFrame (self.df).
+        Call this after self.df is replaced or its structure changes.
+
+        Important:
+
+        Assumes copies are all columns between 'obs_num' and 'linked_list' (if present)
+
+        """
+        # Update columns
+        self.columns = list(self.df.columns)
+
+        # Update all_obs (list of lists, each row) @todo HK do we need this?
+        self.all_obs = None
+
+        # Update copie_names, non_qc_copie_names, qc_copie_names, n_copies, n_non_qc, n_qc
+        # Try to infer from columns if possible, else leave as is
+        # Assume copies are all columns between 'obs_num' and 'linked_list' (if present)
+        if "obs_num" in self.df.columns and "linked_list" in self.df.columns:
+            obs_num_idx = self.df.columns.get_loc("obs_num")
+            linked_list_idx = self.df.columns.get_loc("linked_list")
+            self.copie_names = list(self.df.columns[obs_num_idx + 1 : linked_list_idx])
+        else:
+            # Fallback: use previous value or empty
+            self.copie_names = getattr(self, "copie_names", [])
+        self.n_copies = len(self.copie_names)
+
+        # Try to infer non_qc and qc copies from previous names if possible
+        # Find qc copies first
+        self.qc_copie_names = [c for c in self.copie_names if c in self.qc_copie_names]
+        if self.qc_copie_names == []:  # If no qc copies found, assume all are non-qc
+            self.non_qc_copie_names = self.copie_names
+        else:  # pull out non-qc copies from the copie_names
+            self.non_qc_copie_names = [
+                c for c in self.copie_names if c not in self.qc_copie_names
+            ]
+        self.n_qc = len(self.qc_copie_names)
+        self.n_non_qc = len(self.non_qc_copie_names)
+
+        # Update header and types and reverse_types
+        self.create_header_from_dataframe()
+
+        # Update seq (generator should be empty or None if not from file)
+        self.seq = []
+        # Update loc_mod
+        if "vertical" in self.df.columns:
+            self.loc_mod = "loc3d"
+        else:
+            self.loc_mod = "loc1d"
+
+        # update linked list for obs and obs_nums
+        ObsSequence.update_linked_list(self.df)
+
 
 def load_yaml_to_dict(file_path):
     """
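Reviewer note: update_attributes_from_df becomes the single re-sync point after any structural change to df. A hypothetical workflow that relies on it:

.. code-block:: python

    # keep one observation type, then re-derive copies, header, types, obs_num and linked_list
    obs_seq.df = obs_seq.df[obs_seq.df["type"] == "ACARS_TEMPERATURE"].copy()
    obs_seq.update_attributes_from_df()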
@@ -1172,24 +1304,31 @@ def convert_dart_time(seconds, days):
     return time
 
 
-def construct_composit(df_comp, composite, components):
+def construct_composit(df_comp, composite, components, raise_on_duplicate):
     """
-
-
-
-
-    specified columns using the square root of the sum of squares method.
+    Creates a new DataFrame by combining pairs of rows from two specified component
+    types in an observation DataFrame. It matches rows based on location and time,
+    and then combines certain columns using the square root of the sum of squares
+    of the components.
 
     Args:
         df_comp (pd.DataFrame): The DataFrame containing the component rows to be combined.
         composite (str): The type name for the new composite rows.
         components (list of str): A list containing the type names of the two components to be combined.
+        raise_on_duplicate (bool): If False, raises an exception if there are duplicates in the components.
+            otherwise deals with duplicates as though they are distinct observations.
+
 
     Returns:
         merged_df (pd.DataFrame): A DataFrame containing the new composite rows.
     """
+    # select rows for the two components
+    if len(components) != 2:
+        raise ValueError("components must be a list of two component types.")
     selected_rows = df_comp[df_comp["type"] == components[0].upper()]
     selected_rows_v = df_comp[df_comp["type"] == components[1].upper()]
+    selected_rows = selected_rows.copy()
+    selected_rows_v = selected_rows_v.copy()
 
     prior_columns_to_combine = df_comp.filter(regex="prior_ensemble").columns.tolist()
     posterior_columns_to_combine = df_comp.filter(
@@ -1200,7 +1339,7 @@ def construct_composit(df_comp, composite, components):
         + posterior_columns_to_combine
         + ["observation", "obs_err_var"]
     )
-    merge_columns = ["latitude", "longitude", "vertical", "time"]
+    merge_columns = ["latitude", "longitude", "vertical", "time"]  # @todo HK 1d or 3d
     same_obs_columns = merge_columns + [
         "observation",
         "obs_err_var",
@@ -1210,15 +1349,25 @@ def construct_composit(df_comp, composite, components):
         selected_rows[same_obs_columns].duplicated().sum() > 0
         or selected_rows_v[same_obs_columns].duplicated().sum() > 0
     ):
-
-
-
-
-
-        f"{
-
-
-
+
+        if raise_on_duplicate:
+            print(
+                f"{selected_rows[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
+            )
+            print(f"{selected_rows[same_obs_columns]}")
+            print(
+                f"{selected_rows_v[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
+            )
+            print(f"{selected_rows_v[same_obs_columns]}")
+            raise Exception("There are duplicates in the components.")
+
+        else:
+            selected_rows["dup_num"] = selected_rows.groupby(
+                same_obs_columns
+            ).cumcount()
+            selected_rows_v["dup_num"] = selected_rows_v.groupby(
+                same_obs_columns
+            ).cumcount()
 
     # Merge the two DataFrames on location and time columns
     merged_df = pd.merge(
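Reviewer note: cumcount gives each duplicate within a group a running index, so merging on the key columns plus dup_num pairs duplicates one-to-one instead of producing a cross product. Minimal demonstration:

.. code-block:: python

    import pandas as pd

    u = pd.DataFrame({"latitude": [10.0, 10.0], "longitude": [0.0, 0.0]})
    u["dup_num"] = u.groupby(["latitude", "longitude"]).cumcount()
    print(u["dup_num"].tolist())  # [0, 1] -- duplicate rows become distinct merge keys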
@@ -1235,4 +1384,7 @@ def construct_composit(df_comp, composite, components):
         columns=[col for col in merged_df.columns if col.endswith("_v")]
     )
 
+    if "dup_num" in merged_df.columns:
+        merged_df = merged_df.drop(columns=["dup_num"])
+
     return merged_df
pydartdiags/stats/stats.py CHANGED

@@ -4,8 +4,6 @@ import numpy as np
 from functools import wraps
 from datetime import datetime, timedelta
 
-# from pydartdiags.obs_sequence import obs_sequence as obsq
-
 
 def apply_to_phases_in_place(func):
     """
@@ -93,6 +91,12 @@ def calculate_rank(df, phase):
     """
     Calculate the rank of observations within an ensemble.
 
+    Note:
+
+        This function is decorated with @apply_to_phases_by_obs, which modifies its usage.
+        You should call it as calculate_rank(df), and the decorator will automatically apply the
+        function to all relevant phases (‘prior’ and ‘posterior’).
+
     This function takes a DataFrame containing ensemble predictions and observed values,
     adds sampling noise to the ensemble predictions, and calculates the rank of the observed
     value within the perturbed ensemble for each observation. The rank indicates the position
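Reviewer note: the phase decorators are the organizing idea of this module — wrapped functions keep a phase parameter that callers never pass. Their bodies are not shown in this diff; the sketch below is an assumption about the in-place variant, dispatching on which phase columns exist:

.. code-block:: python

    from functools import wraps

    def apply_to_phases_in_place(func):
        """Sketch only: call func(df, phase) for each phase present in the DataFrame."""
        @wraps(func)
        def wrapper(df, *args, **kwargs):
            for phase in ("prior", "posterior"):
                if any(col.startswith(phase) for col in df.columns):
                    func(df, phase, *args, **kwargs)
        return wrapper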
@@ -103,8 +107,6 @@ def calculate_rank(df, phase):
     Parameters:
         df (pd.DataFrame): A DataFrame with columns for rank, and observation type.
 
-        phase (str): The phase for which to calculate the statistics ('prior' or 'posterior')
-
     Returns:
         DataFrame containing columns for 'rank' and observation 'type'.
     """
@@ -158,15 +160,20 @@ def diag_stats(df, phase):
     """
     Calculate diagnostic statistics for a given phase and add them to the DataFrame.
 
+    Note:
+        This function is decorated with @apply_to_phases_in_place, which modifies its usage.
+        You should call it as diag_stats(df), and the decorator will automatically apply the
+        function to all relevant phases (‘prior’ and ‘posterior’) modifying the DataFrame
+        in place.
+
     Args:
         df (pandas.DataFrame): The input DataFrame containing observation data and ensemble statistics.
-
-            - 'observation': The actual observation values.
-            - 'obs_err_var': The variance of the observation error.
-            - 'prior_ensemble_mean' and/or 'posterior_ensemble_mean': The mean of the ensemble.
-            - 'prior_ensemble_spread' and/or 'posterior_ensemble_spread': The spread of the ensemble.
+            The DataFrame must include the following columns:
 
-
+            - 'observation': The actual observation values.
+            - 'obs_err_var': The variance of the observation error.
+            - 'prior_ensemble_mean' and/or 'posterior_ensemble_mean': The mean of the ensemble.
+            - 'prior_ensemble_spread' and/or 'posterior_ensemble_spread': The spread of the ensemble.
 
     Returns:
         None: The function modifies the DataFrame in place by adding the following columns:
@@ -203,9 +210,12 @@ def bin_by_layer(df, levels, verticalUnit="pressure (Pa)"):
     vertical level bin. Only observations (row) with the specified vertical unit are binned.
 
     Args:
-        df (pandas.DataFrame): The input DataFrame containing observation data.
+        df (pandas.DataFrame): The input DataFrame containing observation data.
+            The DataFrame must include the following columns:
+
             - 'vertical': The vertical coordinate values of the observations.
             - 'vert_unit': The unit of the vertical coordinate values.
+
         levels (list): A list of bin edges for the vertical levels.
         verticalUnit (str, optional): The unit of the vertical axis (e.g., 'pressure (Pa)'). Default is 'pressure (Pa)'.
 
@@ -261,6 +271,28 @@ def bin_by_time(df, time_value):
 
 @apply_to_phases_by_type_return_df
 def grand_statistics(df, phase):
+    """
+    Calculate grand statistics (RMSE, bias, total spread) for each observation type and phase.
+
+    This function assumes that diagnostic statistics (such as squared error, bias, and total variance)
+    have already been computed by :func:`diag_stats` and are present in the DataFrame. It groups the data by observation
+    type and computes the root mean square error (RMSE), mean bias, and total spread for the specified phase.
+
+    Note:
+        This function is decorated with @apply_to_phases_by_type_return_df, which modifies its usage
+        You should call it as grand_statistics(df), and the decorator will automatically apply the function
+        to all relevant phases ('prior' and 'posterior') and return a merged DataFrame.
+
+    Args:
+        df (pandas.DataFrame): The input DataFrame containing diagnostic statistics for observations.
+
+    Returns:
+        pandas.DataFrame: A DataFrame with columns:
+            - 'type': The observation type.
+            - '{phase}_rmse': The root mean square error for the phase.
+            - '{phase}_bias': The mean bias for the phase.
+            - '{phase}_totalspread': The total spread for the phase.
+    """
 
     # assuming diag_stats has been called
     grand = (
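Reviewer note: the intended pipeline, per the docstrings above — compute per-observation diagnostics in place, then aggregate (obs_seq is an ObsSequence from the obs_sequence module):

.. code-block:: python

    from pydartdiags.stats import stats

    stats.diag_stats(obs_seq.df)                # adds the diagnostic columns in place
    grand = stats.grand_statistics(obs_seq.df)  # per-type RMSE, bias and total spread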
@@ -283,6 +315,33 @@ def grand_statistics(df, phase):
 
 @apply_to_phases_by_type_return_df
 def layer_statistics(df, phase):
+    """
+    Calculate statistics (RMSE, bias, total spread) for each observation type and vertical layer.
+
+    This function assumes that diagnostic statistics (such as squared error, bias, and total variance)
+    have already been computed with :func:`diag_stats` and are present in the DataFrame. It groups the data by
+    vertical layer midpoint and observation type, and computes the root mean square error (RMSE),
+    mean bias, and total spread for the specified phase for each vertical layer.
+
+    Note:
+        This function is decorated with @apply_to_phases_by_type_return_df, which modifies its usage
+        You should call it as layer_statistics(df), and the decorator will automatically apply the function
+        to all relevant phases ('prior' and 'posterior') and return a merged DataFrame.
+
+    Args:
+        df (pandas.DataFrame): The input DataFrame containing diagnostic statistics for observations.
+        phase (str): The phase for which to calculate the statistics ('prior' or 'posterior').
+
+    Returns:
+        pandas.DataFrame: A DataFrame with columns:
+            - 'midpoint': The midpoint of the vertical layer.
+            - 'type': The observation type.
+            - '{phase}_rmse': The root mean square error for the phase.
+            - '{phase}_bias': The mean bias for the phase.
+            - '{phase}_totalspread': The total spread for the phase.
+            - 'vert_unit': The vertical unit.
+            - 'vlevels': The categorized vertical level.
+    """
 
     # assuming diag_stats has been called
     layer_stats = (
@@ -310,14 +369,31 @@ def layer_statistics(df, phase):
 @apply_to_phases_by_type_return_df
 def time_statistics(df, phase):
     """
-    Calculate time-based statistics for
+    Calculate time-based statistics (RMSE, bias, total spread) for each observation type and time bin.
+
+    This function assumes that diagnostic statistics (such as squared error, bias, and total variance)
+    have already been computed by :func:`diag_stats` and are present in the DataFrame. It groups the data
+    by time bin midpoint and observation type, and computes the root mean square error (RMSE), mean bias,
+    and total spread for the specified phase for each time bin.
+
+    Note:
+        This function is decorated with @apply_to_phases_by_type_return_df.
+        You should call it as time_statistics(df), and the decorator will automatically apply the function
+        to all relevant phases ('prior' and 'posterior') and return a merged DataFrame.
 
     Args:
-        df (pandas.DataFrame): The input DataFrame containing
+        df (pandas.DataFrame): The input DataFrame containing diagnostic statistics for observations.
         phase (str): The phase for which to calculate the statistics ('prior' or 'posterior').
 
     Returns:
-        pandas.DataFrame: A DataFrame
+        pandas.DataFrame: A DataFrame with columns:
+            - 'time_bin_midpoint': The midpoint of the time bin.
+            - 'type': The observation type.
+            - '{phase}_rmse': The root mean square error for the phase.
+            - '{phase}_bias': The mean bias for the phase.
+            - '{phase}_totalspread': The total spread for the phase.
+            - 'time_bin': The time bin interval.
+            - 'time': The first time value in the bin.
     """
     # Assuming diag_stats has been called
     time_stats = (
@@ -402,7 +478,9 @@ def possible_vs_used_by_time(df):
     Calculates the count of possible vs. used observations by type and time bin.
 
     Args:
-        df (pd.DataFrame): The input DataFrame containing observation data.
+        df (pd.DataFrame): The input DataFrame containing observation data.
+            The DataFrame must include:
+
         - 'type': The observation type.
         - 'time_bin_midpoint': The midpoint of the time bin.
         - 'observation': The observation values.
{pydartdiags-0.5.1.dist-info → pydartdiags-0.6.1.dist-info}/METADATA CHANGED

@@ -1,15 +1,15 @@
 Metadata-Version: 2.4
 Name: pydartdiags
-Version: 0.5.1
+Version: 0.6.1
 Summary: Observation Sequence Diagnostics for DART
 Home-page: https://github.com/NCAR/pyDARTdiags.git
 Author: Helen Kershaw
 Author-email: Helen Kershaw <hkershaw@ucar.edu>
+License-Expression: Apache-2.0
 Project-URL: Homepage, https://github.com/NCAR/pyDARTdiags.git
 Project-URL: Issues, https://github.com/NCAR/pyDARTdiags/issues
 Project-URL: Documentation, https://ncar.github.io/pyDARTdiags
 Classifier: Programming Language :: Python :: 3
-Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
{pydartdiags-0.5.1.dist-info → pydartdiags-0.6.1.dist-info}/RECORD CHANGED

@@ -3,13 +3,13 @@ pydartdiags/matplots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
 pydartdiags/matplots/matplots.py,sha256=Bo0TTz1gvsHEvTfTfLfdTi_3hNRN1okmyY5a5yYgtzk,13455
 pydartdiags/obs_sequence/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pydartdiags/obs_sequence/composite_types.yaml,sha256=PVLMU6x6KcVMCwPB-U65C_e0YQUemfqUhYMpf1DhFOY,917
-pydartdiags/obs_sequence/obs_sequence.py,sha256=
+pydartdiags/obs_sequence/obs_sequence.py,sha256=szxASzecTcJzP2rEqssRo9VHw26nwpZ7W9Yi6sTbbHI,55112
 pydartdiags/plots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pydartdiags/plots/plots.py,sha256=U7WQjE_qN-5a8-85D-PkkgILSFBzTJQ1mcGBa7l5DHI,6464
 pydartdiags/stats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/stats/stats.py,sha256=
-pydartdiags-0.5.1.dist-info/licenses/LICENSE,sha256=ROglds_Eg_ylXp-1MHmEawDqMw_UsCB4r9sk7z9PU9M,11377
-pydartdiags-0.5.1.dist-info/METADATA,sha256=
-pydartdiags-0.5.1.dist-info/WHEEL,sha256=
-pydartdiags-0.5.1.dist-info/top_level.txt,sha256=LfMoPLnSd0VhhlWev1eeX9t6AzvyASOloag0LO_ppWg,12
-pydartdiags-0.5.1.dist-info/RECORD,,
+pydartdiags/stats/stats.py,sha256=a88VuLoHOlhbjYjnrVPHVNnhiDx-4B3YA1jbc6FUSyU,20193
+pydartdiags-0.6.1.dist-info/licenses/LICENSE,sha256=ROglds_Eg_ylXp-1MHmEawDqMw_UsCB4r9sk7z9PU9M,11377
+pydartdiags-0.6.1.dist-info/METADATA,sha256=AeuLMziCQas1vggEwAKD6CEfdadxwoSDWEu-Fgwaix0,2381
+pydartdiags-0.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+pydartdiags-0.6.1.dist-info/top_level.txt,sha256=LfMoPLnSd0VhhlWev1eeX9t6AzvyASOloag0LO_ppWg,12
+pydartdiags-0.6.1.dist-info/RECORD,,

{pydartdiags-0.5.1.dist-info → pydartdiags-0.6.1.dist-info}/licenses/LICENSE
File without changes

{pydartdiags-0.5.1.dist-info → pydartdiags-0.6.1.dist-info}/top_level.txt
File without changes