pydartdiags 0.6.0__tar.gz → 0.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pydartdiags might be problematic. Click here for more details.
- {pydartdiags-0.6.0/src/pydartdiags.egg-info → pydartdiags-0.6.1}/PKG-INFO +1 -1
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/pyproject.toml +1 -1
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/setup.py +1 -1
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/src/pydartdiags/obs_sequence/obs_sequence.py +238 -78
- {pydartdiags-0.6.0 → pydartdiags-0.6.1/src/pydartdiags.egg-info}/PKG-INFO +1 -1
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/tests/test_obs_sequence.py +478 -30
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/LICENSE +0 -0
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/MANIFEST.in +0 -0
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/README.md +0 -0
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/setup.cfg +0 -0
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/src/pydartdiags/__init__.py +0 -0
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/src/pydartdiags/matplots/__init__.py +0 -0
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/src/pydartdiags/matplots/matplots.py +0 -0
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/src/pydartdiags/obs_sequence/__init__.py +0 -0
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/src/pydartdiags/obs_sequence/composite_types.yaml +0 -0
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/src/pydartdiags/plots/__init__.py +0 -0
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/src/pydartdiags/plots/plots.py +0 -0
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/src/pydartdiags/stats/__init__.py +0 -0
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/src/pydartdiags/stats/stats.py +0 -0
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/src/pydartdiags.egg-info/SOURCES.txt +0 -0
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/src/pydartdiags.egg-info/dependency_links.txt +0 -0
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/src/pydartdiags.egg-info/requires.txt +0 -0
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/src/pydartdiags.egg-info/top_level.txt +0 -0
- {pydartdiags-0.6.0 → pydartdiags-0.6.1}/tests/test_stats.py +0 -0
|
@@ -184,6 +184,14 @@ class ObsSequence:
|
|
|
184
184
|
}
|
|
185
185
|
self.df = self.df.rename(columns=rename_dict)
|
|
186
186
|
|
|
187
|
+
if self.is_binary(file):
|
|
188
|
+
# Binary files do not contain "OBS X" lines, so build the linked list from the DataFrame.
|
|
189
|
+
self.update_attributes_from_df()
|
|
190
|
+
|
|
191
|
+
# Replace MISSING_R8s with NaNs in posterior stats where DART_quality_control = 2
|
|
192
|
+
if self.has_posterior():
|
|
193
|
+
ObsSequence.replace_qc2_nan(self.df)
|
|
194
|
+
|
|
187
195
|
def create_all_obs(self):
|
|
188
196
|
"""steps through the generator to create a
|
|
189
197
|
list of all observations in the sequence
|
|
@@ -197,7 +205,7 @@ class ObsSequence:
|
|
|
197
205
|
def obs_to_list(self, obs):
|
|
198
206
|
"""put single observation into a list"""
|
|
199
207
|
data = []
|
|
200
|
-
data.append(obs[0].split()[1]) # obs_num
|
|
208
|
+
data.append(int(obs[0].split()[1])) # obs_num
|
|
201
209
|
data.extend(list(map(float, obs[1 : self.n_copies + 1]))) # all the copies
|
|
202
210
|
data.append(obs[self.n_copies + 1]) # linked list info
|
|
203
211
|
try: # HK todo only have to check loc3d or loc1d for the first observation, the whole file is the same
|
|
@@ -219,9 +227,9 @@ class ObsSequence:
|
|
|
219
227
|
"Neither 'loc3d' nor 'loc1d' could be found in the observation sequence."
|
|
220
228
|
)
|
|
221
229
|
typeI = obs.index("kind") # type of observation
|
|
222
|
-
type_value = obs[typeI + 1]
|
|
223
|
-
if
|
|
224
|
-
data.append(
|
|
230
|
+
type_value = int(obs[typeI + 1])
|
|
231
|
+
if type_value < 0:
|
|
232
|
+
data.append(type_value)
|
|
225
233
|
else:
|
|
226
234
|
data.append(self.types[type_value]) # observation type
|
|
227
235
|
|
|
@@ -283,14 +291,22 @@ class ObsSequence:
|
|
|
283
291
|
+ str(self.reversed_vert[data[self.n_copies + 5]])
|
|
284
292
|
) # location x, y, z, vert
|
|
285
293
|
obs.append("kind") # this is type of observation
|
|
286
|
-
|
|
294
|
+
obs_type = data[self.n_copies + 6]
|
|
295
|
+
if isinstance(obs_type, str):
|
|
296
|
+
obs.append(self.reverse_types[obs_type]) # observation type
|
|
297
|
+
else:
|
|
298
|
+
obs.append(obs_type) # Identity obs negative integer
|
|
287
299
|
# Convert metadata to a string and append !HK @todo you are not converting to string
|
|
288
300
|
obs.extend(data[self.n_copies + 7]) # metadata
|
|
289
301
|
obs.extend(data[self.n_copies + 8]) # external forward operator
|
|
290
302
|
elif self.loc_mod == "loc1d":
|
|
291
303
|
obs.append(data[self.n_copies + 2]) # 1d location
|
|
292
304
|
obs.append("kind") # this is type of observation
|
|
293
|
-
|
|
305
|
+
obs_type = data[self.n_copies + 3]
|
|
306
|
+
if isinstance(obs_type, str):
|
|
307
|
+
obs.append(self.reverse_types[obs_type]) # observation type
|
|
308
|
+
else:
|
|
309
|
+
obs.append(obs_type) # Identity obs negative integer
|
|
294
310
|
obs.extend(data[self.n_copies + 4]) # metadata
|
|
295
311
|
obs.extend(data[self.n_copies + 5]) # external forward operator
|
|
296
312
|
obs.append(" ".join(map(str, data[-4:-2]))) # seconds, days
|
|
@@ -316,14 +332,17 @@ class ObsSequence:
|
|
|
316
332
|
|
|
317
333
|
This function writes the observation sequence stored in the obs_seq.DataFrame to a specified file.
|
|
318
334
|
It updates the header with the number of observations, converts coordinates back to radians
|
|
319
|
-
if necessary,
|
|
320
|
-
list pattern for reading by DART
|
|
335
|
+
if necessary, reverts NaNs back to MISSING_R8 for observations with QC=2, drops unnecessary
|
|
336
|
+
columns, sorts the DataFrame by time, and generates a linked list pattern for reading by DART
|
|
337
|
+
programs.
|
|
321
338
|
|
|
322
339
|
Args:
|
|
323
340
|
file (str): The path to the file where the observation sequence will be written.
|
|
324
341
|
|
|
325
342
|
Notes:
|
|
326
343
|
- Longitude and latitude are converted back to radians if the location model is 'loc3d'.
|
|
344
|
+
- The replacement of MISSING_R8 values with NaNs for any obs that failed the posterior
|
|
345
|
+
forward observation operators (QC2) is reverted.
|
|
327
346
|
- The 'bias' and 'sq_err' columns are dropped if they exist in the DataFrame.
|
|
328
347
|
- The DataFrame is sorted by the 'time' column.
|
|
329
348
|
- An 'obs_num' column is added to the DataFrame to number the observations in time order.
|
|
@@ -334,7 +353,8 @@ class ObsSequence:
|
|
|
334
353
|
|
|
335
354
|
"""
|
|
336
355
|
|
|
337
|
-
|
|
356
|
+
# Update attributes, header, and linked list from dataframe
|
|
357
|
+
self.update_attributes_from_df()
|
|
338
358
|
|
|
339
359
|
with open(file, "w") as f:
|
|
340
360
|
|
|
@@ -358,15 +378,9 @@ class ObsSequence:
|
|
|
358
378
|
if "midpoint" in df_copy.columns:
|
|
359
379
|
df_copy = df_copy.drop(columns=["midpoint", "vlevels"])
|
|
360
380
|
|
|
361
|
-
#
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
) # sort the DataFrame by time
|
|
365
|
-
df_copy.reset_index(drop=True, inplace=True)
|
|
366
|
-
df_copy["obs_num"] = df_copy.index + 1 # obs_num in time order
|
|
367
|
-
df_copy["linked_list"] = ObsSequence.generate_linked_list_pattern(
|
|
368
|
-
len(df_copy)
|
|
369
|
-
) # linked list pattern
|
|
381
|
+
# Revert NaNs back to MISSING_R8s
|
|
382
|
+
if self.has_posterior():
|
|
383
|
+
ObsSequence.revert_qc2_nan(df_copy)
|
|
370
384
|
|
|
371
385
|
def write_row(row):
|
|
372
386
|
ob_write = self.list_to_obs(row.tolist())
|
|
@@ -390,13 +404,16 @@ class ObsSequence:
|
|
|
390
404
|
dict: The types dictionary with keys sorted in numerical order.
|
|
391
405
|
"""
|
|
392
406
|
# Create a dictionary of observation types from the dataframe
|
|
393
|
-
|
|
407
|
+
# Ignore Identity obs (negative integers)
|
|
408
|
+
unique_types = df.loc[
|
|
409
|
+
df["type"].apply(lambda x: isinstance(x, str)), "type"
|
|
410
|
+
].unique()
|
|
394
411
|
|
|
395
412
|
# Ensure all unique types are in reverse_types
|
|
396
413
|
for obs_type in unique_types:
|
|
397
414
|
if obs_type not in reverse_types:
|
|
398
|
-
new_id =
|
|
399
|
-
reverse_types[obs_type] =
|
|
415
|
+
new_id = max(reverse_types.values(), default=0) + 1
|
|
416
|
+
reverse_types[obs_type] = new_id
|
|
400
417
|
|
|
401
418
|
not_sorted_types = {
|
|
402
419
|
reverse_types[obs_type]: obs_type for obs_type in unique_types
|
|
@@ -431,9 +448,7 @@ class ObsSequence:
|
|
|
431
448
|
self.header.append(f"{len(self.types)}")
|
|
432
449
|
for key, value in self.types.items():
|
|
433
450
|
self.header.append(f"{key} {value}")
|
|
434
|
-
self.header.append(
|
|
435
|
-
f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}"
|
|
436
|
-
) # @todo HK not keeping track if num_qc changes
|
|
451
|
+
self.header.append(f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}")
|
|
437
452
|
self.header.append(f"num_obs: {num_obs:>10} max_num_obs: {num_obs:>10}")
|
|
438
453
|
stats_cols = [
|
|
439
454
|
"prior_bias",
|
|
@@ -692,7 +707,8 @@ class ObsSequence:
|
|
|
692
707
|
def collect_obs_types(header):
|
|
693
708
|
"""Create a dictionary for the observation types in the obs_seq header"""
|
|
694
709
|
num_obs_types = int(header[2])
|
|
695
|
-
types
|
|
710
|
+
# The first line containing obs types is the 4th line in an obs_seq file.
|
|
711
|
+
types = {int(x.split()[0]): x.split()[1] for x in header[3 : num_obs_types + 3]}
|
|
696
712
|
return types
|
|
697
713
|
|
|
698
714
|
@staticmethod
|
|
@@ -856,18 +872,45 @@ class ObsSequence:
|
|
|
856
872
|
|
|
857
873
|
# kind (type of observation) value
|
|
858
874
|
obs.append("kind")
|
|
859
|
-
|
|
860
|
-
record_length = struct.unpack("i", record_length_bytes)[0]
|
|
875
|
+
record_length = ObsSequence.read_record_length(f)
|
|
861
876
|
record = f.read(record_length)
|
|
862
877
|
kind = f"{struct.unpack('i', record)[0]}"
|
|
863
878
|
obs.append(kind)
|
|
864
879
|
|
|
865
880
|
ObsSequence.check_trailing_record_length(f, record_length)
|
|
866
881
|
|
|
882
|
+
# Skip metadata (obs_def) and go directly to the time record
|
|
883
|
+
while True:
|
|
884
|
+
pos = f.tell()
|
|
885
|
+
record_length = ObsSequence.read_record_length(f)
|
|
886
|
+
if record_length is None:
|
|
887
|
+
break # End of file
|
|
888
|
+
|
|
889
|
+
record = f.read(record_length)
|
|
890
|
+
# Check if this record is likely the "time" record (8 bytes, can be unpacked as two ints)
|
|
891
|
+
if record_length == 8:
|
|
892
|
+
try:
|
|
893
|
+
seconds, days = struct.unpack("ii", record)
|
|
894
|
+
# If unpack succeeds, this is the time record
|
|
895
|
+
f.seek(pos) # Seek back so the main loop can process it
|
|
896
|
+
break
|
|
897
|
+
except struct.error:
|
|
898
|
+
pass # Not the time record, keep skipping
|
|
899
|
+
|
|
900
|
+
ObsSequence.check_trailing_record_length(f, record_length)
|
|
901
|
+
|
|
867
902
|
# time (seconds, days)
|
|
868
903
|
record_length = ObsSequence.read_record_length(f)
|
|
869
904
|
record = f.read(record_length)
|
|
870
|
-
|
|
905
|
+
try: # Guard in case this record is not the time record (e.g. unexpected metadata layout)
|
|
906
|
+
seconds, days = struct.unpack("ii", record)
|
|
907
|
+
except struct.error as e:
|
|
908
|
+
print(
|
|
909
|
+
f"Reading observation {obs_num}... record length: {record_length} kind {kind}"
|
|
910
|
+
)
|
|
911
|
+
print(f"")
|
|
912
|
+
print(f"Error unpacking seconds and days: {e}")
|
|
913
|
+
raise
|
|
871
914
|
time_string = f"{seconds} {days}"
|
|
872
915
|
obs.append(time_string)
|
|
873
916
|
|
|
@@ -882,23 +925,27 @@ class ObsSequence:
|
|
|
882
925
|
|
|
883
926
|
yield obs
|
|
884
927
|
|
|
885
|
-
def composite_types(self, composite_types="use_default"):
|
|
928
|
+
def composite_types(self, composite_types="use_default", raise_on_duplicate=False):
|
|
886
929
|
"""
|
|
887
|
-
Set up and construct composite types for the DataFrame.
|
|
930
|
+
Set up and construct composite observation types for the DataFrame.
|
|
888
931
|
|
|
889
|
-
This function sets up composite types based on a provided YAML configuration or
|
|
932
|
+
This function sets up composite observation types based on a provided YAML configuration or
|
|
890
933
|
a default configuration. It constructs new composite rows by combining specified
|
|
891
|
-
components and adds them to the DataFrame.
|
|
934
|
+
components and adds them to the DataFrame in place.
|
|
892
935
|
|
|
893
936
|
Args:
|
|
894
937
|
composite_types (str, optional): The YAML configuration for composite types.
|
|
895
|
-
|
|
938
|
+
If 'use_default', the default configuration is used. Otherwise, a custom YAML
|
|
939
|
+
configuration can be provided.
|
|
940
|
+
raise_on_duplicate (bool, optional): If True, raises an exception if there are
|
|
941
|
+
duplicates in the components. Otherwise (the default, False), duplicates are treated as though
|
|
942
|
+
they are distinct observations.
|
|
896
943
|
|
|
897
944
|
Returns:
|
|
898
945
|
pd.DataFrame: The updated DataFrame with the new composite rows added.
|
|
899
946
|
|
|
900
947
|
Raises:
|
|
901
|
-
Exception: If there are repeat values in the components
|
|
948
|
+
Exception: If there are repeat values in the components and raise_on_duplicate = True
|
|
902
949
|
"""
|
|
903
950
|
|
|
904
951
|
if composite_types == "use_default":
|
|
@@ -924,7 +971,10 @@ class ObsSequence:
|
|
|
924
971
|
df = pd.DataFrame()
|
|
925
972
|
for key in self.composite_types_dict:
|
|
926
973
|
df_new = construct_composit(
|
|
927
|
-
df_comp,
|
|
974
|
+
df_comp,
|
|
975
|
+
key,
|
|
976
|
+
self.composite_types_dict[key]["components"],
|
|
977
|
+
raise_on_duplicate,
|
|
928
978
|
)
|
|
929
979
|
df = pd.concat([df, df_new], axis=0)
|
|
930
980
|
|
|
@@ -1045,53 +1095,49 @@ class ObsSequence:
|
|
|
1045
1095
|
if item in obs_sequences[0].qc_copie_names
|
|
1046
1096
|
]
|
|
1047
1097
|
|
|
1048
|
-
combo.n_copies = len(combo.copie_names)
|
|
1049
|
-
combo.n_qc = len(combo.qc_copie_names)
|
|
1050
|
-
combo.n_non_qc = len(combo.non_qc_copie_names)
|
|
1051
|
-
|
|
1052
1098
|
else:
|
|
1053
1099
|
for obs_seq in obs_sequences:
|
|
1054
1100
|
if not obs_sequences[0].df.columns.isin(obs_seq.df.columns).all():
|
|
1055
1101
|
raise ValueError(
|
|
1056
1102
|
"All observation sequences must have the same copies."
|
|
1057
1103
|
)
|
|
1058
|
-
combo.n_copies = obs_sequences[0].n_copies
|
|
1059
|
-
combo.n_qc = obs_sequences[0].n_qc
|
|
1060
|
-
combo.n_non_qc = obs_sequences[0].n_non_qc
|
|
1061
1104
|
combo.copie_names = obs_sequences[0].copie_names
|
|
1105
|
+
combo.non_qc_copie_names = obs_sequences[0].non_qc_copie_names
|
|
1106
|
+
combo.qc_copie_names = obs_sequences[0].qc_copie_names
|
|
1107
|
+
combo.n_copies = len(combo.copie_names)
|
|
1062
1108
|
|
|
1063
1109
|
# todo HK @todo combine synonyms for obs?
|
|
1064
1110
|
|
|
1065
1111
|
# Initialize combined data
|
|
1066
|
-
|
|
1067
|
-
combined_df = pd.DataFrame()
|
|
1068
|
-
combo.all_obs = None # set to none to force writing from the dataframe if write_obs_seq is called
|
|
1112
|
+
combo.df = pd.DataFrame()
|
|
1069
1113
|
|
|
1070
1114
|
# Iterate over the list of observation sequences and combine their data
|
|
1071
1115
|
for obs_seq in obs_sequences:
|
|
1072
1116
|
if copies:
|
|
1073
|
-
|
|
1074
|
-
[
|
|
1117
|
+
combo.df = pd.concat(
|
|
1118
|
+
[combo.df, obs_seq.df[requested_columns]], ignore_index=True
|
|
1075
1119
|
)
|
|
1076
1120
|
else:
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
keys = set(combined_types)
|
|
1082
|
-
combo.reverse_types = {item: i + 1 for i, item in enumerate(keys)}
|
|
1083
|
-
combo.types = {v: k for k, v in combo.reverse_types.items()}
|
|
1084
|
-
|
|
1085
|
-
# create linked list for obs
|
|
1086
|
-
combo.df = combined_df.sort_values(by="time").reset_index(drop=True)
|
|
1087
|
-
combo.df["linked_list"] = ObsSequence.generate_linked_list_pattern(
|
|
1088
|
-
len(combo.df)
|
|
1089
|
-
)
|
|
1090
|
-
combo.df["obs_num"] = combined_df.index + 1
|
|
1091
|
-
combo.create_header(len(combo.df))
|
|
1121
|
+
combo.df = pd.concat([combo.df, obs_seq.df], ignore_index=True)
|
|
1122
|
+
|
|
1123
|
+
# update ObsSequence attributes from the combined DataFrame
|
|
1124
|
+
combo.update_attributes_from_df()
|
|
1092
1125
|
|
|
1093
1126
|
return combo
|
|
1094
1127
|
|
|
1128
|
+
@staticmethod
|
|
1129
|
+
def update_linked_list(df):
|
|
1130
|
+
"""
|
|
1131
|
+
Sorts the DataFrame by 'time', resets the index, and adds/updates 'linked_list'
|
|
1132
|
+
and 'obs_num' columns in place.
|
|
1133
|
+
Modifies the input DataFrame directly.
|
|
1134
|
+
"""
|
|
1135
|
+
df.sort_values(by="time", inplace=True, kind="stable")
|
|
1136
|
+
df.reset_index(drop=True, inplace=True)
|
|
1137
|
+
df["linked_list"] = ObsSequence.generate_linked_list_pattern(len(df))
|
|
1138
|
+
df["obs_num"] = df.index + 1
|
|
1139
|
+
return None
|
|
1140
|
+
|
|
1095
1141
|
def has_assimilation_info(self):
|
|
1096
1142
|
"""
|
|
1097
1143
|
Check if the DataFrame has prior information.
|
|
@@ -1134,6 +1180,100 @@ class ObsSequence:
|
|
|
1134
1180
|
self.header.append(copie)
|
|
1135
1181
|
self.header.append(f"first: 1 last: {n}")
|
|
1136
1182
|
|
|
1183
|
+
@staticmethod
|
|
1184
|
+
def replace_qc2_nan(df):
|
|
1185
|
+
"""
|
|
1186
|
+
Replace MISSING_R8 values with NaNs in posterior columns for observations where
|
|
1187
|
+
DART_quality_control = 2 (posterior forward observation operators failed)
|
|
1188
|
+
|
|
1189
|
+
This causes these observations to be ignored in the calculations of posterior statistics
|
|
1190
|
+
"""
|
|
1191
|
+
df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_mean"] = np.nan
|
|
1192
|
+
df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_spread"] = np.nan
|
|
1193
|
+
num_post_members = len(
|
|
1194
|
+
df.columns[df.columns.str.startswith("posterior_ensemble_member_")]
|
|
1195
|
+
)
|
|
1196
|
+
for i in range(1, num_post_members + 1):
|
|
1197
|
+
df.loc[
|
|
1198
|
+
df["DART_quality_control"] == 2.0,
|
|
1199
|
+
"posterior_ensemble_member_" + str(i),
|
|
1200
|
+
] = np.nan
|
|
1201
|
+
|
|
1202
|
+
@staticmethod
|
|
1203
|
+
def revert_qc2_nan(df):
|
|
1204
|
+
"""
|
|
1205
|
+
Revert NaNs back to MISSING_R8s for observations where DART_quality_control = 2
|
|
1206
|
+
(posterior forward observation operators failed)
|
|
1207
|
+
"""
|
|
1208
|
+
df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_mean"] = (
|
|
1209
|
+
-888888.000000
|
|
1210
|
+
)
|
|
1211
|
+
df.loc[df["DART_quality_control"] == 2.0, "posterior_ensemble_spread"] = (
|
|
1212
|
+
-888888.000000
|
|
1213
|
+
)
|
|
1214
|
+
num_post_members = len(
|
|
1215
|
+
df.columns[df.columns.str.startswith("posterior_ensemble_member_")]
|
|
1216
|
+
)
|
|
1217
|
+
for i in range(1, num_post_members + 1):
|
|
1218
|
+
df.loc[
|
|
1219
|
+
df["DART_quality_control"] == 2.0, "posterior_ensemble_member_" + str(i)
|
|
1220
|
+
] = -888888.000000
|
|
1221
|
+
|
|
1222
|
+
def update_attributes_from_df(self):
|
|
1223
|
+
"""
|
|
1224
|
+
Update all internal data (fields/properties) of the ObsSequence object that
|
|
1225
|
+
depend on the DataFrame (self.df).
|
|
1226
|
+
Call this after self.df is replaced or its structure changes.
|
|
1227
|
+
|
|
1228
|
+
Important:
|
|
1229
|
+
|
|
1230
|
+
Assumes copies are all columns between 'obs_num' and 'linked_list' (if present)
|
|
1231
|
+
|
|
1232
|
+
"""
|
|
1233
|
+
# Update columns
|
|
1234
|
+
self.columns = list(self.df.columns)
|
|
1235
|
+
|
|
1236
|
+
# Update all_obs (list of lists, each row) @todo HK do we need this?
|
|
1237
|
+
self.all_obs = None
|
|
1238
|
+
|
|
1239
|
+
# Update copie_names, non_qc_copie_names, qc_copie_names, n_copies, n_non_qc, n_qc
|
|
1240
|
+
# Try to infer from columns if possible, else leave as is
|
|
1241
|
+
# Assume copies are all columns between 'obs_num' and 'linked_list' (if present)
|
|
1242
|
+
if "obs_num" in self.df.columns and "linked_list" in self.df.columns:
|
|
1243
|
+
obs_num_idx = self.df.columns.get_loc("obs_num")
|
|
1244
|
+
linked_list_idx = self.df.columns.get_loc("linked_list")
|
|
1245
|
+
self.copie_names = list(self.df.columns[obs_num_idx + 1 : linked_list_idx])
|
|
1246
|
+
else:
|
|
1247
|
+
# Fallback: use previous value or empty
|
|
1248
|
+
self.copie_names = getattr(self, "copie_names", [])
|
|
1249
|
+
self.n_copies = len(self.copie_names)
|
|
1250
|
+
|
|
1251
|
+
# Try to infer non_qc and qc copies from previous names if possible
|
|
1252
|
+
# Find qc copies first
|
|
1253
|
+
self.qc_copie_names = [c for c in self.copie_names if c in self.qc_copie_names]
|
|
1254
|
+
if self.qc_copie_names == []: # If no qc copies found, assume all are non-qc
|
|
1255
|
+
self.non_qc_copie_names = self.copie_names
|
|
1256
|
+
else: # pull out non-qc copies from the copie_names
|
|
1257
|
+
self.non_qc_copie_names = [
|
|
1258
|
+
c for c in self.copie_names if c not in self.qc_copie_names
|
|
1259
|
+
]
|
|
1260
|
+
self.n_qc = len(self.qc_copie_names)
|
|
1261
|
+
self.n_non_qc = len(self.non_qc_copie_names)
|
|
1262
|
+
|
|
1263
|
+
# Update header and types and reverse_types
|
|
1264
|
+
self.create_header_from_dataframe()
|
|
1265
|
+
|
|
1266
|
+
# Update seq (generator should be empty or None if not from file)
|
|
1267
|
+
self.seq = []
|
|
1268
|
+
# Update loc_mod
|
|
1269
|
+
if "vertical" in self.df.columns:
|
|
1270
|
+
self.loc_mod = "loc3d"
|
|
1271
|
+
else:
|
|
1272
|
+
self.loc_mod = "loc1d"
|
|
1273
|
+
|
|
1274
|
+
# update linked list for obs and obs_nums
|
|
1275
|
+
ObsSequence.update_linked_list(self.df)
|
|
1276
|
+
|
|
1137
1277
|
|
|
1138
1278
|
def load_yaml_to_dict(file_path):
|
|
1139
1279
|
"""
|
|
@@ -1164,24 +1304,31 @@ def convert_dart_time(seconds, days):
|
|
|
1164
1304
|
return time
|
|
1165
1305
|
|
|
1166
1306
|
|
|
1167
|
-
def construct_composit(df_comp, composite, components):
|
|
1307
|
+
def construct_composit(df_comp, composite, components, raise_on_duplicate):
|
|
1168
1308
|
"""
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
specified columns using the square root of the sum of squares method.
|
|
1309
|
+
Creates a new DataFrame by combining pairs of rows from two specified component
|
|
1310
|
+
types in an observation DataFrame. It matches rows based on location and time,
|
|
1311
|
+
and then combines certain columns using the square root of the sum of squares
|
|
1312
|
+
of the components.
|
|
1174
1313
|
|
|
1175
1314
|
Args:
|
|
1176
1315
|
df_comp (pd.DataFrame): The DataFrame containing the component rows to be combined.
|
|
1177
1316
|
composite (str): The type name for the new composite rows.
|
|
1178
1317
|
components (list of str): A list containing the type names of the two components to be combined.
|
|
1318
|
+
raise_on_duplicate (bool): If True, raises an exception if there are duplicates in the components.
|
|
1319
|
+
otherwise deals with duplicates as though they are distinct observations.
|
|
1320
|
+
|
|
1179
1321
|
|
|
1180
1322
|
Returns:
|
|
1181
1323
|
merged_df (pd.DataFrame): A DataFrame containing the new composite rows.
|
|
1182
1324
|
"""
|
|
1325
|
+
# select rows for the two components
|
|
1326
|
+
if len(components) != 2:
|
|
1327
|
+
raise ValueError("components must be a list of two component types.")
|
|
1183
1328
|
selected_rows = df_comp[df_comp["type"] == components[0].upper()]
|
|
1184
1329
|
selected_rows_v = df_comp[df_comp["type"] == components[1].upper()]
|
|
1330
|
+
selected_rows = selected_rows.copy()
|
|
1331
|
+
selected_rows_v = selected_rows_v.copy()
|
|
1185
1332
|
|
|
1186
1333
|
prior_columns_to_combine = df_comp.filter(regex="prior_ensemble").columns.tolist()
|
|
1187
1334
|
posterior_columns_to_combine = df_comp.filter(
|
|
@@ -1192,7 +1339,7 @@ def construct_composit(df_comp, composite, components):
|
|
|
1192
1339
|
+ posterior_columns_to_combine
|
|
1193
1340
|
+ ["observation", "obs_err_var"]
|
|
1194
1341
|
)
|
|
1195
|
-
merge_columns = ["latitude", "longitude", "vertical", "time"]
|
|
1342
|
+
merge_columns = ["latitude", "longitude", "vertical", "time"] # @todo HK 1d or 3d
|
|
1196
1343
|
same_obs_columns = merge_columns + [
|
|
1197
1344
|
"observation",
|
|
1198
1345
|
"obs_err_var",
|
|
@@ -1202,15 +1349,25 @@ def construct_composit(df_comp, composite, components):
|
|
|
1202
1349
|
selected_rows[same_obs_columns].duplicated().sum() > 0
|
|
1203
1350
|
or selected_rows_v[same_obs_columns].duplicated().sum() > 0
|
|
1204
1351
|
):
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
f"{
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1352
|
+
|
|
1353
|
+
if raise_on_duplicate:
|
|
1354
|
+
print(
|
|
1355
|
+
f"{selected_rows[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
|
|
1356
|
+
)
|
|
1357
|
+
print(f"{selected_rows[same_obs_columns]}")
|
|
1358
|
+
print(
|
|
1359
|
+
f"{selected_rows_v[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
|
|
1360
|
+
)
|
|
1361
|
+
print(f"{selected_rows_v[same_obs_columns]}")
|
|
1362
|
+
raise Exception("There are duplicates in the components.")
|
|
1363
|
+
|
|
1364
|
+
else:
|
|
1365
|
+
selected_rows["dup_num"] = selected_rows.groupby(
|
|
1366
|
+
same_obs_columns
|
|
1367
|
+
).cumcount()
|
|
1368
|
+
selected_rows_v["dup_num"] = selected_rows_v.groupby(
|
|
1369
|
+
same_obs_columns
|
|
1370
|
+
).cumcount()
|
|
1214
1371
|
|
|
1215
1372
|
# Merge the two DataFrames on location and time columns
|
|
1216
1373
|
merged_df = pd.merge(
|
|
@@ -1227,4 +1384,7 @@ def construct_composit(df_comp, composite, components):
|
|
|
1227
1384
|
columns=[col for col in merged_df.columns if col.endswith("_v")]
|
|
1228
1385
|
)
|
|
1229
1386
|
|
|
1387
|
+
if "dup_num" in merged_df.columns:
|
|
1388
|
+
merged_df = merged_df.drop(columns=["dup_num"])
|
|
1389
|
+
|
|
1230
1390
|
return merged_df
|
|
@@ -165,6 +165,10 @@ class TestWriteAscii:
|
|
|
165
165
|
),
|
|
166
166
|
os.path.join(os.path.dirname(__file__), "data", "obs_seq.1d.final"),
|
|
167
167
|
os.path.join(os.path.dirname(__file__), "data", "obs_seq.out.GSI.small"),
|
|
168
|
+
os.path.join(os.path.dirname(__file__), "data", "obs_seq.final.qc2_2obs"),
|
|
169
|
+
os.path.join(os.path.dirname(__file__), "data", "obs_seq.in.all-id"),
|
|
170
|
+
os.path.join(os.path.dirname(__file__), "data", "obs_seq.in.mix"),
|
|
171
|
+
os.path.join(os.path.dirname(__file__), "data", "obs_seq.final.wrfhydro"),
|
|
168
172
|
],
|
|
169
173
|
)
|
|
170
174
|
def test_write_ascii(self, ascii_obs_seq_file_path, temp_dir):
|
|
@@ -420,32 +424,14 @@ class TestJoin:
|
|
|
420
424
|
assert obs_seq_mega.loc_mod == "loc3d"
|
|
421
425
|
assert obs_seq_mega.has_assimilation_info() == True
|
|
422
426
|
assert obs_seq_mega.has_posterior() == False
|
|
423
|
-
assert list(obs_seq_mega.types.keys()) == list(range(1,
|
|
427
|
+
assert list(obs_seq_mega.types.keys()) == list(range(1, 8)) # 7 obs types
|
|
424
428
|
obs_types = [
|
|
425
|
-
"AIRCRAFT_TEMPERATURE",
|
|
426
|
-
"BLUE_LAND_SFC_ALTIMETER",
|
|
427
|
-
"MARINE_SFC_SPECIFIC_HUMIDITY",
|
|
428
|
-
"SAT_V_WIND_COMPONENT",
|
|
429
|
-
"RADIOSONDE_SPECIFIC_HUMIDITY",
|
|
430
|
-
"MARINE_SFC_TEMPERATURE",
|
|
431
|
-
"RADIOSONDE_U_WIND_COMPONENT",
|
|
432
|
-
"MARINE_SFC_ALTIMETER",
|
|
433
|
-
"AIRCRAFT_V_WIND_COMPONENT",
|
|
434
|
-
"RADIOSONDE_SURFACE_ALTIMETER",
|
|
435
429
|
"ACARS_TEMPERATURE",
|
|
436
|
-
"LAND_SFC_ALTIMETER",
|
|
437
|
-
"MARINE_SFC_V_WIND_COMPONENT",
|
|
438
|
-
"AIRS_TEMPERATURE",
|
|
439
|
-
"GPSRO_REFRACTIVITY",
|
|
440
|
-
"MARINE_SFC_U_WIND_COMPONENT",
|
|
441
430
|
"ACARS_U_WIND_COMPONENT",
|
|
442
|
-
"RADIOSONDE_V_WIND_COMPONENT",
|
|
443
|
-
"SAT_U_WIND_COMPONENT",
|
|
444
|
-
"GREEN_LAND_SFC_ALTIMETER",
|
|
445
431
|
"ACARS_V_WIND_COMPONENT",
|
|
446
|
-
"
|
|
432
|
+
"AIRCRAFT_TEMPERATURE",
|
|
447
433
|
"AIRCRAFT_U_WIND_COMPONENT",
|
|
448
|
-
"
|
|
434
|
+
"AIRCRAFT_V_WIND_COMPONENT",
|
|
449
435
|
"PINK_LAND_SFC_ALTIMETER",
|
|
450
436
|
]
|
|
451
437
|
all_obs_present = all(
|
|
@@ -720,16 +706,16 @@ class TestUpdateTypesDicts:
|
|
|
720
706
|
return pd.DataFrame(data)
|
|
721
707
|
|
|
722
708
|
def test_update_types_dicts(self, sample_df):
|
|
723
|
-
reverse_types = {"ACARS_TEMPERATURE":
|
|
709
|
+
reverse_types = {"ACARS_TEMPERATURE": 32, "RADIOSONDE_U_WIND_COMPONENT": 51}
|
|
724
710
|
expected_reverse_types = {
|
|
725
|
-
"ACARS_TEMPERATURE":
|
|
726
|
-
"RADIOSONDE_U_WIND_COMPONENT":
|
|
727
|
-
"PINEAPPLE_COUNT":
|
|
711
|
+
"ACARS_TEMPERATURE": 32,
|
|
712
|
+
"RADIOSONDE_U_WIND_COMPONENT": 51,
|
|
713
|
+
"PINEAPPLE_COUNT": 52,
|
|
728
714
|
}
|
|
729
715
|
expected_types = {
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
716
|
+
32: "ACARS_TEMPERATURE",
|
|
717
|
+
51: "RADIOSONDE_U_WIND_COMPONENT",
|
|
718
|
+
52: "PINEAPPLE_COUNT",
|
|
733
719
|
}
|
|
734
720
|
|
|
735
721
|
updated_reverse_types, types = obsq.ObsSequence.update_types_dicts(
|
|
@@ -846,14 +832,98 @@ class TestCompositeTypes:
|
|
|
846
832
|
== orig_df.loc[orig_df["type"] == "ACARS_TEMPERATURE", col].values[0]
|
|
847
833
|
)
|
|
848
834
|
|
|
849
|
-
def
|
|
835
|
+
def test_composite_types_dups_catch(self):
|
|
850
836
|
test_dir = os.path.dirname(__file__)
|
|
851
837
|
file_path = os.path.join(test_dir, "data", "dups-obs.final")
|
|
852
838
|
|
|
853
839
|
dup = obsq.ObsSequence(file_path)
|
|
854
840
|
# Test that composite_types raises an error
|
|
855
841
|
with pytest.raises(Exception, match="There are duplicates in the components."):
|
|
856
|
-
dup.composite_types()
|
|
842
|
+
dup.composite_types(raise_on_duplicate=True)
|
|
843
|
+
|
|
844
|
+
def test_composite_types_dups(self):
|
|
845
|
+
test_dir = os.path.dirname(__file__)
|
|
846
|
+
file_path = os.path.join(test_dir, "data", "dups-obs.final")
|
|
847
|
+
|
|
848
|
+
obs_seq = obsq.ObsSequence(file_path)
|
|
849
|
+
|
|
850
|
+
# Save the original DataFrame for comparison
|
|
851
|
+
orig_df = obs_seq.df.copy()
|
|
852
|
+
|
|
853
|
+
# Test that composite_types does not raise an error
|
|
854
|
+
obs_seq.composite_types(raise_on_duplicate=False)
|
|
855
|
+
|
|
856
|
+
# Verify that the DataFrame has the expected types
|
|
857
|
+
types = obs_seq.df["type"].unique()
|
|
858
|
+
expected_composite_types = [
|
|
859
|
+
"ACARS_TEMPERATURE",
|
|
860
|
+
"ACARS_U_WIND_COMPONENT",
|
|
861
|
+
"ACARS_V_WIND_COMPONENT",
|
|
862
|
+
"ACARS_HORIZONTAL_WIND",
|
|
863
|
+
]
|
|
864
|
+
assert len(types) == len(expected_composite_types)
|
|
865
|
+
for type in expected_composite_types:
|
|
866
|
+
assert type in types
|
|
867
|
+
|
|
868
|
+
# Verify composite types are correctly calculated
|
|
869
|
+
prior_columns = obs_seq.df.filter(regex="prior_ensemble").columns.tolist()
|
|
870
|
+
posterior_columns = obs_seq.df.filter(
|
|
871
|
+
regex="posterior_ensemble"
|
|
872
|
+
).columns.tolist()
|
|
873
|
+
combo_cols = ["observation", "obs_err_var"] + prior_columns + posterior_columns
|
|
874
|
+
|
|
875
|
+
for col in combo_cols:
|
|
876
|
+
u_wind = obs_seq.df.loc[
|
|
877
|
+
obs_seq.df["type"] == "ACARS_U_WIND_COMPONENT", col
|
|
878
|
+
].values[0]
|
|
879
|
+
v_wind = obs_seq.df.loc[
|
|
880
|
+
obs_seq.df["type"] == "ACARS_V_WIND_COMPONENT", col
|
|
881
|
+
].values[0]
|
|
882
|
+
wind = obs_seq.df.loc[
|
|
883
|
+
obs_seq.df["type"] == "ACARS_HORIZONTAL_WIND", col
|
|
884
|
+
].values[0]
|
|
885
|
+
assert np.isclose(
|
|
886
|
+
np.sqrt(u_wind**2 + v_wind**2), wind
|
|
887
|
+
), f"Mismatch in column {col}: {wind} != sqrt({u_wind}^2 + {v_wind}^2)"
|
|
888
|
+
|
|
889
|
+
# Verify that the non-composite columns are unchanged
|
|
890
|
+
for col in obs_seq.df.columns:
|
|
891
|
+
if col not in combo_cols:
|
|
892
|
+
assert (
|
|
893
|
+
obs_seq.df.loc[
|
|
894
|
+
obs_seq.df["type"] == "ACARS_U_WIND_COMPONENT", col
|
|
895
|
+
].values[0]
|
|
896
|
+
== orig_df.loc[
|
|
897
|
+
orig_df["type"] == "ACARS_U_WIND_COMPONENT", col
|
|
898
|
+
].values[0]
|
|
899
|
+
)
|
|
900
|
+
assert (
|
|
901
|
+
obs_seq.df.loc[
|
|
902
|
+
obs_seq.df["type"] == "ACARS_V_WIND_COMPONENT", col
|
|
903
|
+
].values[0]
|
|
904
|
+
== orig_df.loc[
|
|
905
|
+
orig_df["type"] == "ACARS_V_WIND_COMPONENT", col
|
|
906
|
+
].values[0]
|
|
907
|
+
)
|
|
908
|
+
|
|
909
|
+
# Horizontal wind not in original, should be the same as the component
|
|
910
|
+
for col in obs_seq.df.columns:
|
|
911
|
+
if col not in combo_cols and col != "type":
|
|
912
|
+
assert (
|
|
913
|
+
obs_seq.df.loc[
|
|
914
|
+
obs_seq.df["type"] == "ACARS_HORIZONTAL_WIND", col
|
|
915
|
+
].values[0]
|
|
916
|
+
== obs_seq.df.loc[
|
|
917
|
+
obs_seq.df["type"] == "ACARS_U_WIND_COMPONENT", col
|
|
918
|
+
].values[0]
|
|
919
|
+
)
|
|
920
|
+
|
|
921
|
+
# Verify that the non-composite types are unchanged for all columns
|
|
922
|
+
for col in obs_seq.df.columns:
|
|
923
|
+
assert (
|
|
924
|
+
obs_seq.df.loc[obs_seq.df["type"] == "ACARS_TEMPERATURE", col].values[0]
|
|
925
|
+
== orig_df.loc[orig_df["type"] == "ACARS_TEMPERATURE", col].values[0]
|
|
926
|
+
)
|
|
857
927
|
|
|
858
928
|
def test_no_yaml_file(self):
|
|
859
929
|
with pytest.raises(Exception):
|
|
@@ -874,6 +944,384 @@ class TestCompositeTypes:
|
|
|
874
944
|
with pytest.raises(yaml.YAMLError):
|
|
875
945
|
obsq.load_yaml_to_dict(broken_file)
|
|
876
946
|
|
|
947
|
+
def test_composite_types_more_than_two_components(self, tmpdir):
    """A composite definition that lists three components must be rejected."""
    # YAML describing a single composite type built from three component types.
    # NOTE(review): the indentation inside this literal is reconstructed so that
    # `components` nests under `acars_super_wind` -- confirm against the repo.
    yaml_text = """
    acars_super_wind:
        components: [ACARS_U_WIND_COMPONENT, ACARS_V_WIND_COMPONENT, ACARS_TEMPERATURE]
    """
    yaml_path = tmpdir.join("composite_more_than_two.yaml")
    with open(yaml_path, "w") as handle:
        handle.write(yaml_text)

    # The observation sequence fixture lives next to this test module.
    data_path = os.path.join(os.path.dirname(__file__), "data", "three-obs.final")
    obs_seq = obsq.ObsSequence(data_path)

    # Anything other than exactly two components should raise with this message.
    expected_message = "components must be a list of two component types."
    with pytest.raises(Exception, match=expected_message):
        obs_seq.composite_types(composite_types=str(yaml_path))
|
|
965
|
+
|
|
966
|
+
|
|
967
|
+
class TestUpdateAttributesFromDf:
    """Tests for ObsSequence.update_attributes_from_df().

    Each test assigns or edits ``obj.df`` directly, calls
    ``update_attributes_from_df()``, and then checks the attributes that the
    call is expected to refresh (columns, copy names/counts, ``obs_num`` and
    ``linked_list``).
    """

    def test_update_attributes_from_df(self):
        """Replacing the DataFrame refreshes columns, copies and numbering."""
        obj = obsq.ObsSequence(file=None)
        df1 = pd.DataFrame(
            {
                "obs_num": [1, 2],
                "observation": [10.0, 20.0],
                "linked_list": ["-1 2 -1", "1 -1 -1"],
                "type": ["A", "B"],
                "time": [dt.datetime(2020, 1, 1), dt.datetime(2020, 1, 2)],
            }
        )
        obj.df = df1
        obj.update_attributes_from_df()

        # Check initial state
        assert obj.columns == ["obs_num", "observation", "linked_list", "type", "time"]
        # Identity comparison: `== None` is unidiomatic and would turn into an
        # elementwise comparison if all_obs were ever an array-like object.
        assert obj.all_obs is None
        assert obj.copie_names == ["observation"]
        assert obj.n_copies == 1
        # Check linked_list and obs_num updated
        assert list(obj.df["obs_num"]) == [1, 2]
        assert list(
            obj.df["linked_list"]
        ) == obsq.ObsSequence.generate_linked_list_pattern(2)

        # Change the DataFrame
        df2 = pd.DataFrame(
            {
                "obs_num": [3],
                "observation": [30.0],
                "prior_ensemble_mean": [15.0],
                "linked_list": ["-1 -1 -1"],
                "type": ["C"],
                "time": [dt.datetime(2020, 1, 3)],
            }
        )
        obj.df = df2
        obj.update_attributes_from_df()

        # Check updated state
        assert obj.columns == [
            "obs_num",
            "observation",
            "prior_ensemble_mean",
            "linked_list",
            "type",
            "time",
        ]
        assert obj.all_obs is None
        assert "prior_ensemble_mean" in obj.copie_names
        assert obj.n_copies == 2  # observation and prior_ensemble_mean
        # The single remaining row came in as obs_num 3 and is expected to be
        # renumbered to 1.
        assert list(obj.df["obs_num"]) == [1]
        assert list(
            obj.df["linked_list"]
        ) == obsq.ObsSequence.generate_linked_list_pattern(1)

    def test_update_attributes_from_df_drop_column(self):
        """Dropping a copy column removes it from copie_names and n_copies."""
        obj = obsq.ObsSequence(file=None)
        df = pd.DataFrame(
            {
                "obs_num": [1, 2],
                "observation": [10.0, 20.0],
                "prior_ensemble_mean": [1.5, 2.5],
                "linked_list": ["-1 2 -1", "1 -1 -1"],
                "type": ["A", "B"],
                "time": [dt.datetime(2020, 1, 1), dt.datetime(2020, 1, 2)],
            }
        )
        obj.df = df
        obj.update_attributes_from_df()

        # Initial state
        assert "prior_ensemble_mean" in obj.copie_names
        assert obj.n_copies == 2  # observation and prior_ensemble_mean
        assert list(obj.df["obs_num"]) == [1, 2]
        assert list(
            obj.df["linked_list"]
        ) == obsq.ObsSequence.generate_linked_list_pattern(2)

        # Drop a column and update
        obj.df = obj.df.drop(columns=["prior_ensemble_mean"])
        obj.update_attributes_from_df()

        # Check that the dropped column is no longer present
        assert "prior_ensemble_mean" not in obj.copie_names
        assert obj.n_copies == 1  # only observation left
        assert list(obj.df["obs_num"]) == [1, 2]
        assert list(
            obj.df["linked_list"]
        ) == obsq.ObsSequence.generate_linked_list_pattern(2)

    def test_update_attributes_from_df_qc_counts(self):
        """Dropping the only QC column brings n_qc to 0 and empties qc_copie_names."""
        obj = obsq.ObsSequence(file=None)
        df = pd.DataFrame(
            {
                "obs_num": [1, 2],
                "observation": [10.0, 20.0],
                "DART_QC": [0, 1],
                "linked_list": ["-1 2 -1", "1 -1 -1"],
                "type": ["A", "B"],
                "time": [dt.datetime(2020, 1, 1), dt.datetime(2020, 1, 2)],
            }
        )
        obj.df = df
        # Seed the copy bookkeeping by hand before the first update.
        # NOTE(review): presumably update_attributes_from_df() relies on these
        # existing lists to tell QC from non-QC columns -- confirm.
        obj.copie_names = ["observation", "DART_QC"]
        obj.non_qc_copie_names = ["observation"]
        obj.qc_copie_names = ["DART_QC"]
        obj.n_non_qc = 1
        obj.n_qc = 1
        obj.update_attributes_from_df()

        # Check initial QC/non-QC counts
        assert obj.n_non_qc == 1
        assert obj.n_qc == 1
        assert obj.non_qc_copie_names == ["observation"]
        assert obj.qc_copie_names == ["DART_QC"]
        assert list(obj.df["obs_num"]) == [1, 2]
        assert list(
            obj.df["linked_list"]
        ) == obsq.ObsSequence.generate_linked_list_pattern(2)

        # Now drop the QC column and update
        obj.df = obj.df.drop(columns=["DART_QC"])
        obj.update_attributes_from_df()

        # Check that n_qc is now 0 and n_non_qc is 1
        assert obj.n_non_qc == 1
        assert obj.n_qc == 0
        assert obj.non_qc_copie_names == ["observation"]
        assert obj.qc_copie_names == []
        assert list(obj.df["obs_num"]) == [1, 2]
        assert list(
            obj.df["linked_list"]
        ) == obsq.ObsSequence.generate_linked_list_pattern(2)

    def test_update_attributes_from_df_drop_multiple_qc_copies(self):
        """Dropping two of three QC columns leaves exactly one QC copy."""
        obj = obsq.ObsSequence(file=None)
        # Initial DataFrame with 1 non-QC and 3 QC copies
        df = pd.DataFrame(
            {
                "obs_num": [1, 2],
                "observation": [10.0, 20.0],
                "QC1": [0, 1],
                "QC2": [1, 0],
                "QC3": [2, 2],
                "linked_list": ["-1 2 -1", "1 -1 -1"],
                "type": ["A", "B"],
                "time": [dt.datetime(2020, 1, 1), dt.datetime(2020, 1, 2)],
            }
        )
        obj.df = df
        # Seed the copy bookkeeping by hand before the first update.
        obj.copie_names = ["observation", "QC1", "QC2", "QC3"]
        obj.non_qc_copie_names = ["observation"]
        obj.qc_copie_names = ["QC1", "QC2", "QC3"]
        obj.n_non_qc = 1
        obj.n_qc = 3

        obj.update_attributes_from_df()

        # Check initial QC/non-QC counts
        assert obj.n_non_qc == 1
        assert obj.n_qc == 3
        assert obj.non_qc_copie_names == ["observation"]
        assert obj.qc_copie_names == ["QC1", "QC2", "QC3"]
        assert list(obj.df["obs_num"]) == [1, 2]
        assert list(
            obj.df["linked_list"]
        ) == obsq.ObsSequence.generate_linked_list_pattern(2)

        # Drop two QC columns and update
        obj.df = obj.df.drop(columns=["QC2", "QC3"])
        obj.update_attributes_from_df()

        # Check that only one QC copy remains
        assert obj.n_non_qc == 1
        assert obj.n_qc == 1
        assert obj.non_qc_copie_names == ["observation"]
        assert obj.qc_copie_names == ["QC1"]
        assert obj.copie_names == ["observation", "QC1"]
        assert list(obj.df["obs_num"]) == [1, 2]
        assert list(
            obj.df["linked_list"]
        ) == obsq.ObsSequence.generate_linked_list_pattern(2)

    def test_update_attributes_from_df_drop_row(self):
        """Dropping a row renumbers obs_num and regenerates linked_list."""
        obj = obsq.ObsSequence(file=None)
        df = pd.DataFrame(
            {
                "obs_num": [1, 2, 3],
                "observation": [10.0, 20.0, 30.0],
                "linked_list": ["-1 2 -1", "1 3 -1", "2 -1 -1"],
                "type": ["A", "B", "C"],
                "time": [
                    dt.datetime(2020, 1, 1),
                    dt.datetime(2020, 1, 2),
                    dt.datetime(2020, 1, 3),
                ],
            }
        )
        obj.df = df
        obj.update_attributes_from_df()

        # Drop the middle row (index 1)
        obj.df = obj.df.drop(index=1).reset_index(drop=True)
        obj.update_attributes_from_df()

        # After dropping, only rows with obs_num 1 and 3 remain, but obs_num
        # should be renumbered
        assert list(obj.df["obs_num"]) == [1, 2]
        assert list(
            obj.df["linked_list"]
        ) == obsq.ObsSequence.generate_linked_list_pattern(2)
        assert obj.n_copies == 1
        assert obj.n_qc == 0
        assert obj.n_non_qc == 1
        assert obj.copie_names == ["observation"]
        assert obj.columns == ["obs_num", "observation", "linked_list", "type", "time"]

    def test_update_attributes_from_df_add_column(self):
        """Inserting a new copy column is picked up as an extra non-QC copy."""
        obj = obsq.ObsSequence(file=None)
        df = pd.DataFrame(
            {
                "obs_num": [1, 2],
                "observation": [10.0, 20.0],
                "linked_list": ["-1 2 -1", "1 -1 -1"],
                "type": ["A", "B"],
                "time": [dt.datetime(2020, 1, 1), dt.datetime(2020, 1, 2)],
            }
        )
        obj.df = df
        obj.update_attributes_from_df()

        # Insert a new column between 'observation' and 'linked_list'
        insert_at = obj.df.columns.get_loc("linked_list")
        obj.df.insert(insert_at, "prior_ensemble_mean", [1.5, 2.5])
        obj.update_attributes_from_df()

        # Check that the new column is present and in the correct position
        assert obj.df.columns.tolist() == [
            "obs_num",
            "observation",
            "prior_ensemble_mean",
            "linked_list",
            "type",
            "time",
        ]
        assert "prior_ensemble_mean" in obj.copie_names
        assert obj.n_copies == 2  # observation and prior_ensemble_mean
        assert obj.n_qc == 0  # no QC columns
        assert obj.n_non_qc == 2
        assert list(obj.df["obs_num"]) == [1, 2]
        assert list(
            obj.df["linked_list"]
        ) == obsq.ObsSequence.generate_linked_list_pattern(2)
|
|
1221
|
+
|
|
1222
|
+
|
|
1223
|
+
class TestQC2Replacement:
    """Tests for ObsSequence.replace_qc2_nan and ObsSequence.revert_qc2_nan.

    Both fixtures hold four rows, of which the two middle rows have
    ``DART_quality_control == 2``; the assertions only inspect those rows.
    """

    @pytest.fixture
    def obs_seq(self):
        """ObsSequence whose QC 2 rows carry -888888.0 in every posterior column."""
        seq = obsq.ObsSequence(file=None)
        seq.df = pd.DataFrame(
            {
                "DART_quality_control": [0, 2, 2, 0],
                "posterior_ensemble_mean": [1.1, -888888.0, -888888.0, 2.2],
                "posterior_ensemble_spread": [0.1, -888888.0, -888888.0, 0.2],
                "posterior_ensemble_member_1": [1.0, -888888.0, -888888.0, 2.0],
                "posterior_ensemble_member_2": [1.2, -888888.0, -888888.0, 2.3],
            }
        )
        return seq

    @pytest.fixture
    def obs_seq_nan(self):
        """ObsSequence whose QC 2 rows carry NaN in every posterior column."""
        seq = obsq.ObsSequence(file=None)
        seq.df = pd.DataFrame(
            {
                "DART_quality_control": [0, 2, 2, 0],
                "posterior_ensemble_mean": [1.1, np.nan, np.nan, 2.2],
                "posterior_ensemble_spread": [0.1, np.nan, np.nan, 0.2],
                "posterior_ensemble_member_1": [1.0, np.nan, np.nan, 2.0],
                "posterior_ensemble_member_2": [1.2, np.nan, np.nan, 2.3],
            }
        )
        return seq

    def test_replace_qc2_nan(self, obs_seq):
        """After replace_qc2_nan, every posterior value on a QC 2 row is NaN."""
        # The helper is called on the frame itself; its return value is unused.
        obsq.ObsSequence.replace_qc2_nan(obs_seq.df)

        frame = obs_seq.df
        is_qc2 = frame["DART_quality_control"] == 2.0
        for column in (
            "posterior_ensemble_mean",
            "posterior_ensemble_spread",
            "posterior_ensemble_member_1",
            "posterior_ensemble_member_2",
        ):
            qc2_values = frame.loc[is_qc2, column]
            assert qc2_values.isnull().all()

    def test_revert_qc2_nan(self, obs_seq_nan):
        """After revert_qc2_nan, QC 2 rows hold MISSING_R8 (-888888.0) again."""
        # The helper is called on the frame itself; its return value is unused.
        obsq.ObsSequence.revert_qc2_nan(obs_seq_nan.df)

        frame = obs_seq_nan.df
        is_qc2 = frame["DART_quality_control"] == 2.0
        for column in (
            "posterior_ensemble_mean",
            "posterior_ensemble_spread",
            "posterior_ensemble_member_1",
            "posterior_ensemble_member_2",
        ):
            restored = frame.loc[is_qc2, column] == -888888.0
            assert restored.all()
|
|
1324
|
+
|
|
877
1325
|
|
|
878
1326
|
if __name__ == "__main__":
    # pytest.main() returns the exit code of the test run. Raising SystemExit
    # with it makes a direct run of this file exit non-zero when tests fail,
    # instead of silently discarding the result.
    raise SystemExit(pytest.main())
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|