pydartdiags 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pydartdiags might be problematic.
- pydartdiags/matplots/matplots.py +200 -20
- pydartdiags/obs_sequence/composite_types.yaml +35 -0
- pydartdiags/obs_sequence/obs_sequence.py +268 -161
- pydartdiags/stats/stats.py +230 -43
- {pydartdiags-0.5.0.dist-info → pydartdiags-0.6.0.dist-info}/METADATA +4 -3
- pydartdiags-0.6.0.dist-info/RECORD +15 -0
- {pydartdiags-0.5.0.dist-info → pydartdiags-0.6.0.dist-info}/WHEEL +1 -1
- pydartdiags-0.5.0.dist-info/RECORD +0 -14
- {pydartdiags-0.5.0.dist-info → pydartdiags-0.6.0.dist-info/licenses}/LICENSE +0 -0
- {pydartdiags-0.5.0.dist-info → pydartdiags-0.6.0.dist-info}/top_level.txt +0 -0
pydartdiags/obs_sequence/obs_sequence.py

@@ -9,7 +9,7 @@ import struct
 
 def requires_assimilation_info(func):
     def wrapper(self, *args, **kwargs):
-        if self.has_assimilation_info:
+        if self.has_assimilation_info():
             return func(self, *args, **kwargs)
         else:
             raise ValueError(
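In 0.5.0 `has_assimilation_info` was a boolean attribute; in 0.6.0 it is a method (added near the end of this diff), so the decorator now calls it. The distinction matters: a bound method referenced without parentheses is always truthy, so an attribute-style guard would never raise once the attribute became a method. A minimal sketch of the failure mode, using a hypothetical `Seq` class:

```python
class Seq:
    def has_assimilation_info(self):
        return False  # pretend the sequence has no assimilation info

s = Seq()
if s.has_assimilation_info:        # bound method object: always truthy -> silent bug
    print("wrongly passes the guard")
if not s.has_assimilation_info():  # calling it yields the real answer
    print("correctly detects missing assimilation info")
```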
@@ -19,27 +19,46 @@ def requires_assimilation_info(func):
     return wrapper
 
 
-
-
-
-
-        else:
-            raise ValueError("Posterior information is required to call this function.")
-
-    return wrapper
+class ObsSequence:
+    """
+    Initialize an ObsSequence object from an ASCII or binary observation sequence file,
+    or create an empty ObsSequence object from scratch.
 
+    1D observations are given a datetime of days, seconds since 2000-01-01 00:00:00
 
-
-    """
-    Initialize an obs_sequence object from an ASCII or binary observation sequence file,
-    or create an empty obs_sequence object from scratch.
+    3D observations are given a datetime of days, seconds since 1601-01-01 00:00:00 (DART Gregorian calendar)
 
     Args:
         file (str): The input observation sequence ASCII or binary file.
-
+            If None, an empty ObsSequence object is created from scratch.
+        synonyms (list, optional): List of additional synonyms for the observation column in the DataFrame.
+            The default list is
+
+            .. code-block:: python
+
+                ['NCEP BUFR observation',
+                 'AIRS observation',
+                 'GTSPP observation',
+                 'SST observation',
+                 'observations',
+                 'WOD observation']
+
+            You can add more synonyms by providing a list of strings when
+            creating the ObsSequence object.
+
+            .. code-block:: python
+
+                ObsSequence(file, synonyms=['synonym1', 'synonym2'])
+
+    Raises:
+        ValueError: If neither 'loc3d' nor 'loc1d' could be found in the observation sequence.
+
+    Examples:
+
+    .. code-block:: python
+
+        obs_seq = ObsSequence(file='obs_seq.final')
 
-    Returns:
-        An obs_sequence object
 
     Attributes:
         df (pandas.DataFrame): The DataFrame containing the observation sequence data.
@@ -64,36 +83,18 @@ class obs_sequence:
             - scale height: 'VERTISSCALEHEIGHT' (unitless)
         loc_mod (str): The location model, either 'loc3d' or 'loc1d'.
             For 3D sphere models: latitude and longitude are in degrees in the DataFrame.
-        types (dict): Dictionary of types of observations the observation sequence,
+        types (dict): Dictionary of types of observations in the observation sequence,
             e.g. {23: 'ACARS_TEMPERATURE'},
         reverse_types (dict): Dictionary of types with keys and values reversed, e.g
             {'ACARS_TEMPERATURE': 23}
         synonyms_for_obs (list): List of synonyms for the observation column in the DataFrame.
-            The defualt list is
 
-            .. code-block:: python
 
-                [ 'NCEP BUFR observation',
-                 'AIRS observation',
-                 'GTSPP observation',
-                 'SST observation',
-                 'observations',
-                 'WOD observation']
-
-            You can add more synonyms by providing a list of strings when
-            creating the obs_sequence object.
-
-            .. code-block:: python
-
-                obs_sequence(file, synonyms=['synonym1', 'synonym2']).df
-
-        has_assimilation_info (bool): Indicates if assimilation information is present.
-        has_posterior (bool): Indicates if posterior information is present.
         seq (generator): Generator of observations from the observation sequence file.
         all_obs (list): List of all observations, each observation is a list.
-            Valid when the
-            Set to None when the
-
+            Valid when the ObsSequence is created from a file.
+            Set to None when the ObsSequence is created from scratch or multiple
+            ObsSequences are joined.
     """
 
     vert = {
@@ -108,29 +109,8 @@ class obs_sequence:
     reversed_vert = {value: key for key, value in vert.items()}
 
     def __init__(self, file, synonyms=None):
-        """
-        Create an obs_sequence object from an ASCII or binary observation sequence file,
-        or create an empty obs_sequence object from scratch.
-
-        Args:
-            file (str): The input observation sequence ASCII or binary file.
-                If None, an empty obs_sequence object is created from scratch.
-            synonyms (list, optional): List of synonyms for the observation column in the DataFrame.
-
-        Returns:
-            an obs_sequence object
-
-        Examples:
-
-        .. code-block:: python
-
-            obs_seq = obs_sequence(file='obs_seq.final')
-
-        """
 
         self.loc_mod = "None"
-        self.has_assimilation_info = False
-        self.has_posterior = False
         self.file = file
         self.synonyms_for_obs = [
             "NCEP BUFR observation",
@@ -146,6 +126,9 @@ class obs_sequence:
         else:
             self.synonyms_for_obs.append(synonyms)
 
+        module_dir = os.path.dirname(__file__)
+        self.default_composite_types = os.path.join(module_dir, "composite_types.yaml")
+
         if file is None:
             # Early exit - for testing purposes or creating obs_seq objects from scratch
             self.df = pd.DataFrame()
@@ -161,9 +144,6 @@ class obs_sequence:
             self.all_obs = []
             return
 
-        module_dir = os.path.dirname(__file__)
-        self.default_composite_types = os.path.join(module_dir, "composite_types.yaml")
-
         if self.is_binary(file):
             self.header = self.read_binary_header(file)
         else:
@@ -204,12 +184,6 @@ class obs_sequence:
         }
         self.df = self.df.rename(columns=rename_dict)
 
-        # check if the assimilation info is present
-        if "prior_ensemble_mean".casefold() in map(str.casefold, self.columns):
-            self.has_assimilation_info = True
-        if "posterior_ensemble_mean".casefold() in map(str.casefold, self.columns):
-            self.has_posterior = True
-
     def create_all_obs(self):
         """steps through the generator to create a
        list of all observations in the sequence
@@ -232,7 +206,7 @@ class obs_sequence:
             data.append(float(location[0]))  # location x
             data.append(float(location[1]))  # location y
             data.append(float(location[2]))  # location z
-            data.append(
+            data.append(ObsSequence.vert[int(location[3])])
             self.loc_mod = "loc3d"
         except ValueError:
             try:
@@ -261,9 +235,13 @@ class obs_sequence:
         time = obs[-2].split()
         data.append(int(time[0]))  # seconds
         data.append(int(time[1]))  # days
-
-        convert_dart_time(int(time[0]), int(time[1]))
-
+        if self.loc_mod == "loc3d":
+            data.append(convert_dart_time(int(time[0]), int(time[1])))
+        else:  # HK todo what is appropriate for 1d models?
+            data.append(
+                dt.datetime(2000, 1, 1)
+                + dt.timedelta(seconds=int(time[0]), days=int(time[1]))
+            )
         data.append(float(obs[-1]))  # obs error variance ?convert to sd?
 
         return data
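The branch encodes the two epochs stated in the new class docstring: 3D observations go through `convert_dart_time` (days and seconds on the DART Gregorian calendar, based at 1601-01-01), while 1D observations are anchored at 2000-01-01. A standalone sketch of the same arithmetic; the 1601 base for `convert_dart_time` is inferred from the docstring, so treat it as an assumption:

```python
import datetime as dt

def dart_time_to_datetime(seconds, days, base=dt.datetime(1601, 1, 1)):
    # DART Gregorian calendar: days and seconds counted from the base date
    return base + dt.timedelta(days=days, seconds=seconds)

print(dart_time_to_datetime(43200, 151240))   # a 3D observation time
print(dt.datetime(2000, 1, 1)                 # a 1D observation time
      + dt.timedelta(seconds=43200, days=10))
```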
@@ -355,20 +333,13 @@ class obs_sequence:
             obsq.write_obs_seq('obs_seq.new')
 
         """
-        with open(file, "w") as f:
 
-
-            num_rows = len(self.df)
-            replacement_string = f"num_obs: {num_rows:>10} max_num_obs: {num_rows:>10}"
-            new_header = [
-                replacement_string if "num_obs" in element else element
-                for element in self.header
-            ]
+        self.create_header_from_dataframe()
 
-
+        with open(file, "w") as f:
+
+            for line in self.header:
                 f.write(str(line) + "\n")
-            first = 1
-            f.write(f"first: {first:>12} last: {num_rows:>12}\n")
 
         # TODO HK is there something better than copying the whole thing here?
         df_copy = self.df.copy()  # copy since you want to change for writing.
@@ -376,15 +347,24 @@ class obs_sequence:
         if self.loc_mod == "loc3d":
             df_copy["longitude"] = np.deg2rad(self.df["longitude"]).round(16)
             df_copy["latitude"] = np.deg2rad(self.df["latitude"]).round(16)
-        if "
-            df_copy = df_copy.drop(
+        if "prior_bias" in df_copy.columns:
+            df_copy = df_copy.drop(
+                columns=["prior_bias", "prior_sq_err", "prior_totalvar"]
+            )
+        if "posterior_bias" in df_copy.columns:
+            df_copy = df_copy.drop(
+                columns=["posterior_bias", "posterior_sq_err", "posterior_totalvar"]
+            )
+        if "midpoint" in df_copy.columns:
+            df_copy = df_copy.drop(columns=["midpoint", "vlevels"])
 
         # linked list for reading by dart programs
         df_copy = df_copy.sort_values(
             by=["time"], kind="stable"
         )  # sort the DataFrame by time
-        df_copy
-        df_copy["
+        df_copy.reset_index(drop=True, inplace=True)
+        df_copy["obs_num"] = df_copy.index + 1  # obs_num in time order
+        df_copy["linked_list"] = ObsSequence.generate_linked_list_pattern(
             len(df_copy)
         )  # linked list pattern
 
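`generate_linked_list_pattern` itself is not shown in this diff. In an obs_seq file every observation carries a `previous next cov_group` integer triple linking the sequence in time order (the binary reader later in this diff unpacks exactly three such integers with these field widths), so for n observations written in sorted order the pattern is mechanical. A plausible sketch under that assumption:

```python
def generate_linked_list_pattern(n):
    # Hypothetical reconstruction: obs i (1-based) points back to i-1 and
    # forward to i+1, with -1 marking "none" and the covariance group.
    pattern = []
    for i in range(1, n + 1):
        prev_obs = i - 1 if i > 1 else -1
        next_obs = i + 1 if i < n else -1
        pattern.append(f"{prev_obs:<12} {next_obs:<10} {-1:<12}")
    return pattern

print(generate_linked_list_pattern(3))
```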
@@ -395,6 +375,97 @@ class obs_sequence:
 
         df_copy.apply(write_row, axis=1)
 
+    @staticmethod
+    def update_types_dicts(df, reverse_types):
+        """
+        Ensure all unique observation types are in the reverse_types dictionary and create
+        the types dictionary.
+
+        Args:
+            df (pd.DataFrame): The DataFrame containing the observation sequence data.
+            reverse_types (dict): The dictionary mapping observation types to their corresponding integer values.
+
+        Returns:
+            dict: The updated reverse_types dictionary.
+            dict: The types dictionary with keys sorted in numerical order.
+        """
+        # Create a dictionary of observation types from the dataframe
+        unique_types = df["type"].unique()
+
+        # Ensure all unique types are in reverse_types
+        for obs_type in unique_types:
+            if obs_type not in reverse_types:
+                new_id = int(max(reverse_types.values(), default=0)) + 1
+                reverse_types[obs_type] = str(new_id)
+
+        not_sorted_types = {
+            reverse_types[obs_type]: obs_type for obs_type in unique_types
+        }
+        types = {
+            k: not_sorted_types[k] for k in sorted(not_sorted_types)
+        }  # to get keys in numerical order
+
+        return reverse_types, types
+
+    def create_header_from_dataframe(self):
+        """
+        Create a header for the observation sequence based on the data in the DataFrame.
+
+        It creates a dictionary of unique observation types, counts the
+        number of observations, and constructs the header with necessary information.
+
+        Example:
+            self.create_header_from_dataframe()
+
+        """
+
+        self.reverse_types, self.types = self.update_types_dicts(
+            self.df, self.reverse_types
+        )
+
+        num_obs = len(self.df)
+
+        self.header = []
+        self.header.append("obs_sequence")
+        self.header.append("obs_type_definitions")
+        self.header.append(f"{len(self.types)}")
+        for key, value in self.types.items():
+            self.header.append(f"{key} {value}")
+        self.header.append(
+            f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}"
+        )  # @todo HK not keeping track if num_qc changes
+        self.header.append(f"num_obs: {num_obs:>10} max_num_obs: {num_obs:>10}")
+        stats_cols = [
+            "prior_bias",
+            "prior_sq_err",
+            "prior_totalvar",
+            "posterior_bias",
+            "posterior_sq_err",
+            "posterior_totalvar",
+        ]
+        level_cols = ["vlevels", "midpoint"]
+        non_copie_cols = [
+            "obs_num",
+            "linked_list",
+            "longitude",
+            "latitude",
+            "vertical",
+            "vert_unit",
+            "type",
+            "metadata",
+            "external_FO",
+            "seconds",
+            "days",
+            "time",
+            "obs_err_var",
+            "location",
+        ]
+        for copie in self.df.columns:
+            if copie not in stats_cols + non_copie_cols + level_cols:
+                self.header.append(copie.replace("_", " "))
+        first = 1
+        self.header.append(f"first: {first:>12} last: {num_obs:>12}")
+
     def column_headers(self):
         """define the columns for the dataframe"""
         heading = []
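A toy run shows the contract of `update_types_dicts`: observation types missing from `reverse_types` get the next free integer id (stored as a string, as in the code above), and `types` is rebuilt keyed by id in sorted order. The sample data here is invented:

```python
import pandas as pd

df = pd.DataFrame({"type": ["ACARS_TEMPERATURE", "RADIOSONDE_U_WIND_COMPONENT"]})
reverse_types = {"ACARS_TEMPERATURE": "23"}

unique_types = df["type"].unique()
for obs_type in unique_types:
    if obs_type not in reverse_types:
        new_id = int(max(reverse_types.values(), default=0)) + 1
        reverse_types[obs_type] = str(new_id)

types = dict(sorted((reverse_types[t], t) for t in unique_types))
print(reverse_types)  # {'ACARS_TEMPERATURE': '23', 'RADIOSONDE_U_WIND_COMPONENT': '24'}
print(types)          # {'23': 'ACARS_TEMPERATURE', '24': 'RADIOSONDE_U_WIND_COMPONENT'}
```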
@@ -440,14 +511,18 @@ class obs_sequence:
         return self.df[self.df["DART_quality_control"] == dart_qc]
 
     @requires_assimilation_info
-    def
+    def select_used_qcs(self):
         """
-        Select rows from the DataFrame where the
+        Select rows from the DataFrame where the observation was used.
+        Includes observations for which the posterior forward observation operators failed.
 
         Returns:
-            pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag
+            pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag 0 or 2.
         """
-        return self.df[
+        return self.df[
+            (self.df["DART_quality_control"] == 0)
+            | (self.df["DART_quality_control"] == 2)
+        ]
 
     @requires_assimilation_info
     def possible_vs_used(self):
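For context, DART quality-control flag 0 marks an observation that was assimilated and flag 2 one that was assimilated although its posterior forward operator failed; both therefore count as "used". The same filter can be written with `isin`; a sketch on invented data:

```python
import pandas as pd

df = pd.DataFrame({"DART_quality_control": [0, 2, 4, 7],
                   "observation": [1.0, 2.0, 3.0, 4.0]})

used = df[df["DART_quality_control"].isin([0, 2])]  # equivalent to (== 0) | (== 2)
print(used)  # keeps the first two rows
```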
@@ -456,7 +531,7 @@ class obs_sequence:
 
         This function takes a DataFrame containing observation data, including a 'type' column for the observation
         type and an 'observation' column. The number of used observations ('used'), is the total number
-
+        of assimilated observations (as determined by the `select_used_qcs` function).
         The result is a DataFrame with each observation type, the count of possible observations, and the count of
         used observations.
 
@@ -468,8 +543,8 @@ class obs_sequence:
         possible = self.df.groupby("type")["observation"].count()
         possible.rename("possible", inplace=True)
 
-
-        used =
+        used_qcs = self.select_used_qcs().groupby("type")["observation"].count()
+        used = used_qcs.reindex(possible.index, fill_value=0)
         used.rename("used", inplace=True)
 
         return pd.concat([possible, used], axis=1).reset_index()
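The `reindex(..., fill_value=0)` step matters for observation types that occur in the sequence but were never assimilated: without it those types would drop out of (or be NaN in) the `used` column. A toy illustration with invented counts:

```python
import pandas as pd

possible = pd.Series({"ACARS_TEMPERATURE": 10, "GPSRO_REFRACTIVITY": 5},
                     name="possible")
used_qcs = pd.Series({"ACARS_TEMPERATURE": 8})  # GPSRO type never used

used = used_qcs.reindex(possible.index, fill_value=0).rename("used")
print(pd.concat([possible, used], axis=1).reset_index())
#                 index  possible  used
# 0   ACARS_TEMPERATURE        10     8
# 1  GPSRO_REFRACTIVITY         5     0
```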
@@ -511,7 +586,7 @@ class obs_sequence:
         with open(file, "rb") as f:
             while True:
                 # Read the record length
-                record_length =
+                record_length = ObsSequence.read_record_length(f)
                 if record_length is None:
                     break
                 record = f.read(record_length)
@@ -519,7 +594,7 @@ class obs_sequence:
                     break
 
                 # Read the trailing record length (should match the leading one)
-
+                ObsSequence.check_trailing_record_length(f, record_length)
 
                 linecount += 1
 
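The pattern repeated throughout these binary-reading hunks is the framing of Fortran unformatted sequential files: a 4-byte record length, the payload, then the same 4-byte length again. The bodies of the new `read_record_length` and `check_trailing_record_length` helpers are not included in this diff; a minimal sketch of what such helpers typically do, with all details assumed:

```python
import struct

def read_record_length(f):
    # Leading 4-byte record marker; None signals end of file.
    raw = f.read(4)
    if len(raw) < 4:
        return None
    return struct.unpack("i", raw)[0]

def check_trailing_record_length(f, record_length):
    # Fortran repeats the record length after the payload; a mismatch
    # indicates a corrupt or misaligned file.
    trailing = struct.unpack("i", f.read(4))[0]
    if trailing != record_length:
        raise ValueError(
            f"leading record length {record_length} != trailing {trailing}"
        )
```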
@@ -537,7 +612,7 @@ class obs_sequence:
         f.seek(0)
 
         for _ in range(2):
-            record_length =
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:
                 break
 
@@ -545,7 +620,7 @@ class obs_sequence:
             if not record:  # end of file
                 break
 
-
+            ObsSequence.check_trailing_record_length(f, record_length)
             header.append(record.decode("utf-8").strip())
 
         header.append(str(obs_types_definitions))
@@ -553,7 +628,7 @@ class obs_sequence:
         # obs_types_definitions
         for _ in range(3, 4 + obs_types_definitions):
             # Read the record length
-            record_length =
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:
                 break
 
@@ -562,7 +637,7 @@ class obs_sequence:
             if not record:  # end of file
                 break
 
-
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             if _ == 3:
                 continue  # num obs_types_definitions
@@ -580,7 +655,7 @@ class obs_sequence:
             5 + obs_types_definitions + num_copies + num_qcs + 1,
         ):
             # Read the record length
-            record_length =
+            record_length = ObsSequence.read_record_length(f)
             if record_length is None:
                 break
 
@@ -589,7 +664,7 @@ class obs_sequence:
             if not record:
                 break
 
-
+            ObsSequence.check_trailing_record_length(f, record_length)
 
             if _ == 5 + obs_types_definitions:
                 continue
@@ -600,12 +675,12 @@ class obs_sequence:
 
         # first and last obs
         # Read the record length
-        record_length =
+        record_length = ObsSequence.read_record_length(f)
 
         # Read the actual record
         record = f.read(record_length)
 
-
+        ObsSequence.check_trailing_record_length(f, record_length)
 
         # Read the whole record as a two integers
         first, last = struct.unpack("ii", record)[:8]
@@ -730,7 +805,7 @@ class obs_sequence:
             # Skip the first len(obs_seq.header) lines
             for _ in range(header_length - 1):
                 # Read the record length
-                record_length =
+                record_length = ObsSequence.read_record_length(f)
                 if record_length is None:  # End of file
                     break
 
@@ -747,7 +822,7 @@ class obs_sequence:
                 obs.append(f"OBS {obs_num}")
                 for _ in range(n):  # number of copies
                     # Read the record length
-                    record_length =
+                    record_length = ObsSequence.read_record_length(f)
                     if record_length is None:
                         break
                     # Read the actual record (copie)
@@ -755,10 +830,10 @@ class obs_sequence:
                     obs.append(struct.unpack("d", record)[0])
 
                     # Read the trailing record length (should match the leading one)
-
+                    ObsSequence.check_trailing_record_length(f, record_length)
 
                 # linked list info
-                record_length =
+                record_length = ObsSequence.read_record_length(f)
                 if record_length is None:
                     break
 
@@ -767,17 +842,17 @@ class obs_sequence:
                 linked_list_string = f"{int1:<12} {int2:<10} {int3:<12}"
                 obs.append(linked_list_string)
 
-
+                ObsSequence.check_trailing_record_length(f, record_length)
 
                 # location (note no location header "loc3d" or "loc1d" for binary files)
                 obs.append("loc3d")
-                record_length =
+                record_length = ObsSequence.read_record_length(f)
                 record = f.read(record_length)
                 x, y, z, vert = struct.unpack("dddi", record[:28])
                 location_string = f"{x} {y} {z} {vert}"
                 obs.append(location_string)
 
-
+                ObsSequence.check_trailing_record_length(f, record_length)
 
                 # kind (type of observation) value
                 obs.append("kind")
@@ -787,23 +862,23 @@ class obs_sequence:
                 kind = f"{struct.unpack('i', record)[0]}"
                 obs.append(kind)
 
-
+                ObsSequence.check_trailing_record_length(f, record_length)
 
                 # time (seconds, days)
-                record_length =
+                record_length = ObsSequence.read_record_length(f)
                 record = f.read(record_length)
                 seconds, days = struct.unpack("ii", record)[:8]
                 time_string = f"{seconds} {days}"
                 obs.append(time_string)
 
-
+                ObsSequence.check_trailing_record_length(f, record_length)
 
                 # obs error variance
-                record_length =
+                record_length = ObsSequence.read_record_length(f)
                 record = f.read(record_length)
                 obs.append(struct.unpack("d", record)[0])
 
-
+                ObsSequence.check_trailing_record_length(f, record_length)
 
                 yield obs
 
@@ -816,7 +891,8 @@ class obs_sequence:
         components and adds them to the DataFrame.
 
         Args:
-            composite_types (str, optional): The YAML configuration for composite types.
+            composite_types (str, optional): The YAML configuration for composite types.
+                If 'use_default', the default configuration is used. Otherwise, a custom YAML configuration can be provided.
 
         Returns:
             pd.DataFrame: The updated DataFrame with the new composite rows added.
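The default configuration is the new `composite_types.yaml` listed at the top of this diff (+35 lines), whose contents are not shown here. From the way the code indexes `self.composite_types_dict[key]["components"]`, a loaded entry plausibly has the following shape; the names below are invented:

```python
# Hypothetical shape of the loaded YAML; the real composite_types.yaml
# is not part of the visible diff.
composite_types_dict = {
    "RADIOSONDE_WIND_SPEED": {
        "components": ["RADIOSONDE_U_WIND_COMPONENT",
                       "RADIOSONDE_V_WIND_COMPONENT"],
    },
}

for key in composite_types_dict:
    print(key, composite_types_dict[key]["components"])
```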
@@ -838,69 +914,68 @@ class obs_sequence:
         if len(components) != len(set(components)):
             raise Exception("There are repeat values in components.")
 
+        # data frame for the composite types
         df_comp = self.df[
             self.df["type"]
             .str.upper()
             .isin([component.upper() for component in components])
         ]
-        df_no_comp = self.df[
-            ~self.df["type"]
-            .str.upper()
-            .isin([component.upper() for component in components])
-        ]
 
+        df = pd.DataFrame()
         for key in self.composite_types_dict:
             df_new = construct_composit(
                 df_comp, key, self.composite_types_dict[key]["components"]
             )
-
+            df = pd.concat([df, df_new], axis=0)
 
-
+        # add the composite types to the DataFrame
+        self.df = pd.concat([self.df, df], axis=0)
+        return
 
     @classmethod
     def join(cls, obs_sequences, copies=None):
         """
         Join a list of observation sequences together.
 
-        This method combines the headers and observations from a list of
-        into a single
+        This method combines the headers and observations from a list of ObsSequence objects
+        into a single ObsSequence object.
 
         Args:
-            obs_sequences (list of
+            obs_sequences (list of ObsSequences): The list of observation sequences objects to join.
             copies (list of str, optional): A list of copy names to include in the combined data.
                 If not provided, all copies are included.
 
         Returns:
-            A new
+            A new ObsSequence object containing the combined data.
 
         Example:
             .. code-block:: python
 
-                obs_seq1 =
-                obs_seq2 =
-                obs_seq3 =
-                combined =
+                obs_seq1 = ObsSequence(file='obs_seq1.final')
+                obs_seq2 = ObsSequence(file='obs_seq2.final')
+                obs_seq3 = ObsSequence(file='obs_seq3.final')
+                combined = ObsSequence.join([obs_seq1, obs_seq2, obs_seq3])
         """
         if not obs_sequences:
             raise ValueError("The list of observation sequences is empty.")
 
-        # Create a new
+        # Create a new ObsSequence object with the combined data
         combo = cls(file=None)
 
         # Check if all obs_sequences have compatible attributes
         first_loc_mod = obs_sequences[0].loc_mod
-        first_has_assimilation_info = obs_sequences[0].has_assimilation_info
-        first_has_posterior = obs_sequences[0].has_posterior
+        first_has_assimilation_info = obs_sequences[0].has_assimilation_info()
+        first_has_posterior = obs_sequences[0].has_posterior()
         for obs_seq in obs_sequences:
             if obs_seq.loc_mod != first_loc_mod:
                 raise ValueError(
                     "All observation sequences must have the same loc_mod."
                 )
-            if obs_seq.has_assimilation_info != first_has_assimilation_info:
+            if obs_seq.has_assimilation_info() != first_has_assimilation_info:
                 raise ValueError(
                     "All observation sequences must have assimilation info."
                 )
-            if obs_seq.has_posterior != first_has_posterior:
+            if obs_seq.has_posterior() != first_has_posterior:
                 raise ValueError(
                     "All observation sequences must have the posterior info."
                 )
@@ -908,7 +983,7 @@ class obs_sequence:
         combo.loc_mod = first_loc_mod
 
         # check the copies are compatible (list of copies to combine?)
-        # subset of copies if needed
+        # subset of copies if needed # @todo HK 1d or 3d
         if copies:
             start_required_columns = ["obs_num", "observation"]
             end_required_columns = [
@@ -1009,30 +1084,40 @@ class obs_sequence:
 
         # create linked list for obs
         combo.df = combined_df.sort_values(by="time").reset_index(drop=True)
-        combo.df["linked_list"] =
+        combo.df["linked_list"] = ObsSequence.generate_linked_list_pattern(
             len(combo.df)
         )
         combo.df["obs_num"] = combined_df.index + 1
         combo.create_header(len(combo.df))
 
-        # set assimilation info (mean and spread) (prior and posterior)
-        combo.has_assimilation_info = "prior_ensemble_mean".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-        combo.has_assimilation_info = "prior_ensemble_spread".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-        combo.has_posterior = "posterior_ensemble_mean".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-        combo.has_posterior = "posterior_ensemble_spread".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-
         return combo
 
+    def has_assimilation_info(self):
+        """
+        Check if the DataFrame has prior information.
+
+        Returns:
+            bool: True if both 'prior_ensemble_mean' and 'prior_ensemble_spread' columns are present, False otherwise.
+        """
+        return "prior_ensemble_mean".casefold() in map(
+            str.casefold, self.df.columns
+        ) and "prior_ensemble_spread".casefold() in map(str.casefold, self.df.columns)
+
+    def has_posterior(self):
+        """
+        Check if the DataFrame has posterior information.
+
+        Returns:
+            bool: True if both 'posterior_ensemble_mean' and 'posterior_ensemble_spread' columns are present, False otherwise.
+        """
+        return "posterior_ensemble_mean".casefold() in map(
+            str.casefold, self.df.columns
+        ) and "posterior_ensemble_spread".casefold() in map(
+            str.casefold, self.df.columns
+        )
+
     def create_header(self, n):
-        """Create a header for the obs_seq file from the
+        """Create a header for the obs_seq file from the ObsSequence object."""
         assert (
             self.n_copies == self.n_non_qc + self.n_qc
         ), "n_copies must be equal to n_non_qc + n_qc"
@@ -1065,7 +1150,7 @@ def load_yaml_to_dict(file_path):
             return yaml.safe_load(file)
     except Exception as e:
         print(f"Error loading YAML file: {e}")
-
+        raise
 
 
 def convert_dart_time(seconds, days):
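The bare `raise` re-raises the caught exception with its original traceback after logging it; previously the handler fell through, so the caller received an implicit None (the removed line is truncated in this view, so the old behavior is inferred). A minimal sketch of the difference:

```python
def load_old(path):
    try:
        raise OSError(f"cannot open {path}")
    except Exception as e:
        print(f"Error loading YAML file: {e}")
        # fell through: caller silently gets None

def load_new(path):
    try:
        raise OSError(f"cannot open {path}")
    except Exception as e:
        print(f"Error loading YAML file: {e}")
        raise  # caller now sees the original OSError

print(load_old("missing.yaml"))  # prints the error, then None
```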
@@ -1093,17 +1178,39 @@ def construct_composit(df_comp, composite, components):
         components (list of str): A list containing the type names of the two components to be combined.
 
     Returns:
-        merged_df (pd.DataFrame):
+        merged_df (pd.DataFrame): A DataFrame containing the new composite rows.
     """
     selected_rows = df_comp[df_comp["type"] == components[0].upper()]
     selected_rows_v = df_comp[df_comp["type"] == components[1].upper()]
 
-
-
+    prior_columns_to_combine = df_comp.filter(regex="prior_ensemble").columns.tolist()
+    posterior_columns_to_combine = df_comp.filter(
+        regex="posterior_ensemble"
+    ).columns.tolist()
+    columns_to_combine = (
+        prior_columns_to_combine
+        + posterior_columns_to_combine
+        + ["observation", "obs_err_var"]
+    )
     merge_columns = ["latitude", "longitude", "vertical", "time"]
-
-
-
+    same_obs_columns = merge_columns + [
+        "observation",
+        "obs_err_var",
+    ]  # same observation is duplicated
+
+    if (
+        selected_rows[same_obs_columns].duplicated().sum() > 0
+        or selected_rows_v[same_obs_columns].duplicated().sum() > 0
+    ):
+        print(
+            f"{selected_rows[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
+        )
+        print(f"{selected_rows[same_obs_columns]}")
+        print(
+            f"{selected_rows_v[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
+        )
+        print(f"{selected_rows_v[same_obs_columns]}")
+        raise Exception("There are duplicates in the components.")
 
     # Merge the two DataFrames on location and time columns
     merged_df = pd.merge(
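The section ends mid-function, but the pairing step is visible: the two component DataFrames are matched on `merge_columns` so that, say, a U-wind and a V-wind row observed at the same place and time become one composite row. A toy sketch of that pairing on invented data; the suffixes and the combination formula are assumptions, only the merge keys come from the diff:

```python
import numpy as np
import pandas as pd

u = pd.DataFrame({"latitude": [40.0], "longitude": [250.0],
                  "vertical": [850.0], "time": [1], "observation": [3.0]})
v = pd.DataFrame({"latitude": [40.0], "longitude": [250.0],
                  "vertical": [850.0], "time": [1], "observation": [4.0]})

merge_columns = ["latitude", "longitude", "vertical", "time"]
merged = pd.merge(u, v, on=merge_columns, suffixes=("_u", "_v"))

# A wind-speed style composite would combine the paired values:
merged["observation"] = np.sqrt(merged["observation_u"] ** 2
                                + merged["observation_v"] ** 2)
print(merged)  # one row with observation == 5.0
```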