pydartdiags 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydartdiags/matplots/matplots.py +200 -20
- pydartdiags/obs_sequence/composite_types.yaml +35 -0
- pydartdiags/obs_sequence/obs_sequence.py +198 -83
- pydartdiags/stats/stats.py +141 -32
- {pydartdiags-0.5.0.dist-info → pydartdiags-0.5.1.dist-info}/METADATA +3 -2
- pydartdiags-0.5.1.dist-info/RECORD +15 -0
- {pydartdiags-0.5.0.dist-info → pydartdiags-0.5.1.dist-info}/WHEEL +1 -1
- pydartdiags-0.5.0.dist-info/RECORD +0 -14
- {pydartdiags-0.5.0.dist-info → pydartdiags-0.5.1.dist-info/licenses}/LICENSE +0 -0
- {pydartdiags-0.5.0.dist-info → pydartdiags-0.5.1.dist-info}/top_level.txt +0 -0
pydartdiags/matplots/matplots.py
CHANGED
@@ -28,35 +28,56 @@ def plot_profile(obs_seq, levels, type, bias=True, rmse=True, totalspread=True):
 
     # calculate stats and add to dataframe
     stats.diag_stats(obs_seq.df)
-    qc0 = obs_seq.
+    qc0 = stats.select_used_qcs(obs_seq.df)  # filter only qc=0, qc=2
 
     # filter by type
    qc0 = qc0[qc0["type"] == type]
-
+    if qc0.empty:
+        print(f"No rows found for type: {type}")
+        return None
+
+    all_df = obs_seq.df[obs_seq.df["type"] == type]  # for possible vs used
+
+    if all_df["vert_unit"].nunique() > 1:
+        print(
+            f"Multiple vertical units found in the data: {all_df['vert_unit'].unique()} for type: {type}"
+        )
+        return None
+
+    vert_unit = all_df.iloc[0]["vert_unit"]
+    if vert_unit == "pressure (Pa)":
+        conversion = 0.01  # from Pa to hPa
+    else:
+        conversion = 1.0  # no conversion needed
 
     # grand statistics
     grand = stats.grand_statistics(qc0)
 
     # add level bins to the dataframe
-    stats.bin_by_layer(all_df, levels)
-    stats.bin_by_layer(qc0, levels)
+    stats.bin_by_layer(all_df, levels, verticalUnit=vert_unit)
+    stats.bin_by_layer(qc0, levels, verticalUnit=vert_unit)
 
     # aggregate by layer
     df_pvu = stats.possible_vs_used_by_layer(all_df)  # possible vs used
     df = stats.layer_statistics(qc0)  # bias, rmse, totalspread for plotting
 
-
+    # using rmse because mean_sqrt vs mean for bias (get a column with 0 obs)
+    if "prior_rmse" not in df.columns:
+        print(f"All layers empty for type: {type}")
+        return None
+
+    fig, ax1 = plt.subplots(figsize=(8, 8))
 
     # convert to hPa HK @todo only for Pressure (Pa)
     df["midpoint"] = df["midpoint"].astype(float)
-    df["midpoint"] = df["midpoint"]
+    df["midpoint"] = df["midpoint"] * conversion
 
     df_pvu["midpoint"] = df_pvu["midpoint"].astype(float)
-    df_pvu["midpoint"] = df_pvu["midpoint"]
+    df_pvu["midpoint"] = df_pvu["midpoint"] * conversion
 
     # Add horizontal stripes alternating between gray and white to represent the vertical levels
-    left = df["vlevels"].apply(lambda x: x.left
-    right = df["vlevels"].apply(lambda x: x.right
+    left = df["vlevels"].apply(lambda x: x.left * conversion)  # todo convert to HPa
+    right = df["vlevels"].apply(lambda x: x.right * conversion)
     for i in range(len(left)):
         color = "gray" if i % 2 == 0 else "white"
         ax1.axhspan(left.iloc[i], right.iloc[i], color=color, alpha=0.3)
@@ -150,33 +171,41 @@ def plot_profile(obs_seq, levels, type, bias=True, rmse=True, totalspread=True):
     )
     ax3.set_xlim(left=0)
 
-
+    if vert_unit == "pressure (Pa)":
+        ax1.invert_yaxis()
     ax1.set_title(type)
-
-
+    # Build the datalabel string
+    datalabel = []
+    if bias:
+        datalabel.append("bias")
+    if rmse:
+        datalabel.append("rmse")
+    if totalspread:
+        datalabel.append("totalspread")
+    ax1.set_xlabel(", ".join(datalabel))
 
     lines1, labels1 = ax1.get_legend_handles_labels()
     ax1.legend(lines1, labels1, loc="upper left", bbox_to_anchor=(1.05, 1))
 
     ax1.text(
-        0.
+        0.6, -0.08, obs_seq.file, ha="center", va="center", transform=ax1.transAxes
     )
 
     # Add a text box with information below the legend
     textstr = "Grand statistics:\n"
     if bias:
-        textstr += f"
+        textstr += f"prior_bias: {bias_prior:.7f}\n"
     if rmse:
-        textstr += f"
+        textstr += f"rmse_prior: {rmse_prior:.7f}\n"
     if totalspread:
-        textstr += f"
+        textstr += f"totalspread_prior: {totalspread_prior:.7f}\n"
     if "posterior_bias" in df.columns:
         if bias:
-            textstr += f"
+            textstr += f"posterior_bias: {bias_posterior:.7f}\n"
         if rmse:
-            textstr += f"
+            textstr += f"rmse_posterior: {rmse_posterior:.7f}\n"
         if totalspread:
-            textstr += f"
+            textstr += f"totalspread_posterior: {totalspread_posterior:.7f}\n"
 
     props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)
     ax1.text(
@@ -189,6 +218,7 @@ def plot_profile(obs_seq, levels, type, bias=True, rmse=True, totalspread=True):
         bbox=props,
     )
 
+    plt.tight_layout()
     plt.show()
 
     return fig
@@ -196,7 +226,7 @@ def plot_profile(obs_seq, levels, type, bias=True, rmse=True, totalspread=True):
 
 def plot_rank_histogram(obs_seq, levels, type, ens_size):
 
-    qc0 = obs_seq.
+    qc0 = stats.select_used_qcs(obs_seq.df)  # filter only qc=0, qc=2
     qc0 = qc0[qc0["type"] == type]  # filter by type
     stats.bin_by_layer(qc0, levels)  # bin by level
 
@@ -241,3 +271,153 @@ def plot_rank_histogram(obs_seq, levels, type, ens_size):
     plt.show()
 
     return fig
+
+
+def plot_evolution(
+    obs_seq,
+    type,
+    time_bin_width,
+    stat,
+    levels=None,
+    tick_interval=2,
+    time_format="%m-%d",
+    plot_pvu=True,
+):
+    """
+    Plot the time evolution of the requested statistics and optionally used vs possible observations.
+
+    Args:
+        obs_seq: The observation sequence object.
+        type (str): The type of observation to filter by.
+        time_bin_width (str): The width of each time bin (e.g., '3600s' for 1 hour).
+        stat (str): The statistic to plot. Default is "prior_rmse".
+        levels (list, optional): The levels to bin by. If None, no binning by level.
+        tick_interval (int): Interval for x-axis ticks (default is 2).
+        time_format (str): Format string for time labels on the x-axis (default is '%m-%d').
+        plot_pvu (bool): Whether to plot possible vs used observations (default is True).
+
+    Returns:
+        fig: The matplotlib figure object.
+    """
+    # Calculate stats and add to dataframe
+    stats.diag_stats(obs_seq.df)
+    qc0 = stats.select_used_qcs(obs_seq.df)  # filter only qc=0, qc=2
+    qc0 = qc0[qc0["type"] == type]  # filter by type
+
+    if qc0.empty:
+        print(f"No data found for type: {type}")
+        return
+
+    all_df = obs_seq.df[obs_seq.df["type"] == type]  # for possible vs used
+
+    if levels:
+        stats.bin_by_layer(qc0, levels)  # bin by level
+        midpoints = qc0["midpoint"].unique()
+
+        for level in sorted(midpoints):
+            df = qc0[qc0["midpoint"] == level]
+
+            # Bin by time
+            stats.bin_by_time(df, time_bin_width)
+
+            # Aggregate by time bin
+            df = stats.time_statistics(df)
+
+            # Calculate possible vs used if enabled
+            df_pvu = None
+            if plot_pvu:
+                stats.bin_by_time(all_df, time_bin_width)
+                df_pvu = stats.possible_vs_used_by_time(all_df)
+
+            # Plot the time evolution of requested stats
+            plot_time_evolution(
+                df, df_pvu, stat, type, level, tick_interval, time_format, plot_pvu
+            )
+    else:
+        # Bin by time
+        stats.bin_by_time(qc0, time_bin_width)
+
+        # Aggregate by time bin
+        df = stats.time_statistics(qc0)
+
+        # Calculate possible vs used if enabled
+        df_pvu = None
+        if plot_pvu:
+            stats.bin_by_time(all_df, time_bin_width)
+            df_pvu = stats.possible_vs_used_by_time(all_df)
+
+        # Plot the time evolution of requested stats
+        return plot_time_evolution(
+            df, df_pvu, stat, type, None, tick_interval, time_format, plot_pvu
+        )
+
+
+def plot_time_evolution(
+    df, df_pvu, stat, type, level, tick_interval, time_format, plot_pvu
+):
+    """
+    Plot the time evolution of the requested statistics and optionally used vs possible observations.
+
+    Args:
+        df (pd.DataFrame): The aggregated DataFrame for statistics.
+        df_pvu (pd.DataFrame): The DataFrame for possible vs used observations (if plot_pvu is True).
+        stat (str): The statistic to plot.
+        type (str): The type of observation.
+        level (float or None): The vertical level (if applicable).
+        tick_interval (int): Interval for x-axis ticks (default is 2).
+        time_format (str): Format string for time labels on the x-axis.
+        plot_pvu (bool): Whether to plot possible vs used observations (default is True).
+
+    Returns:
+        fig: The matplotlib figure object.
+    """
+    fig, ax1 = plt.subplots()
+
+    # Plot prior and posterior statistics
+    if f"prior_{stat}" in df.columns:
+        ax1.plot(df["time_bin_midpoint"], df[f"prior_{stat}"], label=f"prior {stat}")
+    if f"posterior_{stat}" in df.columns:
+        ax1.plot(
+            df["time_bin_midpoint"], df[f"posterior_{stat}"], label=f"posterior {stat}"
+        )
+
+    # Set x-axis ticks every 'tick_interval' values
+    tick_positions = df["time_bin_midpoint"][::tick_interval]
+    ax1.set_xticks(tick_positions)
+    ax1.set_xticklabels(
+        tick_positions.dt.strftime(time_format), rotation=45, ha="right"
+    )
+
+    # Add a secondary y-axis for possible vs used observations if enabled
+    if plot_pvu and df_pvu is not None:
+        ax2 = ax1.twinx()
+        ax2.set_ylabel("# obs (o=possible; +=assimilated)", color="red")
+        ax2.tick_params(axis="y", colors="red")
+
+        # Plot possible and used observations
+        ax2.plot(
+            df_pvu["time_bin_midpoint"],
+            df_pvu["possible"],
+            color="red",
+            marker="o",
+            linestyle="",
+            markerfacecolor="none",
+        )
+        ax2.plot(
+            df_pvu["time_bin_midpoint"],
+            df_pvu["used"],
+            color="red",
+            marker="+",
+            linestyle="",
+        )
+        ax2.set_ylim(bottom=0)
+
+    ax1.legend(loc="upper right")
+    title = f"{type}" if level is None else f"{type} at level {level}"
+    ax1.set_title(title)
+    ax1.set_xlabel("Time")
+    ax1.set_ylabel(stat)
+
+    plt.tight_layout()
+
+    return fig
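A brief usage sketch of the two plotting entry points touched in this file. The imports follow the package layout listed in RECORD; the input file, observation types, levels, and bin width are hypothetical, not taken from the diff.

from pydartdiags.obs_sequence import obs_sequence as obsq
from pydartdiags.matplots import matplots

obs_seq = obsq.obs_sequence("obs_seq.final")            # hypothetical input file

# Profile plot: levels in Pa to match vert_unit "pressure (Pa)"; bin midpoints
# are now converted to hPa internally (conversion = 0.01).
levels = [0.0, 10000.0, 30000.0, 50000.0, 70000.0, 85000.0, 100000.0]
fig1 = matplots.plot_profile(obs_seq, levels, "ACARS_TEMPERATURE")

# Time-evolution plot added in 0.5.1: hourly bins, prior/posterior RMSE plus
# possible-vs-used counts on a secondary axis.
fig2 = matplots.plot_evolution(
    obs_seq,
    type="ACARS_TEMPERATURE",
    time_bin_width="3600s",
    stat="rmse",
)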
pydartdiags/obs_sequence/composite_types.yaml
ADDED

@@ -0,0 +1,35 @@
+acars_horizontal_wind:
+  description: ACARS-derived Horizontal wind speed
+  components:
+    - acars_u_wind_component
+    - acars_v_wind_component
+
+sat_horizontal_wind:
+  description: Satellite-derived horizontal wind speed
+  components:
+    - sat_u_wind_component
+    - sat_v_wind_component
+
+radiosonde_horizontal_wind:
+  description: Radiosonde-derived horizontal wind speed
+  components:
+    - radiosonde_u_wind_component
+    - radiosonde_v_wind_component
+
+aircraft_horizontal_wind:
+  description: Aircraft-derived horizontal wind speed
+  components:
+    - aircraft_u_wind_component
+    - aircraft_v_wind_component
+
+10_m_horizontal_wind:
+  description: 10 meter horizontal wind speed
+  components:
+    - 10m_u_wind_component
+    - 10m_v_wind_component
+
+marine_sfc_horizontal_wind:
+  description: Marine surface horizontal wind speed
+  components:
+    - marine_sfc_u_wind_component
+    - marine_sfc_v_wind_component
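This bundled YAML is what obs_sequence.py now points at by default. A minimal sketch, assuming the constructor accepts file=None (the documented early-exit path) and that default_composite_types is set before that early exit, as the diff below shows for 0.5.1; everything else here is illustrative.

import yaml
from pydartdiags.obs_sequence import obs_sequence as obsq

# Create an empty sequence just to resolve the path of the shipped YAML file.
empty_seq = obsq.obs_sequence(None)
with open(empty_seq.default_composite_types) as f:
    composites = yaml.safe_load(f)

print(composites["sat_horizontal_wind"]["components"])
# expected: ['sat_u_wind_component', 'sat_v_wind_component']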
pydartdiags/obs_sequence/obs_sequence.py
CHANGED

@@ -9,7 +9,7 @@ import struct
 
 def requires_assimilation_info(func):
     def wrapper(self, *args, **kwargs):
-        if self.has_assimilation_info:
+        if self.has_assimilation_info():
             return func(self, *args, **kwargs)
         else:
             raise ValueError(
@@ -19,16 +19,6 @@ def requires_assimilation_info(func):
     return wrapper
 
 
-def requires_posterior_info(func):
-    def wrapper(self, *args, **kwargs):
-        if self.has_posterior:
-            return func(self, *args, **kwargs)
-        else:
-            raise ValueError("Posterior information is required to call this function.")
-
-    return wrapper
-
-
 class obs_sequence:
     """
     Initialize an obs_sequence object from an ASCII or binary observation sequence file,
@@ -69,7 +59,7 @@ class obs_sequence:
         reverse_types (dict): Dictionary of types with keys and values reversed, e.g
             {'ACARS_TEMPERATURE': 23}
         synonyms_for_obs (list): List of synonyms for the observation column in the DataFrame.
-            The
+            The default list is
 
         .. code-block:: python
 
@@ -87,8 +77,6 @@ class obs_sequence:
 
            obs_sequence(file, synonyms=['synonym1', 'synonym2']).df
 
-        has_assimilation_info (bool): Indicates if assimilation information is present.
-        has_posterior (bool): Indicates if posterior information is present.
         seq (generator): Generator of observations from the observation sequence file.
         all_obs (list): List of all observations, each observation is a list.
             Valid when the obs_sequence is created from a file.
@@ -119,6 +107,8 @@ class obs_sequence:
 
     Returns:
         an obs_sequence object
+        1D observations are given a datetime of days, seconds since 2000-01-01 00:00:00
+        3D observations are given a datetime of days, seconds since 1601-01-01 00:00:00 (DART Gregorian calendar)
 
     Examples:
 
@@ -129,8 +119,6 @@ class obs_sequence:
         """
 
         self.loc_mod = "None"
-        self.has_assimilation_info = False
-        self.has_posterior = False
         self.file = file
         self.synonyms_for_obs = [
             "NCEP BUFR observation",
@@ -146,6 +134,9 @@ class obs_sequence:
         else:
             self.synonyms_for_obs.append(synonyms)
 
+        module_dir = os.path.dirname(__file__)
+        self.default_composite_types = os.path.join(module_dir, "composite_types.yaml")
+
         if file is None:
             # Early exit - for testing purposes or creating obs_seq objects from scratch
             self.df = pd.DataFrame()
@@ -161,9 +152,6 @@ class obs_sequence:
             self.all_obs = []
             return
 
-        module_dir = os.path.dirname(__file__)
-        self.default_composite_types = os.path.join(module_dir, "composite_types.yaml")
-
         if self.is_binary(file):
             self.header = self.read_binary_header(file)
         else:
@@ -204,12 +192,6 @@ class obs_sequence:
         }
         self.df = self.df.rename(columns=rename_dict)
 
-        # check if the assimilation info is present
-        if "prior_ensemble_mean".casefold() in map(str.casefold, self.columns):
-            self.has_assimilation_info = True
-        if "posterior_ensemble_mean".casefold() in map(str.casefold, self.columns):
-            self.has_posterior = True
-
     def create_all_obs(self):
         """steps through the generator to create a
         list of all observations in the sequence
@@ -261,9 +243,13 @@ class obs_sequence:
         time = obs[-2].split()
         data.append(int(time[0]))  # seconds
         data.append(int(time[1]))  # days
-
-        convert_dart_time(int(time[0]), int(time[1]))
-
+        if self.loc_mod == "loc3d":
+            data.append(convert_dart_time(int(time[0]), int(time[1])))
+        else:  # HK todo what is appropriate for 1d models?
+            data.append(
+                dt.datetime(2000, 1, 1)
+                + dt.timedelta(seconds=int(time[0]), days=int(time[1]))
+            )
         data.append(float(obs[-1]))  # obs error variance ?convert to sd?
 
         return data
@@ -355,20 +341,13 @@ class obs_sequence:
            obsq.write_obs_seq('obs_seq.new')
 
         """
-        with open(file, "w") as f:
 
-
-
-
-            new_header = [
-                replacement_string if "num_obs" in element else element
-                for element in self.header
-            ]
+        self.create_header_from_dataframe()
+
+        with open(file, "w") as f:
 
-            for line in
+            for line in self.header:
                 f.write(str(line) + "\n")
-            first = 1
-            f.write(f"first: {first:>12} last: {num_rows:>12}\n")
 
         # TODO HK is there something better than copying the whole thing here?
         df_copy = self.df.copy()  # copy since you want to change for writing.
@@ -376,14 +355,23 @@ class obs_sequence:
         if self.loc_mod == "loc3d":
             df_copy["longitude"] = np.deg2rad(self.df["longitude"]).round(16)
             df_copy["latitude"] = np.deg2rad(self.df["latitude"]).round(16)
-        if "
-            df_copy = df_copy.drop(
+        if "prior_bias" in df_copy.columns:
+            df_copy = df_copy.drop(
+                columns=["prior_bias", "prior_sq_err", "prior_totalvar"]
+            )
+        if "posterior_bias" in df_copy.columns:
+            df_copy = df_copy.drop(
+                columns=["posterior_bias", "posterior_sq_err", "posterior_totalvar"]
+            )
+        if "midpoint" in df_copy.columns:
+            df_copy = df_copy.drop(columns=["midpoint", "vlevels"])
 
         # linked list for reading by dart programs
         df_copy = df_copy.sort_values(
             by=["time"], kind="stable"
         )  # sort the DataFrame by time
-        df_copy
+        df_copy.reset_index(drop=True, inplace=True)
+        df_copy["obs_num"] = df_copy.index + 1  # obs_num in time order
         df_copy["linked_list"] = obs_sequence.generate_linked_list_pattern(
             len(df_copy)
         )  # linked list pattern
@@ -395,6 +383,97 @@ class obs_sequence:
 
         df_copy.apply(write_row, axis=1)
 
+    @staticmethod
+    def update_types_dicts(df, reverse_types):
+        """
+        Ensure all unique observation types are in the reverse_types dictionary and create
+        the types dictionary.
+
+        Args:
+            df (pd.DataFrame): The DataFrame containing the observation sequence data.
+            reverse_types (dict): The dictionary mapping observation types to their corresponding integer values.
+
+        Returns:
+            dict: The updated reverse_types dictionary.
+            dict: The types dictionary with keys sorted in numerical order.
+        """
+        # Create a dictionary of observation types from the dataframe
+        unique_types = df["type"].unique()
+
+        # Ensure all unique types are in reverse_types
+        for obs_type in unique_types:
+            if obs_type not in reverse_types:
+                new_id = int(max(reverse_types.values(), default=0)) + 1
+                reverse_types[obs_type] = str(new_id)
+
+        not_sorted_types = {
+            reverse_types[obs_type]: obs_type for obs_type in unique_types
+        }
+        types = {
+            k: not_sorted_types[k] for k in sorted(not_sorted_types)
+        }  # to get keys in numerical order
+
+        return reverse_types, types
+
+    def create_header_from_dataframe(self):
+        """
+        Create a header for the observation sequence based on the data in the DataFrame.
+
+        It creates a dictionary of unique observation types, counts the
+        number of observations, and constructs the header with necessary information.
+
+        Example:
+            self.create_header_from_dataframe()
+
+        """
+
+        self.reverse_types, self.types = self.update_types_dicts(
+            self.df, self.reverse_types
+        )
+
+        num_obs = len(self.df)
+
+        self.header = []
+        self.header.append("obs_sequence")
+        self.header.append("obs_type_definitions")
+        self.header.append(f"{len(self.types)}")
+        for key, value in self.types.items():
+            self.header.append(f"{key} {value}")
+        self.header.append(
+            f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}"
+        )  # @todo HK not keeping track if num_qc changes
+        self.header.append(f"num_obs: {num_obs:>10} max_num_obs: {num_obs:>10}")
+        stats_cols = [
+            "prior_bias",
+            "prior_sq_err",
+            "prior_totalvar",
+            "posterior_bias",
+            "posterior_sq_err",
+            "posterior_totalvar",
+        ]
+        level_cols = ["vlevels", "midpoint"]
+        non_copie_cols = [
+            "obs_num",
+            "linked_list",
+            "longitude",
+            "latitude",
+            "vertical",
+            "vert_unit",
+            "type",
+            "metadata",
+            "external_FO",
+            "seconds",
+            "days",
+            "time",
+            "obs_err_var",
+            "location",
+        ]
+        for copie in self.df.columns:
+            if copie not in stats_cols + non_copie_cols + level_cols:
+                self.header.append(copie.replace("_", " "))
+        first = 1
+        self.header.append(f"first: {first:>12} last: {num_obs:>12}")
+
     def column_headers(self):
         """define the columns for the dataframe"""
         heading = []
@@ -440,14 +519,18 @@ class obs_sequence:
         return self.df[self.df["DART_quality_control"] == dart_qc]
 
     @requires_assimilation_info
-    def
+    def select_used_qcs(self):
         """
-        Select rows from the DataFrame where the
+        Select rows from the DataFrame where the observation was used.
+        Includes observations for which the posterior forward observation operators failed.
 
         Returns:
-            pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag
+            pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag 0 or 2.
         """
-        return self.df[
+        return self.df[
+            (self.df["DART_quality_control"] == 0)
+            | (self.df["DART_quality_control"] == 2)
+        ]
 
     @requires_assimilation_info
     def possible_vs_used(self):
@@ -456,7 +539,7 @@ class obs_sequence:
 
         This function takes a DataFrame containing observation data, including a 'type' column for the observation
         type and an 'observation' column. The number of used observations ('used'), is the total number
-
+        of assimilated observations (as determined by the `select_used_qcs` function).
         The result is a DataFrame with each observation type, the count of possible observations, and the count of
         used observations.
 
@@ -468,8 +551,8 @@ class obs_sequence:
         possible = self.df.groupby("type")["observation"].count()
         possible.rename("possible", inplace=True)
 
-
-        used =
+        used_qcs = self.select_used_qcs().groupby("type")["observation"].count()
+        used = used_qcs.reindex(possible.index, fill_value=0)
         used.rename("used", inplace=True)
 
         return pd.concat([possible, used], axis=1).reset_index()
@@ -816,7 +899,8 @@ class obs_sequence:
         components and adds them to the DataFrame.
 
         Args:
-            composite_types (str, optional): The YAML configuration for composite types.
+            composite_types (str, optional): The YAML configuration for composite types.
+                If 'use_default', the default configuration is used. Otherwise, a custom YAML configuration can be provided.
 
         Returns:
             pd.DataFrame: The updated DataFrame with the new composite rows added.
@@ -838,24 +922,23 @@ class obs_sequence:
         if len(components) != len(set(components)):
             raise Exception("There are repeat values in components.")
 
+        # data frame for the composite types
         df_comp = self.df[
             self.df["type"]
             .str.upper()
            .isin([component.upper() for component in components])
         ]
-        df_no_comp = self.df[
-            ~self.df["type"]
-            .str.upper()
-            .isin([component.upper() for component in components])
-        ]
 
+        df = pd.DataFrame()
         for key in self.composite_types_dict:
             df_new = construct_composit(
                 df_comp, key, self.composite_types_dict[key]["components"]
             )
-
+            df = pd.concat([df, df_new], axis=0)
 
-
+        # add the composite types to the DataFrame
+        self.df = pd.concat([self.df, df], axis=0)
+        return
 
     @classmethod
     def join(cls, obs_sequences, copies=None):
@@ -889,18 +972,18 @@ class obs_sequence:
 
         # Check if all obs_sequences have compatible attributes
         first_loc_mod = obs_sequences[0].loc_mod
-        first_has_assimilation_info = obs_sequences[0].has_assimilation_info
-        first_has_posterior = obs_sequences[0].has_posterior
+        first_has_assimilation_info = obs_sequences[0].has_assimilation_info()
+        first_has_posterior = obs_sequences[0].has_posterior()
         for obs_seq in obs_sequences:
             if obs_seq.loc_mod != first_loc_mod:
                 raise ValueError(
                     "All observation sequences must have the same loc_mod."
                 )
-            if obs_seq.has_assimilation_info != first_has_assimilation_info:
+            if obs_seq.has_assimilation_info() != first_has_assimilation_info:
                 raise ValueError(
                     "All observation sequences must have assimilation info."
                 )
-            if obs_seq.has_posterior != first_has_posterior:
+            if obs_seq.has_posterior() != first_has_posterior:
                 raise ValueError(
                     "All observation sequences must have the posterior info."
                 )
@@ -908,7 +991,7 @@ class obs_sequence:
         combo.loc_mod = first_loc_mod
 
         # check the copies are compatible (list of copies to combine?)
-        # subset of copies if needed
+        # subset of copies if needed  # @todo HK 1d or 3d
         if copies:
             start_required_columns = ["obs_num", "observation"]
             end_required_columns = [
@@ -1015,22 +1098,32 @@ class obs_sequence:
         combo.df["obs_num"] = combined_df.index + 1
         combo.create_header(len(combo.df))
 
-        # set assimilation info (mean and spread) (prior and posterior)
-        combo.has_assimilation_info = "prior_ensemble_mean".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-        combo.has_assimilation_info = "prior_ensemble_spread".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-        combo.has_posterior = "posterior_ensemble_mean".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-        combo.has_posterior = "posterior_ensemble_spread".casefold() in map(
-            str.casefold, combo.df.columns
-        )
-
         return combo
 
+    def has_assimilation_info(self):
+        """
+        Check if the DataFrame has prior information.
+
+        Returns:
+            bool: True if both 'prior_ensemble_mean' and 'prior_ensemble_spread' columns are present, False otherwise.
+        """
+        return "prior_ensemble_mean".casefold() in map(
+            str.casefold, self.df.columns
+        ) and "prior_ensemble_spread".casefold() in map(str.casefold, self.df.columns)
+
+    def has_posterior(self):
+        """
+        Check if the DataFrame has posterior information.
+
+        Returns:
+            bool: True if both 'posterior_ensemble_mean' and 'posterior_ensemble_spread' columns are present, False otherwise.
+        """
+        return "posterior_ensemble_mean".casefold() in map(
+            str.casefold, self.df.columns
+        ) and "posterior_ensemble_spread".casefold() in map(
+            str.casefold, self.df.columns
+        )
+
     def create_header(self, n):
         """Create a header for the obs_seq file from the obs_sequence object."""
         assert (
@@ -1065,7 +1158,7 @@ def load_yaml_to_dict(file_path):
             return yaml.safe_load(file)
     except Exception as e:
         print(f"Error loading YAML file: {e}")
-
+        raise
 
 
 def convert_dart_time(seconds, days):
@@ -1093,17 +1186,39 @@ def construct_composit(df_comp, composite, components):
         components (list of str): A list containing the type names of the two components to be combined.
 
     Returns:
-        merged_df (pd.DataFrame):
+        merged_df (pd.DataFrame): A DataFrame containing the new composite rows.
     """
     selected_rows = df_comp[df_comp["type"] == components[0].upper()]
     selected_rows_v = df_comp[df_comp["type"] == components[1].upper()]
 
-
-
+    prior_columns_to_combine = df_comp.filter(regex="prior_ensemble").columns.tolist()
+    posterior_columns_to_combine = df_comp.filter(
+        regex="posterior_ensemble"
+    ).columns.tolist()
+    columns_to_combine = (
+        prior_columns_to_combine
+        + posterior_columns_to_combine
+        + ["observation", "obs_err_var"]
+    )
     merge_columns = ["latitude", "longitude", "vertical", "time"]
-
-
-
+    same_obs_columns = merge_columns + [
+        "observation",
+        "obs_err_var",
+    ]  # same observation is duplicated
+
+    if (
+        selected_rows[same_obs_columns].duplicated().sum() > 0
+        or selected_rows_v[same_obs_columns].duplicated().sum() > 0
+    ):
+        print(
+            f"{selected_rows[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
+        )
+        print(f"{selected_rows[same_obs_columns]}")
+        print(
+            f"{selected_rows_v[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
+        )
+        print(f"{selected_rows_v[same_obs_columns]}")
+        raise Exception("There are duplicates in the components.")
 
     # Merge the two DataFrames on location and time columns
     merged_df = pd.merge(
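Since has_assimilation_info and has_posterior change from booleans set in __init__ to methods derived from the DataFrame columns, callers need a small adjustment. A hedged migration sketch; the input file name is hypothetical.

from pydartdiags.obs_sequence import obs_sequence as obsq

obs_seq = obsq.obs_sequence("obs_seq.final")   # hypothetical input file

# 0.5.0 style (attribute, no longer present): if obs_seq.has_assimilation_info: ...
# 0.5.1 style (methods, must be called):
if obs_seq.has_assimilation_info():            # prior mean and spread columns present
    used = obs_seq.select_used_qcs()           # rows with DART quality control 0 or 2
if obs_seq.has_posterior():                    # posterior mean and spread columns present
    print("posterior copies available")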
pydartdiags/stats/stats.py
CHANGED
@@ -2,6 +2,7 @@
 import pandas as pd
 import numpy as np
 from functools import wraps
+from datetime import datetime, timedelta
 
 # from pydartdiags.obs_sequence import obs_sequence as obsq
 
@@ -39,20 +40,30 @@ def apply_to_phases_by_type_return_df(func):
             result = func(df, phase, *args, **kwargs)
             results.append(result)
 
-        if
-
-
-
-
-
-
-
+        if not results:
+            return (
+                pd.DataFrame()
+            )  # Return an empty DataFrame if no results are generated
+
+        # Dynamically determine merge keys based on common columns
+        common_columns = set(results[0].columns)
+        for result in results[1:]:
+            common_columns &= set(result.columns)
+
+        # Exclude phase-specific columns from the merge keys
+        phase_specific_columns = {
+            f"{phase}_sq_err",
+            f"{phase}_bias",
+            f"{phase}_totalvar",
+            f"{phase}_rmse",
+            f"{phase}_totalspread",
+        }
+        merge_keys = list(common_columns - phase_specific_columns)
+
+        if len(results) == 2:
+            return pd.merge(results[0], results[1], on=merge_keys)
         else:
-
-            return pd.merge(results[0], results[1], on="type")
-        else:
-            return results[0]
+            return results[0]
 
     return wrapper
 
@@ -211,15 +222,41 @@ def bin_by_layer(df, levels, verticalUnit="pressure (Pa)"):
     df.loc[df["vert_unit"] == verticalUnit, "vlevels"] = pd.cut(
         df.loc[df["vert_unit"] == verticalUnit, "vertical"], levels
     )
-
-
-
-
-
-
-
-
-
+    df.loc[:, "midpoint"] = df["vlevels"].apply(lambda x: x.mid)
+
+
+def bin_by_time(df, time_value):
+    """
+    Bin observations by time and add 'time_bin' and 'time_bin_midpoint' columns to the DataFrame.
+    The first bin starts 1 second before the minimum time value, so the minimum time is included in the
+    first bin. The last bin is inclusive of the maximum time value.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame containing a 'time' column.
+        time_value (str): The width of each time bin (e.g., '3600S' for 1 hour).
+
+    Returns:
+        None: The function modifies the DataFrame in place by adding 'time_bin' and 'time_bin_midpoint' columns.
+    """
+    # Create time bins
+    start = df["time"].min() - timedelta(seconds=1)
+    end = df["time"].max()
+    # Determine if the end time aligns with the bin boundary
+    time_delta = pd.Timedelta(time_value)
+    aligned_end = (pd.Timestamp(end) + time_delta).floor(time_value)
+
+    time_bins = pd.date_range(
+        start=start,
+        end=aligned_end,
+        freq=time_value,
+    )
+
+    df["time_bin"] = pd.cut(df["time"], bins=time_bins)
+
+    # Calculate the midpoint of each time bin
+    df["time_bin_midpoint"] = df["time_bin"].apply(
+        lambda x: x.left + (x.right - x.left) / 2 if pd.notnull(x) else None
+    )
 
 
 @apply_to_phases_by_type_return_df
@@ -270,13 +307,48 @@ def layer_statistics(df, phase):
     return layer_stats
 
 
+@apply_to_phases_by_type_return_df
+def time_statistics(df, phase):
+    """
+    Calculate time-based statistics for a given phase and return a new DataFrame.
+
+    Args:
+        df (pandas.DataFrame): The input DataFrame containing observation data and ensemble statistics.
+        phase (str): The phase for which to calculate the statistics ('prior' or 'posterior').
+
+    Returns:
+        pandas.DataFrame: A DataFrame containing time-based statistics for the specified phase.
+    """
+    # Assuming diag_stats has been called
+    time_stats = (
+        df.groupby(["time_bin_midpoint", "type"], observed=False)
+        .agg(
+            {
+                f"{phase}_sq_err": mean_then_sqrt,
+                f"{phase}_bias": "mean",
+                f"{phase}_totalvar": mean_then_sqrt,
+                "time_bin": "first",
+                "time": "first",
+            }
+        )
+        .reset_index()
+    )
+
+    time_stats.rename(columns={f"{phase}_sq_err": f"{phase}_rmse"}, inplace=True)
+    time_stats.rename(
+        columns={f"{phase}_totalvar": f"{phase}_totalspread"}, inplace=True
+    )
+
+    return time_stats
+
+
 def possible_vs_used(df):
     """
     Calculates the count of possible vs. used observations by type.
 
     This function takes a DataFrame containing observation data, including a 'type' column for the observation
     type and an 'observation' column. The number of used observations ('used'), is the total number
-
+    of assimilated observations (as determined by the `select_used_qcs` function).
     The result is a DataFrame with each observation type, the count of possible observations, and the count of
     used observations.
 
@@ -288,8 +360,8 @@ def possible_vs_used(df):
     possible = df.groupby("type")["observation"].count()
     possible.rename("possible", inplace=True)
 
-
-    used =
+    used_qcs = select_used_qcs(df).groupby("type")["observation"].count()
+    used = used_qcs.reindex(possible.index, fill_value=0)
     used.rename("used", inplace=True)
 
     return pd.concat([possible, used], axis=1).reset_index()
@@ -302,22 +374,59 @@ def possible_vs_used_by_layer(df):
     possible = df.groupby(["type", "midpoint"], observed=False)["type"].count()
     possible.rename("possible", inplace=True)
 
-
-
+    used_qcs = (
+        select_used_qcs(df)
         .groupby(["type", "midpoint"], observed=False)["type"]
         .count()
     )
-
+
+    used = used_qcs.reindex(possible.index, fill_value=0)
     used.rename("used", inplace=True)
 
     return pd.concat([possible, used], axis=1).reset_index()
 
 
-def
+def select_used_qcs(df):
     """
-    Select rows from the DataFrame where the
+    Select rows from the DataFrame where the observation was used.
+    Includes observations for which the posterior forward observation operators failed.
 
     Returns:
-        pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag
+        pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag 0 or 2.
+    """
+    return df[(df["DART_quality_control"] == 0) | (df["DART_quality_control"] == 2)]
+
+
+def possible_vs_used_by_time(df):
     """
-
+    Calculates the count of possible vs. used observations by type and time bin.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame containing observation data. The DataFrame must include:
+            - 'type': The observation type.
+            - 'time_bin_midpoint': The midpoint of the time bin.
+            - 'observation': The observation values.
+            - 'DART_quality_control': The quality control flag.
+
+    Returns:
+        pd.DataFrame: A DataFrame with the following columns:
+            - 'time_bin_midpoint': The midpoint of the time bin.
+            - 'type': The observation type.
+            - 'possible': The count of all observations in the time bin.
+            - 'used': The count of observations in the time bin that passed quality control checks.
+    """
+    # Count all observations (possible) grouped by time_bin_midpoint and type
+    possible = df.groupby(["time_bin_midpoint", "type"], observed=False)["type"].count()
+    possible.rename("possible", inplace=True)
+
+    # Count used observations (QC=0 or QC=2) grouped by time_bin_midpoint and type
+    used_qcs = (
+        select_used_qcs(df)
+        .groupby(["time_bin_midpoint", "type"], observed=False)["type"]
+        .count()
+    )
+    used = used_qcs.reindex(possible.index, fill_value=0)
+    used.rename("used", inplace=True)
+
+    # Combine possible and used into a single DataFrame
+    return pd.concat([possible, used], axis=1).reset_index()
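The new time-binning helpers compose the same way plot_evolution uses them. A minimal sketch, assuming a sequence that carries assimilation info and a DART_quality_control copy; the input file name and bin width are hypothetical.

from pydartdiags.obs_sequence import obs_sequence as obsq
from pydartdiags.stats import stats

obs_seq = obsq.obs_sequence("obs_seq.final")       # hypothetical input file
stats.diag_stats(obs_seq.df)                       # adds prior/posterior bias, sq_err, totalvar columns
qc0 = stats.select_used_qcs(obs_seq.df)            # keep DART QC 0 and 2 only

stats.bin_by_time(qc0, "3600s")                    # adds time_bin and time_bin_midpoint columns
by_time = stats.time_statistics(qc0)               # per-bin prior_rmse, prior_bias, prior_totalspread

stats.bin_by_time(obs_seq.df, "3600s")             # bin the full frame for possible-vs-used counts
pvu = stats.possible_vs_used_by_time(obs_seq.df)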
{pydartdiags-0.5.0.dist-info → pydartdiags-0.5.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: pydartdiags
-Version: 0.5.
+Version: 0.5.1
 Summary: Observation Sequence Diagnostics for DART
 Home-page: https://github.com/NCAR/pyDARTdiags.git
 Author: Helen Kershaw
@@ -21,6 +21,7 @@ Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: matplotlib>=3.9.4
 Dynamic: author
 Dynamic: home-page
+Dynamic: license-file
 Dynamic: requires-python
 
 [](https://opensource.org/licenses/Apache-2.0)
pydartdiags-0.5.1.dist-info/RECORD
ADDED

@@ -0,0 +1,15 @@
+pydartdiags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/matplots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/matplots/matplots.py,sha256=Bo0TTz1gvsHEvTfTfLfdTi_3hNRN1okmyY5a5yYgtzk,13455
+pydartdiags/obs_sequence/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/obs_sequence/composite_types.yaml,sha256=PVLMU6x6KcVMCwPB-U65C_e0YQUemfqUhYMpf1DhFOY,917
+pydartdiags/obs_sequence/obs_sequence.py,sha256=8RGUzfWxSlGtPx_uz5lhLJaUaG8ju6qmiIU7da43nwk,48444
+pydartdiags/plots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/plots/plots.py,sha256=U7WQjE_qN-5a8-85D-PkkgILSFBzTJQ1mcGBa7l5DHI,6464
+pydartdiags/stats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/stats/stats.py,sha256=HbRj3toQRx63mX1a1FXHA5_7yGITz8JKHbhjMoAHChk,16163
+pydartdiags-0.5.1.dist-info/licenses/LICENSE,sha256=ROglds_Eg_ylXp-1MHmEawDqMw_UsCB4r9sk7z9PU9M,11377
+pydartdiags-0.5.1.dist-info/METADATA,sha256=Fn3KsjQZma-696rO-yGpAHrHqV2izTNpVmBnYPx9z6k,2413
+pydartdiags-0.5.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+pydartdiags-0.5.1.dist-info/top_level.txt,sha256=LfMoPLnSd0VhhlWev1eeX9t6AzvyASOloag0LO_ppWg,12
+pydartdiags-0.5.1.dist-info/RECORD,,
pydartdiags-0.5.0.dist-info/RECORD
REMOVED

@@ -1,14 +0,0 @@
-pydartdiags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/matplots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/matplots/matplots.py,sha256=44MlD98gaQsrCT0mW6M9f0a2-clm3KEGrdYqkTUO0RI,7478
-pydartdiags/obs_sequence/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/obs_sequence/obs_sequence.py,sha256=kdPOWAqgiyuv6cTdhYx1u9Ru6zCKF0Wd--7-sM3m5F8,44527
-pydartdiags/plots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/plots/plots.py,sha256=U7WQjE_qN-5a8-85D-PkkgILSFBzTJQ1mcGBa7l5DHI,6464
-pydartdiags/stats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/stats/stats.py,sha256=tzjE6HBrw6s9Li0UlJ_sNMcGEU8loT_BA5SDZp-UTOc,12138
-pydartdiags-0.5.0.dist-info/LICENSE,sha256=ROglds_Eg_ylXp-1MHmEawDqMw_UsCB4r9sk7z9PU9M,11377
-pydartdiags-0.5.0.dist-info/METADATA,sha256=F6znTR7qrj2qoGBYNojmWiaOqa9EAETgphV7i0HW0xc,2391
-pydartdiags-0.5.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-pydartdiags-0.5.0.dist-info/top_level.txt,sha256=LfMoPLnSd0VhhlWev1eeX9t6AzvyASOloag0LO_ppWg,12
-pydartdiags-0.5.0.dist-info/RECORD,,
{pydartdiags-0.5.0.dist-info → pydartdiags-0.5.1.dist-info/licenses}/LICENSE
File without changes

{pydartdiags-0.5.0.dist-info → pydartdiags-0.5.1.dist-info}/top_level.txt
File without changes