pydartdiags 0.0.43__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pydartdiags might be problematic. Click here for more details.
- pydartdiags/matplots/__init__.py +0 -0
- pydartdiags/matplots/matplots.py +423 -0
- pydartdiags/obs_sequence/composite_types.yaml +35 -0
- pydartdiags/obs_sequence/obs_sequence.py +756 -343
- pydartdiags/plots/plots.py +80 -228
- pydartdiags/stats/__init__.py +0 -0
- pydartdiags/stats/stats.py +432 -0
- {pydartdiags-0.0.43.dist-info → pydartdiags-0.5.1.dist-info}/METADATA +10 -5
- pydartdiags-0.5.1.dist-info/RECORD +15 -0
- {pydartdiags-0.0.43.dist-info → pydartdiags-0.5.1.dist-info}/WHEEL +1 -1
- pydartdiags-0.0.43.dist-info/RECORD +0 -10
- {pydartdiags-0.0.43.dist-info → pydartdiags-0.5.1.dist-info/licenses}/LICENSE +0 -0
- {pydartdiags-0.0.43.dist-info → pydartdiags-0.5.1.dist-info}/top_level.txt +0 -0
pydartdiags/plots/plots.py
CHANGED
|
@@ -1,144 +1,52 @@
|
|
|
1
|
-
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
2
|
import numpy as np
|
|
3
3
|
import plotly.express as px
|
|
4
4
|
import plotly.graph_objects as go
|
|
5
5
|
import pandas as pd
|
|
6
|
+
from pydartdiags.stats import stats
|
|
7
|
+
|
|
6
8
|
|
|
7
|
-
def plot_rank_histogram(df):
|
|
9
|
+
def plot_rank_histogram(df, phase, ens_size):
|
|
8
10
|
"""
|
|
9
11
|
Plots a rank histogram colored by observation type.
|
|
10
12
|
|
|
11
|
-
All histogram bars are
|
|
13
|
+
All histogram bars are initialized to be hidden and can be toggled visible in the plot's legend
|
|
12
14
|
"""
|
|
13
|
-
|
|
14
|
-
|
|
15
|
+
fig = px.histogram(
|
|
16
|
+
df,
|
|
17
|
+
x=f"{phase}_rank",
|
|
18
|
+
color="type",
|
|
19
|
+
title="Histogram Colored by obs type",
|
|
20
|
+
nbins=ens_size,
|
|
21
|
+
)
|
|
22
|
+
fig.update_xaxes(range=[1, ens_size + 1])
|
|
15
23
|
for trace in fig.data:
|
|
16
|
-
trace.visible =
|
|
24
|
+
trace.visible = "legendonly"
|
|
17
25
|
fig.show()
|
|
18
26
|
|
|
19
27
|
|
|
20
|
-
def
|
|
21
|
-
"""
|
|
22
|
-
Calculate the rank of observations within an ensemble.
|
|
23
|
-
|
|
24
|
-
This function takes a DataFrame containing ensemble predictions and observed values,
|
|
25
|
-
adds sampling noise to the ensemble predictions, and calculates the rank of the observed
|
|
26
|
-
value within the perturbed ensemble for each observation. The rank indicates the position
|
|
27
|
-
of the observed value within the sorted ensemble values, with 1 being the lowest. If the
|
|
28
|
-
observed value is larger than the largest ensemble member, its rank is set to the ensemble
|
|
29
|
-
size plus one.
|
|
30
|
-
|
|
31
|
-
Parameters:
|
|
32
|
-
df (pd.DataFrame): A DataFrame with columns for mean, standard deviation, observed values,
|
|
33
|
-
ensemble size, and observation type. The DataFrame should have one row per observation.
|
|
34
|
-
|
|
35
|
-
Returns:
|
|
36
|
-
tuple: A tuple containing the rank array, ensemble size, and a result DataFrame. The result
|
|
37
|
-
DataFrame contains columns for 'rank' and 'obstype'.
|
|
38
|
-
"""
|
|
39
|
-
ensemble_values = df.filter(regex='prior_ensemble_member').to_numpy().copy()
|
|
40
|
-
std_dev = np.sqrt(df['obs_err_var']).to_numpy()
|
|
41
|
-
obsvalue = df['observation'].to_numpy()
|
|
42
|
-
obstype = df['type'].to_numpy()
|
|
43
|
-
ens_size = ensemble_values.shape[1]
|
|
44
|
-
mean = 0.0 # mean of the sampling noise
|
|
45
|
-
rank = np.zeros(obsvalue.shape[0], dtype=int)
|
|
46
|
-
|
|
47
|
-
for obs in range(ensemble_values.shape[0]):
|
|
48
|
-
sampling_noise = np.random.normal(mean, std_dev[obs], ens_size)
|
|
49
|
-
ensemble_values[obs] += sampling_noise
|
|
50
|
-
ensemble_values[obs].sort()
|
|
51
|
-
for i, ens in enumerate(ensemble_values[obs]):
|
|
52
|
-
if obsvalue[obs] <= ens:
|
|
53
|
-
rank[obs] = i + 1
|
|
54
|
-
break
|
|
55
|
-
|
|
56
|
-
if rank[obs] == 0: # observation is larger than largest ensemble member
|
|
57
|
-
rank[obs] = ens_size + 1
|
|
58
|
-
|
|
59
|
-
result_df = pd.DataFrame({
|
|
60
|
-
'rank': rank,
|
|
61
|
-
'obstype': obstype
|
|
62
|
-
})
|
|
63
|
-
|
|
64
|
-
return (rank, ens_size, result_df)
|
|
65
|
-
|
|
66
|
-
def plot_profile(df, levels, verticalUnit = "pressure (Pa)"):
|
|
67
|
-
"""
|
|
68
|
-
Plots RMSE, bias, and total spread profiles for different observation types across specified vertical levels.
|
|
69
|
-
|
|
70
|
-
This function takes a DataFrame containing observational data and model predictions, categorizes
|
|
71
|
-
the data into specified vertical levels, and calculates the RMSE, bias and total spread for each level and
|
|
72
|
-
observation type. It then plots three line charts: one for RMSE, one for bias, one for total spread, as functions
|
|
73
|
-
of vertical level. The vertical levels are plotted on the y-axis in reversed order to represent
|
|
74
|
-
the vertical profile in the atmosphere correctly if the vertical units are pressure.
|
|
28
|
+
def plot_profile(df_in, verticalUnit):
|
|
29
|
+
"""Assumes diag_stats has been run on the dataframe and the resulting dataframe is passed in"""
|
|
75
30
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
Bias.
|
|
80
|
-
levels (array-like): The bin edges for categorizing the 'vertical' column values into the desired
|
|
81
|
-
vertical levels.
|
|
82
|
-
verticalUnit (string) (optional): The vertical unit to be used. Only observations in df which have this
|
|
83
|
-
string in the vert_unit column will be plotted. Defaults to 'pressure (Pa)'.
|
|
84
|
-
|
|
85
|
-
Returns:
|
|
86
|
-
tuple: A tuple containing the DataFrame with RMSE, bias and total spread calculations,
|
|
87
|
-
The DataFrame includes a 'vlevels' column representing the categorized vertical levels
|
|
88
|
-
and 'midpoint' column representing the midpoint of each vertical level bin. And the three figures.
|
|
89
|
-
|
|
90
|
-
Raises:
|
|
91
|
-
ValueError: If there are missing values in the 'vertical' column of the input DataFrame.
|
|
92
|
-
ValueError: If none of the input obs have 'verticalUnit' in the 'vert_unit' column of the input DataFrame.
|
|
93
|
-
|
|
94
|
-
Note:
|
|
95
|
-
- The function modifies the input DataFrame by adding 'vlevels' and 'midpoint' columns.
|
|
96
|
-
- The 'midpoint' values are calculated as half the midpoint of each vertical level bin, which may need
|
|
97
|
-
adjustment based on the specific requirements for vertical level representation.
|
|
98
|
-
- The plots are generated using Plotly Express and are displayed inline. The y-axis of the plots is
|
|
99
|
-
reversed to align with standard atmospheric pressure level representation if the vertical units
|
|
100
|
-
are atmospheric pressure.
|
|
101
|
-
"""
|
|
102
|
-
|
|
103
|
-
pd.options.mode.copy_on_write = True
|
|
104
|
-
if df['vertical'].isnull().values.any(): # what about horizontal observations?
|
|
105
|
-
raise ValueError("Missing values in 'vertical' column.")
|
|
106
|
-
elif verticalUnit not in df['vert_unit'].values:
|
|
107
|
-
raise ValueError("No obs with expected vertical unit '"+verticalUnit+"'.")
|
|
108
|
-
else:
|
|
109
|
-
df = df[df["vert_unit"].isin({verticalUnit})] # Subset to only rows with the correct vertical unit
|
|
110
|
-
df.loc[:,'vlevels'] = pd.cut(df['vertical'], levels)
|
|
111
|
-
if verticalUnit == "pressure (Pa)":
|
|
112
|
-
df.loc[:,'midpoint'] = df['vlevels'].apply(lambda x: x.mid / 100.) # HK todo units
|
|
113
|
-
else:
|
|
114
|
-
df.loc[:,'midpoint'] = df['vlevels'].apply(lambda x: x.mid)
|
|
115
|
-
|
|
116
|
-
# Calculations
|
|
117
|
-
df_profile_prior = rmse_bias_totalspread(df, phase='prior')
|
|
118
|
-
df_profile_posterior = None
|
|
119
|
-
if 'posterior_ensemble_mean' in df.columns:
|
|
120
|
-
df_profile_posterior = rmse_bias_totalspread(df, phase='posterior')
|
|
121
|
-
|
|
122
|
-
# Merge prior and posterior dataframes
|
|
123
|
-
if df_profile_posterior is not None:
|
|
124
|
-
df_profile = pd.merge(df_profile_prior, df_profile_posterior, on=['midpoint', 'type'], suffixes=('_prior', '_posterior'))
|
|
125
|
-
fig_rmse = plot_profile_prior_post(df_profile, 'rmse', verticalUnit)
|
|
31
|
+
df = stats.layer_statistics(df_in)
|
|
32
|
+
if "posterior_rmse" in df.columns:
|
|
33
|
+
fig_rmse = plot_profile_prior_post(df, "rmse", verticalUnit)
|
|
126
34
|
fig_rmse.show()
|
|
127
|
-
fig_bias = plot_profile_prior_post(
|
|
35
|
+
fig_bias = plot_profile_prior_post(df, "bias", verticalUnit)
|
|
128
36
|
fig_bias.show()
|
|
129
|
-
fig_ts = plot_profile_prior_post(
|
|
37
|
+
fig_ts = plot_profile_prior_post(df, "totalspread", verticalUnit)
|
|
130
38
|
fig_ts.show()
|
|
131
39
|
else:
|
|
132
|
-
|
|
133
|
-
fig_rmse = plot_profile_prior(df_profile, 'rmse', verticalUnit)
|
|
40
|
+
fig_rmse = plot_profile_prior(df, "rmse", verticalUnit)
|
|
134
41
|
fig_rmse.show()
|
|
135
|
-
fig_bias = plot_profile_prior(
|
|
42
|
+
fig_bias = plot_profile_prior(df, "bias", verticalUnit)
|
|
136
43
|
fig_bias.show()
|
|
137
|
-
fig_ts = plot_profile_prior(
|
|
138
|
-
fig_ts.show()
|
|
44
|
+
fig_ts = plot_profile_prior(df, "totalspread", verticalUnit)
|
|
45
|
+
fig_ts.show()
|
|
46
|
+
|
|
47
|
+
return fig_rmse, fig_ts, fig_bias
|
|
48
|
+
|
|
139
49
|
|
|
140
|
-
return df_profile, fig_rmse, fig_ts, fig_bias
|
|
141
|
-
|
|
142
50
|
def plot_profile_prior_post(df_profile, stat, verticalUnit):
|
|
143
51
|
"""
|
|
144
52
|
Plots prior and posterior statistics by vertical level for different observation types.
|
|
@@ -151,22 +59,27 @@ def plot_profile_prior_post(df_profile, stat, verticalUnit):
|
|
|
151
59
|
Returns:
|
|
152
60
|
plotly.graph_objects.Figure: The generated Plotly figure.
|
|
153
61
|
"""
|
|
62
|
+
# Filter the DataFrame to include only rows with the required verticalUnit
|
|
63
|
+
df_filtered = df_profile[df_profile["vert_unit"] == verticalUnit]
|
|
64
|
+
|
|
154
65
|
# Reshape DataFrame to long format for easier plotting
|
|
155
66
|
df_long = pd.melt(
|
|
156
67
|
df_profile,
|
|
157
68
|
id_vars=["midpoint", "type"],
|
|
158
|
-
value_vars=["prior_"+stat, "posterior_"+stat],
|
|
159
|
-
var_name=stat+"_type",
|
|
160
|
-
value_name=stat+"_value"
|
|
69
|
+
value_vars=["prior_" + stat, "posterior_" + stat],
|
|
70
|
+
var_name=stat + "_type",
|
|
71
|
+
value_name=stat + "_value",
|
|
161
72
|
)
|
|
162
73
|
|
|
163
74
|
# Define a color mapping for observation each type
|
|
164
75
|
unique_types = df_long["type"].unique()
|
|
165
76
|
colors = px.colors.qualitative.Plotly
|
|
166
|
-
color_mapping = {
|
|
77
|
+
color_mapping = {
|
|
78
|
+
type_: colors[i % len(colors)] for i, type_ in enumerate(unique_types)
|
|
79
|
+
}
|
|
167
80
|
|
|
168
81
|
# Create a mapping for line styles based on stat
|
|
169
|
-
line_styles = {"prior_"+stat: "solid", "posterior_"+stat: "dash"}
|
|
82
|
+
line_styles = {"prior_" + stat: "solid", "posterior_" + stat: "dash"}
|
|
170
83
|
|
|
171
84
|
# Create the figure
|
|
172
85
|
fig_stat = go.Figure()
|
|
@@ -175,27 +88,35 @@ def plot_profile_prior_post(df_profile, stat, verticalUnit):
|
|
|
175
88
|
for t in df_long["type"].unique():
|
|
176
89
|
for stat_type, dash_style in line_styles.items():
|
|
177
90
|
# Filter the DataFrame for this type and stat
|
|
178
|
-
df_filtered = df_long[
|
|
179
|
-
|
|
91
|
+
df_filtered = df_long[
|
|
92
|
+
(df_long[stat + "_type"] == stat_type) & (df_long["type"] == t)
|
|
93
|
+
]
|
|
94
|
+
|
|
180
95
|
# Add a trace
|
|
181
|
-
fig_stat.add_trace(
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
96
|
+
fig_stat.add_trace(
|
|
97
|
+
go.Scatter(
|
|
98
|
+
x=df_filtered[stat + "_value"],
|
|
99
|
+
y=df_filtered["midpoint"],
|
|
100
|
+
mode="lines+markers",
|
|
101
|
+
name=(
|
|
102
|
+
"prior " + t if stat_type == "prior_" + stat else "post "
|
|
103
|
+
), # Show legend for "prior_stat OBS TYPE" only
|
|
104
|
+
line=dict(
|
|
105
|
+
dash=dash_style, color=color_mapping[t]
|
|
106
|
+
), # Same color for all traces in group
|
|
107
|
+
marker=dict(size=8, color=color_mapping[t]),
|
|
108
|
+
legendgroup=t, # Group traces by type
|
|
109
|
+
)
|
|
110
|
+
)
|
|
190
111
|
|
|
191
112
|
# Update layout
|
|
192
113
|
fig_stat.update_layout(
|
|
193
|
-
title=
|
|
114
|
+
title=stat + " by Level",
|
|
194
115
|
xaxis_title=stat,
|
|
195
116
|
yaxis_title=verticalUnit,
|
|
196
117
|
width=800,
|
|
197
118
|
height=800,
|
|
198
|
-
template="plotly_white"
|
|
119
|
+
template="plotly_white",
|
|
199
120
|
)
|
|
200
121
|
|
|
201
122
|
if verticalUnit == "pressure (Pa)":
|
|
@@ -221,15 +142,17 @@ def plot_profile_prior(df_profile, stat, verticalUnit):
|
|
|
221
142
|
df_long = pd.melt(
|
|
222
143
|
df_profile,
|
|
223
144
|
id_vars=["midpoint", "type"],
|
|
224
|
-
value_vars=["prior_"+stat],
|
|
225
|
-
var_name=stat+"_type",
|
|
226
|
-
value_name=stat+"_value"
|
|
145
|
+
value_vars=["prior_" + stat],
|
|
146
|
+
var_name=stat + "_type",
|
|
147
|
+
value_name=stat + "_value",
|
|
227
148
|
)
|
|
228
149
|
|
|
229
150
|
# Define a color mapping for observation each type
|
|
230
151
|
unique_types = df_long["type"].unique()
|
|
231
152
|
colors = px.colors.qualitative.Plotly
|
|
232
|
-
color_mapping = {
|
|
153
|
+
color_mapping = {
|
|
154
|
+
type_: colors[i % len(colors)] for i, type_ in enumerate(unique_types)
|
|
155
|
+
}
|
|
233
156
|
|
|
234
157
|
# Create the figure
|
|
235
158
|
fig_stat = go.Figure()
|
|
@@ -238,102 +161,31 @@ def plot_profile_prior(df_profile, stat, verticalUnit):
|
|
|
238
161
|
for t in df_long["type"].unique():
|
|
239
162
|
# Filter the DataFrame for this type and stat
|
|
240
163
|
df_filtered = df_long[(df_long["type"] == t)]
|
|
241
|
-
|
|
164
|
+
|
|
242
165
|
# Add a trace
|
|
243
|
-
fig_stat.add_trace(
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
166
|
+
fig_stat.add_trace(
|
|
167
|
+
go.Scatter(
|
|
168
|
+
x=df_filtered[stat + "_value"],
|
|
169
|
+
y=df_filtered["midpoint"],
|
|
170
|
+
mode="lines+markers",
|
|
171
|
+
name="prior " + t,
|
|
172
|
+
line=dict(color=color_mapping[t]), # Same color for all traces in group
|
|
173
|
+
marker=dict(size=8, color=color_mapping[t]),
|
|
174
|
+
legendgroup=t, # Group traces by type
|
|
175
|
+
)
|
|
176
|
+
)
|
|
252
177
|
|
|
253
178
|
# Update layout
|
|
254
179
|
fig_stat.update_layout(
|
|
255
|
-
title=stat +
|
|
180
|
+
title=stat + " by Level",
|
|
256
181
|
xaxis_title=stat,
|
|
257
182
|
yaxis_title=verticalUnit,
|
|
258
183
|
width=800,
|
|
259
184
|
height=800,
|
|
260
|
-
template="plotly_white"
|
|
185
|
+
template="plotly_white",
|
|
261
186
|
)
|
|
262
187
|
|
|
263
188
|
if verticalUnit == "pressure (Pa)":
|
|
264
189
|
fig_stat.update_yaxes(autorange="reversed")
|
|
265
|
-
|
|
266
|
-
return fig_stat
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
def mean_then_sqrt(x):
|
|
270
|
-
"""
|
|
271
|
-
Calculates the mean of an array-like object and then takes the square root of the result.
|
|
272
|
-
|
|
273
|
-
Parameters:
|
|
274
|
-
arr (array-like): An array-like object (such as a list or a pandas Series).
|
|
275
|
-
The elements should be numeric.
|
|
276
|
-
|
|
277
|
-
Returns:
|
|
278
|
-
float: The square root of the mean of the input array.
|
|
279
|
-
|
|
280
|
-
Raises:
|
|
281
|
-
TypeError: If the input is not an array-like object containing numeric values.
|
|
282
|
-
ValueError: If the input array is empty.
|
|
283
|
-
"""
|
|
284
|
-
|
|
285
|
-
return np.sqrt(np.mean(x))
|
|
286
|
-
|
|
287
|
-
def rmse_bias_totalspread(df, phase='prior'):
|
|
288
|
-
if phase == 'prior':
|
|
289
|
-
sq_err_column = 'prior_sq_err'
|
|
290
|
-
bias_column = 'prior_bias'
|
|
291
|
-
rmse_column = 'prior_rmse'
|
|
292
|
-
spread_column = 'prior_ensemble_spread'
|
|
293
|
-
totalspread_column = 'prior_totalspread'
|
|
294
|
-
elif phase == 'posterior':
|
|
295
|
-
sq_err_column = 'posterior_sq_err'
|
|
296
|
-
bias_column = 'posterior_bias'
|
|
297
|
-
rmse_column = 'posterior_rmse'
|
|
298
|
-
spread_column = 'posterior_ensemble_spread'
|
|
299
|
-
totalspread_column = 'posterior_totalspread'
|
|
300
|
-
else:
|
|
301
|
-
raise ValueError("Invalid phase. Must be 'prior' or 'posterior'.")
|
|
302
|
-
|
|
303
|
-
rmse_bias_ts_df = df.groupby(['midpoint', 'type'], observed=False).agg({
|
|
304
|
-
sq_err_column: mean_then_sqrt,
|
|
305
|
-
bias_column: 'mean',
|
|
306
|
-
spread_column: mean_then_sqrt,
|
|
307
|
-
'obs_err_var': mean_then_sqrt
|
|
308
|
-
}).reset_index()
|
|
309
|
-
|
|
310
|
-
# Add column for totalspread
|
|
311
|
-
rmse_bias_ts_df[totalspread_column] = np.sqrt(rmse_bias_ts_df[spread_column] + rmse_bias_ts_df['obs_err_var'])
|
|
312
|
-
|
|
313
|
-
# Rename square error to root mean square error
|
|
314
|
-
rmse_bias_ts_df.rename(columns={sq_err_column: rmse_column}, inplace=True)
|
|
315
|
-
|
|
316
|
-
return rmse_bias_ts_df
|
|
317
|
-
|
|
318
|
-
def rmse_bias_by_obs_type(df, obs_type):
|
|
319
|
-
"""
|
|
320
|
-
Calculate the RMSE and bias for a given observation type.
|
|
321
|
-
|
|
322
|
-
Parameters:
|
|
323
|
-
df (DataFrame): A pandas DataFrame.
|
|
324
|
-
obs_type (str): The observation type for which to calculate the RMSE and bias.
|
|
325
|
-
|
|
326
|
-
Returns:
|
|
327
|
-
DataFrame: A DataFrame containing the RMSE and bias for the given observation type.
|
|
328
|
-
|
|
329
|
-
Raises:
|
|
330
|
-
ValueError: If the observation type is not present in the DataFrame.
|
|
331
|
-
"""
|
|
332
|
-
if obs_type not in df['type'].unique():
|
|
333
|
-
raise ValueError(f"Observation type '{obs_type}' not found in DataFrame.")
|
|
334
|
-
else:
|
|
335
|
-
obs_type_df = df[df['type'] == obs_type]
|
|
336
|
-
obs_type_agg = obs_type_df.groupby('vlevels', observed=False).agg({'sq_err':mean_then_sqrt, 'bias':'mean'}).reset_index()
|
|
337
|
-
obs_type_agg.rename(columns={'sq_err':'rmse'}, inplace=True)
|
|
338
|
-
return obs_type_agg
|
|
339
190
|
|
|
191
|
+
return fig_stat
|
|
File without changes
|