pydartdiags 0.0.43__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pydartdiags might be problematic.

@@ -1,144 +1,52 @@
-
+# SPDX-License-Identifier: Apache-2.0
 import numpy as np
 import plotly.express as px
 import plotly.graph_objects as go
 import pandas as pd
+from pydartdiags.stats import stats
+
 
-def plot_rank_histogram(df):
+def plot_rank_histogram(df, phase, ens_size):
     """
     Plots a rank histogram colored by observation type.
 
-    All histogram bars are initalized to be hidden and can be toggled visible in the plot's legend
+    All histogram bars are initialized to be hidden and can be toggled visible in the plot's legend
     """
-    _, _, df_hist = calculate_rank(df)
-    fig = px.histogram(df_hist, x='rank', color='obstype', title='Histogram Colored by obstype')
+    fig = px.histogram(
+        df,
+        x=f"{phase}_rank",
+        color="type",
+        title="Histogram Colored by obs type",
+        nbins=ens_size,
+    )
+    fig.update_xaxes(range=[1, ens_size + 1])
     for trace in fig.data:
-        trace.visible = 'legendonly'
+        trace.visible = "legendonly"
     fig.show()
 
 
-def calculate_rank(df):
-    """
-    Calculate the rank of observations within an ensemble.
-
-    This function takes a DataFrame containing ensemble predictions and observed values,
-    adds sampling noise to the ensemble predictions, and calculates the rank of the observed
-    value within the perturbed ensemble for each observation. The rank indicates the position
-    of the observed value within the sorted ensemble values, with 1 being the lowest. If the
-    observed value is larger than the largest ensemble member, its rank is set to the ensemble
-    size plus one.
-
-    Parameters:
-        df (pd.DataFrame): A DataFrame with columns for mean, standard deviation, observed values,
-        ensemble size, and observation type. The DataFrame should have one row per observation.
-
-    Returns:
-        tuple: A tuple containing the rank array, ensemble size, and a result DataFrame. The result
-        DataFrame contains columns for 'rank' and 'obstype'.
-    """
-    ensemble_values = df.filter(regex='prior_ensemble_member').to_numpy().copy()
-    std_dev = np.sqrt(df['obs_err_var']).to_numpy()
-    obsvalue = df['observation'].to_numpy()
-    obstype = df['type'].to_numpy()
-    ens_size = ensemble_values.shape[1]
-    mean = 0.0  # mean of the sampling noise
-    rank = np.zeros(obsvalue.shape[0], dtype=int)
-
-    for obs in range(ensemble_values.shape[0]):
-        sampling_noise = np.random.normal(mean, std_dev[obs], ens_size)
-        ensemble_values[obs] += sampling_noise
-        ensemble_values[obs].sort()
-        for i, ens in enumerate(ensemble_values[obs]):
-            if obsvalue[obs] <= ens:
-                rank[obs] = i + 1
-                break
-
-        if rank[obs] == 0:  # observation is larger than largest ensemble member
-            rank[obs] = ens_size + 1
-
-    result_df = pd.DataFrame({
-        'rank': rank,
-        'obstype': obstype
-    })
-
-    return (rank, ens_size, result_df)
-
-def plot_profile(df, levels, verticalUnit = "pressure (Pa)"):
-    """
-    Plots RMSE, bias, and total spread profiles for different observation types across specified vertical levels.
-
-    This function takes a DataFrame containing observational data and model predictions, categorizes
-    the data into specified vertical levels, and calculates the RMSE, bias and total spread for each level and
-    observation type. It then plots three line charts: one for RMSE, one for bias, one for total spread, as functions
-    of vertical level. The vertical levels are plotted on the y-axis in reversed order to represent
-    the vertical profile in the atmosphere correctly if the vertical units are pressure.
+def plot_profile(df_in, verticalUnit):
+    """Assumes diag_stats has been run on the dataframe and the resulting dataframe is passed in"""
 
-    Parameters:
-        df (pd.DataFrame): The input DataFrame containing at least the 'vertical' column for vertical levels,
-        the vert_unit column, and other columns required by the `rmse_bias` function for calculating RMSE and
-        Bias.
-        levels (array-like): The bin edges for categorizing the 'vertical' column values into the desired
-        vertical levels.
-        verticalUnit (string) (optional): The vertical unit to be used. Only observations in df which have this
-        string in the vert_unit column will be plotted. Defaults to 'pressure (Pa)'.
-
-    Returns:
-        tuple: A tuple containing the DataFrame with RMSE, bias and total spread calculations,
-        The DataFrame includes a 'vlevels' column representing the categorized vertical levels
-        and 'midpoint' column representing the midpoint of each vertical level bin. And the three figures.
-
-    Raises:
-        ValueError: If there are missing values in the 'vertical' column of the input DataFrame.
-        ValueError: If none of the input obs have 'verticalUnit' in the 'vert_unit' column of the input DataFrame.
-
-    Note:
-        - The function modifies the input DataFrame by adding 'vlevels' and 'midpoint' columns.
-        - The 'midpoint' values are calculated as half the midpoint of each vertical level bin, which may need
-          adjustment based on the specific requirements for vertical level representation.
-        - The plots are generated using Plotly Express and are displayed inline. The y-axis of the plots is
-          reversed to align with standard atmospheric pressure level representation if the vertical units
-          are atmospheric pressure.
-    """
-
-    pd.options.mode.copy_on_write = True
-    if df['vertical'].isnull().values.any():  # what about horizontal observations?
-        raise ValueError("Missing values in 'vertical' column.")
-    elif verticalUnit not in df['vert_unit'].values:
-        raise ValueError("No obs with expected vertical unit '"+verticalUnit+"'.")
-    else:
-        df = df[df["vert_unit"].isin({verticalUnit})]  # Subset to only rows with the correct vertical unit
-    df.loc[:,'vlevels'] = pd.cut(df['vertical'], levels)
-    if verticalUnit == "pressure (Pa)":
-        df.loc[:,'midpoint'] = df['vlevels'].apply(lambda x: x.mid / 100.)  # HK todo units
-    else:
-        df.loc[:,'midpoint'] = df['vlevels'].apply(lambda x: x.mid)
-
-    # Calculations
-    df_profile_prior = rmse_bias_totalspread(df, phase='prior')
-    df_profile_posterior = None
-    if 'posterior_ensemble_mean' in df.columns:
-        df_profile_posterior = rmse_bias_totalspread(df, phase='posterior')
-
-    # Merge prior and posterior dataframes
-    if df_profile_posterior is not None:
-        df_profile = pd.merge(df_profile_prior, df_profile_posterior, on=['midpoint', 'type'], suffixes=('_prior', '_posterior'))
-        fig_rmse = plot_profile_prior_post(df_profile, 'rmse', verticalUnit)
+    df = stats.layer_statistics(df_in)
+    if "posterior_rmse" in df.columns:
+        fig_rmse = plot_profile_prior_post(df, "rmse", verticalUnit)
         fig_rmse.show()
-        fig_bias = plot_profile_prior_post(df_profile, 'bias', verticalUnit)
+        fig_bias = plot_profile_prior_post(df, "bias", verticalUnit)
         fig_bias.show()
-        fig_ts = plot_profile_prior_post(df_profile, 'totalspread', verticalUnit)
+        fig_ts = plot_profile_prior_post(df, "totalspread", verticalUnit)
         fig_ts.show()
     else:
-        df_profile = df_profile_prior
-        fig_rmse = plot_profile_prior(df_profile, 'rmse', verticalUnit)
+        fig_rmse = plot_profile_prior(df, "rmse", verticalUnit)
        fig_rmse.show()
-        fig_bias = plot_profile_prior(df_profile, 'bias', verticalUnit)
+        fig_bias = plot_profile_prior(df, "bias", verticalUnit)
         fig_bias.show()
-        fig_ts = plot_profile_prior(df_profile, 'totalspread', verticalUnit)
-        fig_ts.show()
+        fig_ts = plot_profile_prior(df, "totalspread", verticalUnit)
+        fig_ts.show()
+
+    return fig_rmse, fig_ts, fig_bias
+
 
-    return df_profile, fig_rmse, fig_ts, fig_bias
-
 def plot_profile_prior_post(df_profile, stat, verticalUnit):
     """
     Plots prior and posterior statistics by vertical level for different observation types.
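
For orientation, a minimal usage sketch of the new plot_rank_histogram signature shown above. The import path and the synthetic DataFrame are assumptions for illustration only: in 0.5.1 the rank columns (for example prior_rank) and the type column are expected to be present already, produced elsewhere in the package rather than by the removed calculate_rank helper.

import numpy as np
import pandas as pd

from pydartdiags.plots import plots  # assumed module path; not shown in this diff

rng = np.random.default_rng(0)
ens_size = 10

# Synthetic stand-in for an observation-space DataFrame that already carries
# a per-observation rank column and an observation type column.
df = pd.DataFrame(
    {
        "type": rng.choice(["RADIOSONDE_TEMPERATURE", "AIRCRAFT_TEMPERATURE"], 500),
        "prior_rank": rng.integers(1, ens_size + 2, 500),  # ranks run 1..ens_size+1
    }
)

# New signature: the x column is chosen by phase, the bin count matches the
# ensemble size, and every bar starts hidden until toggled in the legend.
plots.plot_rank_histogram(df, phase="prior", ens_size=ens_size)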
@@ -151,22 +59,27 @@ def plot_profile_prior_post(df_profile, stat, verticalUnit):
     Returns:
         plotly.graph_objects.Figure: The generated Plotly figure.
     """
+    # Filter the DataFrame to include only rows with the required verticalUnit
+    df_filtered = df_profile[df_profile["vert_unit"] == verticalUnit]
+
     # Reshape DataFrame to long format for easier plotting
     df_long = pd.melt(
         df_profile,
         id_vars=["midpoint", "type"],
-        value_vars=["prior_"+stat, "posterior_"+stat],
-        var_name=stat+"_type",
-        value_name=stat+"_value"
+        value_vars=["prior_" + stat, "posterior_" + stat],
+        var_name=stat + "_type",
+        value_name=stat + "_value",
     )
 
     # Define a color mapping for observation each type
     unique_types = df_long["type"].unique()
     colors = px.colors.qualitative.Plotly
-    color_mapping = {type_: colors[i % len(colors)] for i, type_ in enumerate(unique_types)}
+    color_mapping = {
+        type_: colors[i % len(colors)] for i, type_ in enumerate(unique_types)
+    }
 
     # Create a mapping for line styles based on stat
-    line_styles = {"prior_"+stat: "solid", "posterior_"+stat: "dash"}
+    line_styles = {"prior_" + stat: "solid", "posterior_" + stat: "dash"}
 
     # Create the figure
     fig_stat = go.Figure()
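
As a side note on the reshaping in the hunk above, here is a self-contained illustration of what the pd.melt call produces for a prior/posterior statistic pair. The column names follow the diff; the numbers are invented.

import pandas as pd

# Toy layer-statistics table with the column names used in the melt above.
df_profile = pd.DataFrame(
    {
        "midpoint": [850.0, 500.0],
        "type": ["RADIOSONDE_TEMPERATURE", "RADIOSONDE_TEMPERATURE"],
        "prior_rmse": [1.2, 0.9],
        "posterior_rmse": [0.8, 0.7],
    }
)

stat = "rmse"
df_long = pd.melt(
    df_profile,
    id_vars=["midpoint", "type"],
    value_vars=["prior_" + stat, "posterior_" + stat],
    var_name=stat + "_type",
    value_name=stat + "_value",
)
print(df_long)
# Each (midpoint, type) row becomes two long-format rows, one per rmse_type
# ("prior_rmse" / "posterior_rmse"); the plotting loop then filters on
# rmse_type and type to draw one solid and one dashed trace per obs type.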
@@ -175,27 +88,35 @@ def plot_profile_prior_post(df_profile, stat, verticalUnit):
     for t in df_long["type"].unique():
         for stat_type, dash_style in line_styles.items():
             # Filter the DataFrame for this type and stat
-            df_filtered = df_long[(df_long[stat+"_type"] == stat_type) & (df_long["type"] == t)]
-
+            df_filtered = df_long[
+                (df_long[stat + "_type"] == stat_type) & (df_long["type"] == t)
+            ]
+
             # Add a trace
-            fig_stat.add_trace(go.Scatter(
-                x=df_filtered[stat+"_value"],
-                y=df_filtered["midpoint"],
-                mode='lines+markers',
-                name='prior '+t if stat_type == "prior_"+stat else 'post ',  # Show legend for "prior_stat OBS TYPE" only
-                line=dict(dash=dash_style, color=color_mapping[t]),  # Same color for all traces in group
-                marker=dict(size=8, color=color_mapping[t]),
-                legendgroup=t  # Group traces by type
-            ))
+            fig_stat.add_trace(
+                go.Scatter(
+                    x=df_filtered[stat + "_value"],
+                    y=df_filtered["midpoint"],
+                    mode="lines+markers",
+                    name=(
+                        "prior " + t if stat_type == "prior_" + stat else "post "
+                    ),  # Show legend for "prior_stat OBS TYPE" only
+                    line=dict(
+                        dash=dash_style, color=color_mapping[t]
+                    ),  # Same color for all traces in group
+                    marker=dict(size=8, color=color_mapping[t]),
+                    legendgroup=t,  # Group traces by type
+                )
+            )
 
     # Update layout
     fig_stat.update_layout(
-        title= stat+' by Level',
+        title=stat + " by Level",
         xaxis_title=stat,
         yaxis_title=verticalUnit,
         width=800,
         height=800,
-        template="plotly_white"
+        template="plotly_white",
     )
 
     if verticalUnit == "pressure (Pa)":
@@ -221,15 +142,17 @@ def plot_profile_prior(df_profile, stat, verticalUnit):
     df_long = pd.melt(
         df_profile,
         id_vars=["midpoint", "type"],
-        value_vars=["prior_"+stat],
-        var_name=stat+"_type",
-        value_name=stat+"_value"
+        value_vars=["prior_" + stat],
+        var_name=stat + "_type",
+        value_name=stat + "_value",
     )
 
     # Define a color mapping for observation each type
     unique_types = df_long["type"].unique()
     colors = px.colors.qualitative.Plotly
-    color_mapping = {type_: colors[i % len(colors)] for i, type_ in enumerate(unique_types)}
+    color_mapping = {
+        type_: colors[i % len(colors)] for i, type_ in enumerate(unique_types)
+    }
 
     # Create the figure
     fig_stat = go.Figure()
@@ -238,102 +161,31 @@ def plot_profile_prior(df_profile, stat, verticalUnit):
     for t in df_long["type"].unique():
         # Filter the DataFrame for this type and stat
         df_filtered = df_long[(df_long["type"] == t)]
-
+
         # Add a trace
-        fig_stat.add_trace(go.Scatter(
-            x=df_filtered[stat+"_value"],
-            y=df_filtered["midpoint"],
-            mode='lines+markers',
-            name='prior ' + t,
-            line=dict(color=color_mapping[t]),  # Same color for all traces in group
-            marker=dict(size=8, color=color_mapping[t]),
-            legendgroup=t  # Group traces by type
-        ))
+        fig_stat.add_trace(
+            go.Scatter(
+                x=df_filtered[stat + "_value"],
+                y=df_filtered["midpoint"],
+                mode="lines+markers",
+                name="prior " + t,
+                line=dict(color=color_mapping[t]),  # Same color for all traces in group
+                marker=dict(size=8, color=color_mapping[t]),
+                legendgroup=t,  # Group traces by type
+            )
+        )
 
     # Update layout
     fig_stat.update_layout(
-        title=stat + ' by Level',
+        title=stat + " by Level",
         xaxis_title=stat,
         yaxis_title=verticalUnit,
         width=800,
         height=800,
-        template="plotly_white"
+        template="plotly_white",
     )
 
     if verticalUnit == "pressure (Pa)":
         fig_stat.update_yaxes(autorange="reversed")
-
-    return fig_stat
-
-
-def mean_then_sqrt(x):
-    """
-    Calculates the mean of an array-like object and then takes the square root of the result.
-
-    Parameters:
-        arr (array-like): An array-like object (such as a list or a pandas Series).
-        The elements should be numeric.
-
-    Returns:
-        float: The square root of the mean of the input array.
-
-    Raises:
-        TypeError: If the input is not an array-like object containing numeric values.
-        ValueError: If the input array is empty.
-    """
-
-    return np.sqrt(np.mean(x))
-
-def rmse_bias_totalspread(df, phase='prior'):
-    if phase == 'prior':
-        sq_err_column = 'prior_sq_err'
-        bias_column = 'prior_bias'
-        rmse_column = 'prior_rmse'
-        spread_column = 'prior_ensemble_spread'
-        totalspread_column = 'prior_totalspread'
-    elif phase == 'posterior':
-        sq_err_column = 'posterior_sq_err'
-        bias_column = 'posterior_bias'
-        rmse_column = 'posterior_rmse'
-        spread_column = 'posterior_ensemble_spread'
-        totalspread_column = 'posterior_totalspread'
-    else:
-        raise ValueError("Invalid phase. Must be 'prior' or 'posterior'.")
-
-    rmse_bias_ts_df = df.groupby(['midpoint', 'type'], observed=False).agg({
-        sq_err_column: mean_then_sqrt,
-        bias_column: 'mean',
-        spread_column: mean_then_sqrt,
-        'obs_err_var': mean_then_sqrt
-    }).reset_index()
-
-    # Add column for totalspread
-    rmse_bias_ts_df[totalspread_column] = np.sqrt(rmse_bias_ts_df[spread_column] + rmse_bias_ts_df['obs_err_var'])
-
-    # Rename square error to root mean square error
-    rmse_bias_ts_df.rename(columns={sq_err_column: rmse_column}, inplace=True)
-
-    return rmse_bias_ts_df
-
-def rmse_bias_by_obs_type(df, obs_type):
-    """
-    Calculate the RMSE and bias for a given observation type.
-
-    Parameters:
-        df (DataFrame): A pandas DataFrame.
-        obs_type (str): The observation type for which to calculate the RMSE and bias.
-
-    Returns:
-        DataFrame: A DataFrame containing the RMSE and bias for the given observation type.
-
-    Raises:
-        ValueError: If the observation type is not present in the DataFrame.
-    """
-    if obs_type not in df['type'].unique():
-        raise ValueError(f"Observation type '{obs_type}' not found in DataFrame.")
-    else:
-        obs_type_df = df[df['type'] == obs_type]
-        obs_type_agg = obs_type_df.groupby('vlevels', observed=False).agg({'sq_err':mean_then_sqrt, 'bias':'mean'}).reset_index()
-        obs_type_agg.rename(columns={'sq_err':'rmse'}, inplace=True)
-        return obs_type_agg
 
+    return fig_stat
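
The rmse_bias_totalspread, mean_then_sqrt, and rmse_bias_by_obs_type helpers removed above correspond to aggregation that 0.5.1 delegates to pydartdiags.stats. For readers comparing versions, a self-contained restatement of the removed prior-phase aggregation on toy numbers (this mirrors the deleted code, not the new stats module, whose implementation is not part of this diff):

import numpy as np
import pandas as pd

def mean_then_sqrt(x):
    # Same helper as the removed code: square root of the mean of the values.
    return np.sqrt(np.mean(x))

# Toy per-observation table (values invented) with the columns the removed
# rmse_bias_totalspread(df, phase='prior') aggregated over.
df = pd.DataFrame(
    {
        "midpoint": [850.0, 850.0, 500.0, 500.0],
        "type": ["RADIOSONDE_TEMPERATURE"] * 4,
        "prior_sq_err": [1.0, 2.0, 0.5, 1.5],
        "prior_bias": [0.1, -0.2, 0.05, 0.0],
        "prior_ensemble_spread": [0.8, 1.2, 0.6, 0.9],
        "obs_err_var": [0.5, 0.5, 0.4, 0.4],
    }
)

agg = (
    df.groupby(["midpoint", "type"], observed=False)
    .agg(
        {
            "prior_sq_err": mean_then_sqrt,  # renamed to prior_rmse below
            "prior_bias": "mean",
            "prior_ensemble_spread": mean_then_sqrt,
            "obs_err_var": mean_then_sqrt,
        }
    )
    .reset_index()
)
agg["prior_totalspread"] = np.sqrt(agg["prior_ensemble_spread"] + agg["obs_err_var"])
agg = agg.rename(columns={"prior_sq_err": "prior_rmse"})
print(agg)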