pydartdiags 0.0.43__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pydartdiags might be problematic. Click here for more details.

@@ -0,0 +1,323 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ import pandas as pd
3
+ import numpy as np
4
+ from functools import wraps
5
+
6
+ # from pydartdiags.obs_sequence import obs_sequence as obsq
7
+
8
+
9
def apply_to_phases_in_place(func):
    """
    Decorator that runs ``func`` once per assimilation phase, mutating ``df``.

    For each of the 'prior' and 'posterior' phases, the wrapped function is
    invoked as ``func(df, phase, ...)`` — but only when the DataFrame carries
    a ``"{phase}_ensemble_spread"`` column, so prior-only data is handled
    transparently. The (modified) DataFrame is returned for chaining.
    """

    @wraps(func)
    def wrapper(df, *args, **kwargs):
        # Apply only to phases whose spread column is actually present.
        present = [p for p in ("prior", "posterior") if f"{p}_ensemble_spread" in df.columns]
        for phase in present:
            func(df, phase, *args, **kwargs)
        return df

    return wrapper
25
+
26
+
27
def apply_to_phases_by_type_return_df(func):
    """
    Decorator to apply a function to both 'prior' and 'posterior' phases and
    return a new DataFrame.

    The decorated function must accept ``phase`` as its second argument and
    return a DataFrame. When both phases are present in ``df`` (detected via
    the ``"{phase}_ensemble_mean"`` columns), the two per-phase results are
    merged — on ['midpoint', 'vlevels', 'type', 'vert_unit'] for layered
    results (those with a 'midpoint' column), otherwise on 'type'.

    Raises:
        ValueError: if ``df`` has neither 'prior_ensemble_mean' nor
            'posterior_ensemble_mean' columns. (Previously this path raised
            an opaque UnboundLocalError because ``result`` was referenced
            after a loop that never ran.)
    """

    @wraps(func)
    def wrapper(df, *args, **kwargs):
        results = []
        for phase in ["prior", "posterior"]:
            if f"{phase}_ensemble_mean" in df.columns:
                results.append(func(df, phase, *args, **kwargs))

        if not results:
            raise ValueError(
                "DataFrame has neither 'prior_ensemble_mean' nor "
                "'posterior_ensemble_mean' columns"
            )

        # Layered results carry a 'midpoint' column and merge on the full
        # layer key; otherwise merge on observation type alone.
        if "midpoint" in results[0].columns:
            if len(results) == 2:
                return pd.merge(
                    results[0],
                    results[1],
                    on=["midpoint", "vlevels", "type", "vert_unit"],
                )
            return results[0]

        if len(results) == 2:
            return pd.merge(results[0], results[1], on="type")
        return results[0]

    return wrapper
58
+
59
+
60
def apply_to_phases_by_obs(func):
    """
    Decorator that evaluates ``func`` per observation for each available phase.

    The wrapped function is always called for the 'prior' phase. When the
    DataFrame also carries a 'posterior_ensemble_mean' column, it is called a
    second time for 'posterior' and the resulting 'posterior_rank' column is
    appended to the prior-phase result, which is returned.
    """

    @wraps(func)
    def wrapper(df, *args, **kwargs):
        combined = func(df, "prior", *args, **kwargs)
        if "posterior_ensemble_mean" in df.columns:
            posterior = func(df, "posterior", *args, **kwargs)
            combined["posterior_rank"] = posterior["posterior_rank"]
        return combined

    return wrapper
78
+
79
+
80
@apply_to_phases_by_obs
def calculate_rank(df, phase):
    """
    Calculate the rank of observations within a perturbed ensemble.

    Gaussian sampling noise (zero mean, standard deviation equal to the
    observation error standard deviation) is added to each row's ensemble
    members, the perturbed members are sorted, and the rank is the 1-based
    position the observed value would take among them. An observation larger
    than every perturbed member gets rank ``ensemble size + 1``.

    NOTE: noise is drawn from NumPy's global RNG, so results differ between
    calls unless the caller seeds it.

    Parameters:
        df (pd.DataFrame): must contain '{phase}_ensemble_member*' columns,
            plus 'observation', 'obs_err_var', and 'type'.
        phase (str): the phase to rank ('prior' or 'posterior').

    Returns:
        pd.DataFrame: columns 'type' and '{phase}_rank'.
    """
    members = df.filter(regex=f"{phase}_ensemble_member").to_numpy().copy()
    noise_sd = np.sqrt(df["obs_err_var"]).to_numpy()
    observed = df["observation"].to_numpy()
    n_members = members.shape[1]

    ranks = np.zeros(observed.shape[0], dtype=int)
    for row in range(members.shape[0]):
        members[row] += np.random.normal(0.0, noise_sd[row], n_members)
        members[row].sort()
        # 1-based insertion index of the observation in the sorted, perturbed
        # ensemble; yields n_members + 1 when it exceeds every member.
        ranks[row] = np.searchsorted(members[row], observed[row], side="left") + 1

    return pd.DataFrame({"type": df["type"].to_numpy(), f"{phase}_rank": ranks})
124
+
125
+
126
def mean_then_sqrt(x):
    """
    Return the square root of the mean of *x*.

    Used as an aggregation function so that e.g. mean square error becomes
    RMSE and mean total variance becomes total spread.

    Parameters:
        x (array-like): numeric values (list, ndarray, or pandas Series).

    Returns:
        float: sqrt(mean(x)).

    Raises:
        TypeError: if *x* is not array-like with numeric elements.
        ValueError: if *x* is empty.
    """
    average = np.mean(x)
    return np.sqrt(average)
143
+
144
+
145
@apply_to_phases_in_place
def diag_stats(df, phase):
    """
    Calculate diagnostic statistics for a given phase and add them to the DataFrame.

    Args:
        df (pandas.DataFrame): The input DataFrame. It must include:
            - 'observation': the actual observation values.
            - 'obs_err_var': the variance of the observation error.
            - '{phase}_ensemble_mean': the ensemble mean.
            - '{phase}_ensemble_spread': the ensemble spread (standard
              deviation of the ensemble).

        phase (str): The phase for which to calculate the statistics
            ('prior' or 'posterior').

    Returns:
        None: The DataFrame is modified in place; these columns are added:
            - '{phase}_sq_err': squared error of the ensemble mean.
            - '{phase}_bias': ensemble mean minus observation.
            - '{phase}_totalvar': observation error variance plus ensemble
              variance (spread squared).
    """
    # NOTE: flips pandas copy-on-write on globally for the whole process.
    pd.options.mode.copy_on_write = True

    observation = df["observation"]
    ens_mean = df[f"{phase}_ensemble_mean"]
    ens_spread = df[f"{phase}_ensemble_spread"]

    # Bias and squared error share the same residual; column insertion order
    # (sq_err, bias, totalvar) matches the original layout.
    residual = ens_mean - observation
    df[f"{phase}_sq_err"] = residual**2
    df[f"{phase}_bias"] = residual
    df[f"{phase}_totalvar"] = df["obs_err_var"] + ens_spread**2
184
+
185
+
186
def bin_by_layer(df, levels, verticalUnit="pressure (Pa)"):
    """
    Bin observations by vertical layers and add 'vlevels' and 'midpoint' columns to the DataFrame.

    This function bins the observations in the DataFrame based on the specified vertical levels and adds two new columns:
    'vlevels', which represents the categorized vertical levels, and 'midpoint', which represents the midpoint of each
    vertical level bin. Only observations (rows) with the specified vertical unit are binned; other rows get NaN.

    Args:
        df (pandas.DataFrame): The input DataFrame containing observation data. The DataFrame must include the following columns:
            - 'vertical': The vertical coordinate values of the observations.
            - 'vert_unit': The unit of the vertical coordinate values.
        levels (list): A list of bin edges for the vertical levels.
        verticalUnit (str, optional): The unit of the vertical axis (e.g., 'pressure (Pa)'). Default is 'pressure (Pa)'.

    Returns:
        pandas.DataFrame: The input DataFrame with additional columns for the binned vertical levels and their midpoints:
            - 'vlevels': The categorized vertical levels (pd.cut intervals).
            - 'midpoint': The midpoint of each vertical level bin (interval .mid).

    Notes:
        - The function modifies the input DataFrame in place.
        - 'midpoint' is the interval midpoint in the input units; it is NOT halved or converted.
    """
    # NOTE: flips pandas copy-on-write on globally for the whole process.
    pd.options.mode.copy_on_write = True
    # Bin only the rows whose vertical unit matches; others are left as NaN.
    df.loc[df["vert_unit"] == verticalUnit, "vlevels"] = pd.cut(
        df.loc[df["vert_unit"] == verticalUnit, "vertical"], levels
    )
    if verticalUnit == "pressure (Pa)":
        df.loc[:, "midpoint"] = df["vlevels"].apply(
            lambda x: x.mid
        )  # HK todo units HPa - change now or in plotting?
        # NOTE(review): this apply is an element-wise identity; presumably a
        # placeholder for the pending Pa->hPa conversion. It likely also casts
        # the column from categorical to object dtype — TODO confirm before
        # removing it.
        df.loc[:, "vlevels"] = df["vlevels"].apply(
            lambda x: x
        )  # HK todo units HPa - change now or in plotting?
    else:
        df.loc[:, "midpoint"] = df["vlevels"].apply(lambda x: x.mid)
223
+
224
+
225
@apply_to_phases_by_type_return_df
def grand_statistics(df, phase):
    """
    Aggregate diagnostic statistics over all observations, grouped by type.

    Assumes ``diag_stats`` has already added the '{phase}_sq_err',
    '{phase}_bias' and '{phase}_totalvar' columns. RMSE and total spread are
    root-mean aggregates; bias is a plain mean.

    Returns:
        pd.DataFrame: one row per type with '{phase}_rmse', '{phase}_bias',
        and '{phase}_totalspread' columns.
    """
    aggregations = {
        f"{phase}_sq_err": mean_then_sqrt,
        f"{phase}_bias": "mean",
        f"{phase}_totalvar": mean_then_sqrt,
    }
    grand = df.groupby(["type"], observed=False).agg(aggregations).reset_index()

    # sqrt(mean(sq_err)) is RMSE; sqrt(mean(totalvar)) is total spread.
    return grand.rename(
        columns={
            f"{phase}_sq_err": f"{phase}_rmse",
            f"{phase}_totalvar": f"{phase}_totalspread",
        }
    )
245
+
246
+
247
@apply_to_phases_by_type_return_df
def layer_statistics(df, phase):
    """
    Aggregate diagnostic statistics by vertical layer and observation type.

    Assumes ``diag_stats`` has added the '{phase}_sq_err', '{phase}_bias' and
    '{phase}_totalvar' columns, and ``bin_by_layer`` has added 'midpoint' and
    'vlevels'. RMSE and total spread are root-mean aggregates; bias is a
    plain mean; 'vert_unit'/'vlevels' are carried through via 'first'.

    Returns:
        pd.DataFrame: one row per (midpoint, type) with '{phase}_rmse',
        '{phase}_bias', '{phase}_totalspread', 'vert_unit', and 'vlevels'.
    """
    aggregations = {
        f"{phase}_sq_err": mean_then_sqrt,
        f"{phase}_bias": "mean",
        f"{phase}_totalvar": mean_then_sqrt,
        "vert_unit": "first",
        "vlevels": "first",
    }
    layer_stats = (
        df.groupby(["midpoint", "type"], observed=False)
        .agg(aggregations)
        .reset_index()
    )

    # sqrt(mean(sq_err)) is RMSE; sqrt(mean(totalvar)) is total spread.
    return layer_stats.rename(
        columns={
            f"{phase}_sq_err": f"{phase}_rmse",
            f"{phase}_totalvar": f"{phase}_totalspread",
        }
    )
271
+
272
+
273
def possible_vs_used(df):
    """
    Calculates the count of possible vs. used observations by type.

    'possible' is the total number of observations of each type; 'used' is
    that total minus the observations that failed quality control (as
    selected by ``select_failed_qcs``). Types with no QC failures count all
    observations as used.

    Returns:
        pd.DataFrame: columns 'type', 'possible', and 'used'.
    """
    possible = df.groupby("type")["observation"].count().rename("possible")

    failed = select_failed_qcs(df).groupby("type")["observation"].count()
    # Reindex so types with zero failures subtract 0 rather than NaN.
    used = (possible - failed.reindex(possible.index, fill_value=0)).rename("used")

    return pd.concat([possible, used], axis=1).reset_index()
296
+
297
+
298
def possible_vs_used_by_layer(df):
    """
    Calculates the count of possible vs. used observations by type and vertical level.

    'possible' counts every observation in each (type, midpoint) group;
    'used' subtracts those that failed quality control (per
    ``select_failed_qcs``). Groups with no failures subtract 0.

    Returns:
        pd.DataFrame: columns 'type', 'midpoint', 'possible', and 'used'.
    """
    group_keys = ["type", "midpoint"]
    possible = (
        df.groupby(group_keys, observed=False)["type"].count().rename("possible")
    )

    failed = select_failed_qcs(df).groupby(group_keys, observed=False)["type"].count()
    # Reindex so groups with zero failures subtract 0 rather than NaN.
    used = (possible - failed.reindex(possible.index, fill_value=0)).rename("used")

    return pd.concat([possible, used], axis=1).reset_index()
314
+
315
+
316
def select_failed_qcs(df):
    """
    Return the rows whose DART quality control flag indicates failure.

    A row fails when its 'DART_quality_control' value is greater than 0
    (0 means the observation was assimilated cleanly).

    Returns:
        pandas.DataFrame: the subset of *df* with a positive QC flag.
    """
    failed_mask = df["DART_quality_control"] > 0
    return df[failed_mask]
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: pydartdiags
3
- Version: 0.0.43
3
+ Version: 0.5.0
4
4
  Summary: Observation Sequence Diagnostics for DART
5
5
  Home-page: https://github.com/NCAR/pyDARTdiags.git
6
6
  Author: Helen Kershaw
@@ -18,22 +18,26 @@ Requires-Dist: pandas>=2.2.0
18
18
  Requires-Dist: numpy>=1.26
19
19
  Requires-Dist: plotly>=5.22.0
20
20
  Requires-Dist: pyyaml>=6.0.2
21
+ Requires-Dist: matplotlib>=3.9.4
22
+ Dynamic: author
23
+ Dynamic: home-page
24
+ Dynamic: requires-python
21
25
 
22
26
  [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
23
27
  [![codecov](https://codecov.io/gh/NCAR/pyDARTdiags/graph/badge.svg?token=VK55SQZSVD)](https://codecov.io/gh/NCAR/pyDARTdiags)
24
28
  [![PyPI version](https://badge.fury.io/py/pydartdiags.svg)](https://pypi.org/project/pydartdiags/)
25
-
29
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
26
30
 
27
31
  # pyDARTdiags
28
32
 
29
- pyDARTdiags is a Python library for obsevation space diagnostics for the Data Assimilation Research Testbed ([DART](https://github.com/NCAR/DART)).
33
+ pyDARTdiags is a Python library for observation space diagnostics for the Data Assimilation Research Testbed ([DART](https://github.com/NCAR/DART)).
30
34
 
31
35
  pyDARTdiags is under initial development, so please use caution.
32
36
  The MATLAB [observation space diagnostics](https://docs.dart.ucar.edu/en/latest/guide/matlab-observation-space.html) are available through [DART](https://github.com/NCAR/DART).
33
37
 
34
38
 
35
39
  pyDARTdiags can be installed through pip: https://pypi.org/project/pydartdiags/
36
- Documenation : https://ncar.github.io/pyDARTdiags/
40
+ Documentation : https://ncar.github.io/pyDARTdiags/
37
41
 
38
42
  ## Contributing
39
43
  Contributions are welcome! If you have a feature request, bug report, or a suggestion, please open an issue on our GitHub repository.
@@ -0,0 +1,14 @@
1
+ pydartdiags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ pydartdiags/matplots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ pydartdiags/matplots/matplots.py,sha256=44MlD98gaQsrCT0mW6M9f0a2-clm3KEGrdYqkTUO0RI,7478
4
+ pydartdiags/obs_sequence/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ pydartdiags/obs_sequence/obs_sequence.py,sha256=kdPOWAqgiyuv6cTdhYx1u9Ru6zCKF0Wd--7-sM3m5F8,44527
6
+ pydartdiags/plots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ pydartdiags/plots/plots.py,sha256=U7WQjE_qN-5a8-85D-PkkgILSFBzTJQ1mcGBa7l5DHI,6464
8
+ pydartdiags/stats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
+ pydartdiags/stats/stats.py,sha256=tzjE6HBrw6s9Li0UlJ_sNMcGEU8loT_BA5SDZp-UTOc,12138
10
+ pydartdiags-0.5.0.dist-info/LICENSE,sha256=ROglds_Eg_ylXp-1MHmEawDqMw_UsCB4r9sk7z9PU9M,11377
11
+ pydartdiags-0.5.0.dist-info/METADATA,sha256=F6znTR7qrj2qoGBYNojmWiaOqa9EAETgphV7i0HW0xc,2391
12
+ pydartdiags-0.5.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
13
+ pydartdiags-0.5.0.dist-info/top_level.txt,sha256=LfMoPLnSd0VhhlWev1eeX9t6AzvyASOloag0LO_ppWg,12
14
+ pydartdiags-0.5.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.7.0)
2
+ Generator: setuptools (75.8.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,10 +0,0 @@
1
- pydartdiags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- pydartdiags/obs_sequence/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- pydartdiags/obs_sequence/obs_sequence.py,sha256=2pddiJ6VRFkaDizYq8HvGUpC4rw7TTV14XjmemjqCNg,34187
4
- pydartdiags/plots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- pydartdiags/plots/plots.py,sha256=UecLgWauO9L_EaGhEVxW3IuKcSU95uRA2mptsxh4-0E,13901
6
- pydartdiags-0.0.43.dist-info/LICENSE,sha256=ROglds_Eg_ylXp-1MHmEawDqMw_UsCB4r9sk7z9PU9M,11377
7
- pydartdiags-0.0.43.dist-info/METADATA,sha256=udwmddMTrqFpyj0tjOffWVf2xbTI_3IwQCS4ZVvnnuU,2185
8
- pydartdiags-0.0.43.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
9
- pydartdiags-0.0.43.dist-info/top_level.txt,sha256=LfMoPLnSd0VhhlWev1eeX9t6AzvyASOloag0LO_ppWg,12
10
- pydartdiags-0.0.43.dist-info/RECORD,,