pydartdiags-0.5.0-py3-none-any.whl → pydartdiags-0.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


--- a/pydartdiags/stats/stats.py
+++ b/pydartdiags/stats/stats.py
@@ -2,8 +2,7 @@
 import pandas as pd
 import numpy as np
 from functools import wraps
-
-# from pydartdiags.obs_sequence import obs_sequence as obsq
+from datetime import datetime, timedelta
 
 
 def apply_to_phases_in_place(func):
@@ -39,20 +38,30 @@ def apply_to_phases_by_type_return_df(func):
             result = func(df, phase, *args, **kwargs)
             results.append(result)
 
-        if "midpoint" in result.columns:
-            if len(results) == 2:
-                return pd.merge(
-                    results[0],
-                    results[1],
-                    on=["midpoint", "vlevels", "type", "vert_unit"],
-                )
-            else:
-                return results[0]
+        if not results:
+            return (
+                pd.DataFrame()
+            )  # Return an empty DataFrame if no results are generated
+
+        # Dynamically determine merge keys based on common columns
+        common_columns = set(results[0].columns)
+        for result in results[1:]:
+            common_columns &= set(result.columns)
+
+        # Exclude phase-specific columns from the merge keys
+        phase_specific_columns = {
+            f"{phase}_sq_err",
+            f"{phase}_bias",
+            f"{phase}_totalvar",
+            f"{phase}_rmse",
+            f"{phase}_totalspread",
+        }
+        merge_keys = list(common_columns - phase_specific_columns)
+
+        if len(results) == 2:
+            return pd.merge(results[0], results[1], on=merge_keys)
         else:
-            if len(results) == 2:
-                return pd.merge(results[0], results[1], on="type")
-            else:
-                return results[0]
+            return results[0]
 
     return wrapper
 
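Taken on its own, the new merge-key logic is easy to verify; below is a minimal standalone sketch of the same technique, with toy frames and column names that are not from the package:

```python
import pandas as pd

# Two per-phase result frames that share the non-statistic columns.
prior = pd.DataFrame({"type": ["RADIOSONDE_TEMPERATURE"], "prior_rmse": [1.2]})
posterior = pd.DataFrame({"type": ["RADIOSONDE_TEMPERATURE"], "posterior_rmse": [0.9]})
results = [prior, posterior]

# Intersect the column sets to find candidate join columns.
common_columns = set(results[0].columns)
for result in results[1:]:
    common_columns &= set(result.columns)

# Strip phase-specific statistic columns; whatever remains are the keys.
stats = ("sq_err", "bias", "totalvar", "rmse", "totalspread")
phase_specific = {f"{p}_{s}" for p in ("prior", "posterior") for s in stats}
merge_keys = list(common_columns - phase_specific)  # here: ["type"]

print(pd.merge(results[0], results[1], on=merge_keys))
```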
@@ -82,6 +91,12 @@ def calculate_rank(df, phase):
     """
     Calculate the rank of observations within an ensemble.
 
+    Note:
+
+        This function is decorated with @apply_to_phases_by_obs, which modifies its usage.
+        You should call it as calculate_rank(df), and the decorator will automatically apply the
+        function to all relevant phases ('prior' and 'posterior').
+
     This function takes a DataFrame containing ensemble predictions and observed values,
     adds sampling noise to the ensemble predictions, and calculates the rank of the observed
     value within the perturbed ensemble for each observation. The rank indicates the position
@@ -92,8 +107,6 @@ def calculate_rank(df, phase):
     Parameters:
         df (pd.DataFrame): A DataFrame with columns for rank and observation type.
 
-        phase (str): The phase for which to calculate the statistics ('prior' or 'posterior')
-
     Returns:
         DataFrame containing columns for 'rank' and observation 'type'.
     """
@@ -147,15 +160,20 @@ def diag_stats(df, phase):
     """
     Calculate diagnostic statistics for a given phase and add them to the DataFrame.
 
+    Note:
+        This function is decorated with @apply_to_phases_in_place, which modifies its usage.
+        You should call it as diag_stats(df), and the decorator will automatically apply the
+        function to all relevant phases ('prior' and 'posterior'), modifying the DataFrame
+        in place.
+
     Args:
         df (pandas.DataFrame): The input DataFrame containing observation data and ensemble statistics.
-            The DataFrame must include the following columns:
-            - 'observation': The actual observation values.
-            - 'obs_err_var': The variance of the observation error.
-            - 'prior_ensemble_mean' and/or 'posterior_ensemble_mean': The mean of the ensemble.
-            - 'prior_ensemble_spread' and/or 'posterior_ensemble_spread': The spread of the ensemble.
+            The DataFrame must include the following columns:
 
-        phase (str): The phase for which to calculate the statistics ('prior' or 'posterior')
+            - 'observation': The actual observation values.
+            - 'obs_err_var': The variance of the observation error.
+            - 'prior_ensemble_mean' and/or 'posterior_ensemble_mean': The mean of the ensemble.
+            - 'prior_ensemble_spread' and/or 'posterior_ensemble_spread': The spread of the ensemble.
 
     Returns:
         None: The function modifies the DataFrame in place by adding the following columns:
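The calling convention in the Note is worth seeing in reduced form. This sketch re-implements the decorator contract with a hypothetical toy_diag_stats and assumed formulas; the package's own diag_stats computes more than this:

```python
import pandas as pd
from functools import wraps

def apply_to_phases_in_place(func):
    # Sketch of the decorator contract described in the Note: run func once
    # per phase whose ensemble-mean column is present in the DataFrame.
    @wraps(func)
    def wrapper(df, *args, **kwargs):
        for phase in ("prior", "posterior"):
            if f"{phase}_ensemble_mean" in df.columns:
                func(df, phase, *args, **kwargs)
    return wrapper

@apply_to_phases_in_place
def toy_diag_stats(df, phase):
    # Hypothetical formulas for illustration only.
    df[f"{phase}_bias"] = df[f"{phase}_ensemble_mean"] - df["observation"]
    df[f"{phase}_sq_err"] = df[f"{phase}_bias"] ** 2
    df[f"{phase}_totalvar"] = df["obs_err_var"] + df[f"{phase}_ensemble_spread"] ** 2

df = pd.DataFrame(
    {
        "observation": [290.5],
        "obs_err_var": [0.25],
        "prior_ensemble_mean": [290.0],
        "prior_ensemble_spread": [1.0],
    }
)
toy_diag_stats(df)  # no phase argument; only 'prior' columns exist, so one pass
print(sorted(df.columns))
```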
@@ -192,9 +210,12 @@ def bin_by_layer(df, levels, verticalUnit="pressure (Pa)"):
     vertical level bin. Only observations (rows) with the specified vertical unit are binned.
 
     Args:
-        df (pandas.DataFrame): The input DataFrame containing observation data. The DataFrame must include the following columns:
+        df (pandas.DataFrame): The input DataFrame containing observation data.
+            The DataFrame must include the following columns:
+
             - 'vertical': The vertical coordinate values of the observations.
             - 'vert_unit': The unit of the vertical coordinate values.
+
         levels (list): A list of bin edges for the vertical levels.
         verticalUnit (str, optional): The unit of the vertical axis (e.g., 'pressure (Pa)'). Default is 'pressure (Pa)'.
 
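The documented columns drive a pd.cut binning like the one below, a toy sketch with invented pressure values (note that pd.cut requires monotonically increasing bin edges):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "vertical": [95000.0, 70000.0, 30000.0],  # Pa
        "vert_unit": ["pressure (Pa)"] * 3,
    }
)
levels = [10000, 50000, 80000, 100000]  # bin edges must increase for pd.cut

# Bin only the rows whose vertical unit matches, then take interval midpoints.
mask = df["vert_unit"] == "pressure (Pa)"
df.loc[mask, "vlevels"] = pd.cut(df.loc[mask, "vertical"], levels)
df["midpoint"] = df["vlevels"].apply(lambda x: x.mid)
print(df[["vertical", "vlevels", "midpoint"]])
```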
@@ -211,19 +232,67 @@ def bin_by_layer(df, levels, verticalUnit="pressure (Pa)"):
     df.loc[df["vert_unit"] == verticalUnit, "vlevels"] = pd.cut(
         df.loc[df["vert_unit"] == verticalUnit, "vertical"], levels
     )
-    if verticalUnit == "pressure (Pa)":
-        df.loc[:, "midpoint"] = df["vlevels"].apply(
-            lambda x: x.mid
-        )  # HK todo units HPa - change now or in plotting?
-        df.loc[:, "vlevels"] = df["vlevels"].apply(
-            lambda x: x
-        )  # HK todo units HPa - change now or in plotting?
-    else:
-        df.loc[:, "midpoint"] = df["vlevels"].apply(lambda x: x.mid)
+    df.loc[:, "midpoint"] = df["vlevels"].apply(lambda x: x.mid)
+
+
+def bin_by_time(df, time_value):
+    """
+    Bin observations by time and add 'time_bin' and 'time_bin_midpoint' columns to the DataFrame.
+    The first bin starts 1 second before the minimum time value, so the minimum time is included in the
+    first bin. The last bin is inclusive of the maximum time value.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame containing a 'time' column.
+        time_value (str): The width of each time bin (e.g., '3600S' for 1 hour).
+
+    Returns:
+        None: The function modifies the DataFrame in place by adding 'time_bin' and 'time_bin_midpoint' columns.
+    """
+    # Create time bins
+    start = df["time"].min() - timedelta(seconds=1)
+    end = df["time"].max()
+    # Determine if the end time aligns with the bin boundary
+    time_delta = pd.Timedelta(time_value)
+    aligned_end = (pd.Timestamp(end) + time_delta).floor(time_value)
+
+    time_bins = pd.date_range(
+        start=start,
+        end=aligned_end,
+        freq=time_value,
+    )
+
+    df["time_bin"] = pd.cut(df["time"], bins=time_bins)
+
+    # Calculate the midpoint of each time bin
+    df["time_bin_midpoint"] = df["time_bin"].apply(
+        lambda x: x.left + (x.right - x.left) / 2 if pd.notnull(x) else None
+    )
 
 
 @apply_to_phases_by_type_return_df
 def grand_statistics(df, phase):
+    """
+    Calculate grand statistics (RMSE, bias, total spread) for each observation type and phase.
+
+    This function assumes that diagnostic statistics (such as squared error, bias, and total variance)
+    have already been computed by :func:`diag_stats` and are present in the DataFrame. It groups the data by observation
+    type and computes the root mean square error (RMSE), mean bias, and total spread for the specified phase.
+
+    Note:
+        This function is decorated with @apply_to_phases_by_type_return_df, which modifies its usage.
+        You should call it as grand_statistics(df), and the decorator will automatically apply the function
+        to all relevant phases ('prior' and 'posterior') and return a merged DataFrame.
+
+    Args:
+        df (pandas.DataFrame): The input DataFrame containing diagnostic statistics for observations.
+
+    Returns:
+        pandas.DataFrame: A DataFrame with columns:
+            - 'type': The observation type.
+            - '{phase}_rmse': The root mean square error for the phase.
+            - '{phase}_bias': The mean bias for the phase.
+            - '{phase}_totalspread': The total spread for the phase.
+    """
 
     # assuming diag_stats has been called
     grand = (
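The bin_by_time recipe above can be exercised on a toy 'time' column; the times below are chosen so every observation lands in a bin, and the bin width is spelled '3600s' (lowercase) to avoid the deprecated 'S' alias in newer pandas:

```python
import pandas as pd
from datetime import timedelta

df = pd.DataFrame(
    {"time": pd.to_datetime(["2024-01-01 00:00", "2024-01-01 00:40", "2024-01-01 01:20"])}
)
time_value = "3600s"  # one-hour bins

# Same recipe as bin_by_time: pad the start by one second so the minimum
# time falls inside the first bin, then extend the end to a bin boundary.
start = df["time"].min() - timedelta(seconds=1)
aligned_end = (pd.Timestamp(df["time"].max()) + pd.Timedelta(time_value)).floor(time_value)

time_bins = pd.date_range(start=start, end=aligned_end, freq=time_value)
df["time_bin"] = pd.cut(df["time"], bins=time_bins)
df["time_bin_midpoint"] = df["time_bin"].apply(
    lambda x: x.left + (x.right - x.left) / 2 if pd.notnull(x) else None
)
print(df[["time", "time_bin_midpoint"]])
```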
@@ -246,6 +315,33 @@ def grand_statistics(df, phase):
 
 @apply_to_phases_by_type_return_df
 def layer_statistics(df, phase):
+    """
+    Calculate statistics (RMSE, bias, total spread) for each observation type and vertical layer.
+
+    This function assumes that diagnostic statistics (such as squared error, bias, and total variance)
+    have already been computed with :func:`diag_stats` and are present in the DataFrame. It groups the data by
+    vertical layer midpoint and observation type, and computes the root mean square error (RMSE),
+    mean bias, and total spread for the specified phase for each vertical layer.
+
+    Note:
+        This function is decorated with @apply_to_phases_by_type_return_df, which modifies its usage.
+        You should call it as layer_statistics(df), and the decorator will automatically apply the function
+        to all relevant phases ('prior' and 'posterior') and return a merged DataFrame.
+
+    Args:
+        df (pandas.DataFrame): The input DataFrame containing diagnostic statistics for observations.
+        phase (str): The phase for which to calculate the statistics ('prior' or 'posterior').
+
+    Returns:
+        pandas.DataFrame: A DataFrame with columns:
+            - 'midpoint': The midpoint of the vertical layer.
+            - 'type': The observation type.
+            - '{phase}_rmse': The root mean square error for the phase.
+            - '{phase}_bias': The mean bias for the phase.
+            - '{phase}_totalspread': The total spread for the phase.
+            - 'vert_unit': The vertical unit.
+            - 'vlevels': The categorized vertical level.
+    """
 
     # assuming diag_stats has been called
     layer_stats = (
@@ -270,13 +366,65 @@ def layer_statistics(df, phase):
     return layer_stats
 
 
+@apply_to_phases_by_type_return_df
+def time_statistics(df, phase):
+    """
+    Calculate time-based statistics (RMSE, bias, total spread) for each observation type and time bin.
+
+    This function assumes that diagnostic statistics (such as squared error, bias, and total variance)
+    have already been computed by :func:`diag_stats` and are present in the DataFrame. It groups the data
+    by time bin midpoint and observation type, and computes the root mean square error (RMSE), mean bias,
+    and total spread for the specified phase for each time bin.
+
+    Note:
+        This function is decorated with @apply_to_phases_by_type_return_df.
+        You should call it as time_statistics(df), and the decorator will automatically apply the function
+        to all relevant phases ('prior' and 'posterior') and return a merged DataFrame.
+
+    Args:
+        df (pandas.DataFrame): The input DataFrame containing diagnostic statistics for observations.
+        phase (str): The phase for which to calculate the statistics ('prior' or 'posterior').
+
+    Returns:
+        pandas.DataFrame: A DataFrame with columns:
+            - 'time_bin_midpoint': The midpoint of the time bin.
+            - 'type': The observation type.
+            - '{phase}_rmse': The root mean square error for the phase.
+            - '{phase}_bias': The mean bias for the phase.
+            - '{phase}_totalspread': The total spread for the phase.
+            - 'time_bin': The time bin interval.
+            - 'time': The first time value in the bin.
+    """
+    # Assuming diag_stats has been called
+    time_stats = (
+        df.groupby(["time_bin_midpoint", "type"], observed=False)
+        .agg(
+            {
+                f"{phase}_sq_err": mean_then_sqrt,
+                f"{phase}_bias": "mean",
+                f"{phase}_totalvar": mean_then_sqrt,
+                "time_bin": "first",
+                "time": "first",
+            }
+        )
+        .reset_index()
+    )
+
+    time_stats.rename(columns={f"{phase}_sq_err": f"{phase}_rmse"}, inplace=True)
+    time_stats.rename(
+        columns={f"{phase}_totalvar": f"{phase}_totalspread"}, inplace=True
+    )
+
+    return time_stats
+
+
 def possible_vs_used(df):
     """
     Calculates the count of possible vs. used observations by type.
 
     This function takes a DataFrame containing observation data, including a 'type' column for the observation
     type and an 'observation' column. The number of used observations ('used') is the total number
-    minus the observations that failed quality control checks (as determined by the `select_failed_qcs` function).
+    of assimilated observations (as determined by the `select_used_qcs` function).
     The result is a DataFrame with each observation type, the count of possible observations, and the count of
     used observations.
 
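time_statistics (like grand_statistics and layer_statistics) leans on a helper named mean_then_sqrt that is defined elsewhere in the package. A plausible stand-in makes the aggregation concrete; the frame and values below are invented:

```python
import numpy as np
import pandas as pd

def mean_then_sqrt(x):
    # Stand-in for the package helper of the same name: with per-row squared
    # errors this yields RMSE; with total variance it yields total spread.
    return np.sqrt(np.mean(x))

df = pd.DataFrame(
    {
        "time_bin_midpoint": ["00:30", "00:30", "01:30"],
        "type": ["RADIOSONDE_TEMPERATURE"] * 3,
        "prior_sq_err": [1.0, 4.0, 9.0],
        "prior_bias": [-1.0, 2.0, 3.0],
        "prior_totalvar": [2.0, 2.0, 2.0],
    }
)

time_stats = (
    df.groupby(["time_bin_midpoint", "type"], observed=False)
    .agg(
        {
            "prior_sq_err": mean_then_sqrt,
            "prior_bias": "mean",
            "prior_totalvar": mean_then_sqrt,
        }
    )
    .reset_index()
    .rename(columns={"prior_sq_err": "prior_rmse", "prior_totalvar": "prior_totalspread"})
)
print(time_stats)  # RMSE for the first bin: sqrt((1 + 4) / 2)
```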
@@ -288,8 +436,8 @@ def possible_vs_used(df):
     possible = df.groupby("type")["observation"].count()
     possible.rename("possible", inplace=True)
 
-    failed_qcs = select_failed_qcs(df).groupby("type")["observation"].count()
-    used = possible - failed_qcs.reindex(possible.index, fill_value=0)
+    used_qcs = select_used_qcs(df).groupby("type")["observation"].count()
+    used = used_qcs.reindex(possible.index, fill_value=0)
     used.rename("used", inplace=True)
 
     return pd.concat([possible, used], axis=1).reset_index()
@@ -302,22 +450,61 @@ def possible_vs_used_by_layer(df):
     possible = df.groupby(["type", "midpoint"], observed=False)["type"].count()
     possible.rename("possible", inplace=True)
 
-    failed_qcs = (
-        select_failed_qcs(df)
+    used_qcs = (
+        select_used_qcs(df)
         .groupby(["type", "midpoint"], observed=False)["type"]
         .count()
     )
-    used = possible - failed_qcs.reindex(possible.index, fill_value=0)
+
+    used = used_qcs.reindex(possible.index, fill_value=0)
     used.rename("used", inplace=True)
 
     return pd.concat([possible, used], axis=1).reset_index()
 
 
-def select_failed_qcs(df):
+def select_used_qcs(df):
     """
-    Select rows from the DataFrame where the DART quality control flag is greater than 0.
+    Select rows from the DataFrame where the observation was used.
+    Includes observations for which the posterior forward observation operators failed.
 
     Returns:
-        pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag greater than 0.
+        pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag 0 or 2.
+    """
+    return df[(df["DART_quality_control"] == 0) | (df["DART_quality_control"] == 2)]
+
+
+def possible_vs_used_by_time(df):
     """
-    return df[df["DART_quality_control"] > 0]
+    Calculates the count of possible vs. used observations by type and time bin.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame containing observation data.
+            The DataFrame must include:
+
+            - 'type': The observation type.
+            - 'time_bin_midpoint': The midpoint of the time bin.
+            - 'observation': The observation values.
+            - 'DART_quality_control': The quality control flag.
+
+    Returns:
+        pd.DataFrame: A DataFrame with the following columns:
+            - 'time_bin_midpoint': The midpoint of the time bin.
+            - 'type': The observation type.
+            - 'possible': The count of all observations in the time bin.
+            - 'used': The count of observations in the time bin that passed quality control checks.
+    """
+    # Count all observations (possible) grouped by time_bin_midpoint and type
+    possible = df.groupby(["time_bin_midpoint", "type"], observed=False)["type"].count()
+    possible.rename("possible", inplace=True)
+
+    # Count used observations (QC=0 or QC=2) grouped by time_bin_midpoint and type
+    used_qcs = (
+        select_used_qcs(df)
+        .groupby(["time_bin_midpoint", "type"], observed=False)["type"]
+        .count()
+    )
+    used = used_qcs.reindex(possible.index, fill_value=0)
+    used.rename("used", inplace=True)
+
+    # Combine possible and used into a single DataFrame
+    return pd.concat([possible, used], axis=1).reset_index()
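The counting is now additive rather than subtractive: 'used' counts the rows select_used_qcs keeps (DART QC 0 or 2) instead of subtracting failures from 'possible'. A toy run of the same recipe:

```python
import pandas as pd

def select_used_qcs(df):
    # Mirrors the new selector: QC 0 = assimilated, QC 2 = assimilated but
    # the posterior forward operator failed.
    return df[(df["DART_quality_control"] == 0) | (df["DART_quality_control"] == 2)]

df = pd.DataFrame(
    {
        "type": ["RADIOSONDE_TEMPERATURE"] * 3 + ["ACARS_TEMPERATURE"],
        "observation": [1.0, 2.0, 3.0, 4.0],
        "DART_quality_control": [0, 2, 7, 0],
    }
)

possible = df.groupby("type")["observation"].count().rename("possible")
used = (
    select_used_qcs(df)
    .groupby("type")["observation"]
    .count()
    .reindex(possible.index, fill_value=0)
    .rename("used")
)
print(pd.concat([possible, used], axis=1).reset_index())
# ACARS_TEMPERATURE: 1 possible, 1 used; RADIOSONDE_TEMPERATURE: 3 possible, 2 used
```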
--- a/pydartdiags-0.5.0.dist-info/METADATA
+++ b/pydartdiags-0.6.0.dist-info/METADATA
@@ -1,15 +1,15 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: pydartdiags
-Version: 0.5.0
+Version: 0.6.0
 Summary: Observation Sequence Diagnostics for DART
 Home-page: https://github.com/NCAR/pyDARTdiags.git
 Author: Helen Kershaw
 Author-email: Helen Kershaw <hkershaw@ucar.edu>
+License-Expression: Apache-2.0
 Project-URL: Homepage, https://github.com/NCAR/pyDARTdiags.git
 Project-URL: Issues, https://github.com/NCAR/pyDARTdiags/issues
 Project-URL: Documentation, https://ncar.github.io/pyDARTdiags
 Classifier: Programming Language :: Python :: 3
-Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
@@ -21,6 +21,7 @@ Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: matplotlib>=3.9.4
 Dynamic: author
 Dynamic: home-page
+Dynamic: license-file
 Dynamic: requires-python
 
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
--- /dev/null
+++ b/pydartdiags-0.6.0.dist-info/RECORD
@@ -0,0 +1,15 @@
+pydartdiags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/matplots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/matplots/matplots.py,sha256=Bo0TTz1gvsHEvTfTfLfdTi_3hNRN1okmyY5a5yYgtzk,13455
+pydartdiags/obs_sequence/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/obs_sequence/composite_types.yaml,sha256=PVLMU6x6KcVMCwPB-U65C_e0YQUemfqUhYMpf1DhFOY,917
+pydartdiags/obs_sequence/obs_sequence.py,sha256=5HfqOPoF2DyZQrUiGrYEwLJ9Iewe5DIzq0pdxR3bsnk,48037
+pydartdiags/plots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/plots/plots.py,sha256=U7WQjE_qN-5a8-85D-PkkgILSFBzTJQ1mcGBa7l5DHI,6464
+pydartdiags/stats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pydartdiags/stats/stats.py,sha256=a88VuLoHOlhbjYjnrVPHVNnhiDx-4B3YA1jbc6FUSyU,20193
+pydartdiags-0.6.0.dist-info/licenses/LICENSE,sha256=ROglds_Eg_ylXp-1MHmEawDqMw_UsCB4r9sk7z9PU9M,11377
+pydartdiags-0.6.0.dist-info/METADATA,sha256=ZeVGK6hTX2tgIiedCVcavDPn195yCh8LO9-ziliePog,2381
+pydartdiags-0.6.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+pydartdiags-0.6.0.dist-info/top_level.txt,sha256=LfMoPLnSd0VhhlWev1eeX9t6AzvyASOloag0LO_ppWg,12
+pydartdiags-0.6.0.dist-info/RECORD,,
--- a/pydartdiags-0.5.0.dist-info/WHEEL
+++ b/pydartdiags-0.6.0.dist-info/WHEEL
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.8.0)
+Generator: setuptools (80.9.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
--- a/pydartdiags-0.5.0.dist-info/RECORD
+++ /dev/null
@@ -1,14 +0,0 @@
-pydartdiags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/matplots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/matplots/matplots.py,sha256=44MlD98gaQsrCT0mW6M9f0a2-clm3KEGrdYqkTUO0RI,7478
-pydartdiags/obs_sequence/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/obs_sequence/obs_sequence.py,sha256=kdPOWAqgiyuv6cTdhYx1u9Ru6zCKF0Wd--7-sM3m5F8,44527
-pydartdiags/plots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/plots/plots.py,sha256=U7WQjE_qN-5a8-85D-PkkgILSFBzTJQ1mcGBa7l5DHI,6464
-pydartdiags/stats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydartdiags/stats/stats.py,sha256=tzjE6HBrw6s9Li0UlJ_sNMcGEU8loT_BA5SDZp-UTOc,12138
-pydartdiags-0.5.0.dist-info/LICENSE,sha256=ROglds_Eg_ylXp-1MHmEawDqMw_UsCB4r9sk7z9PU9M,11377
-pydartdiags-0.5.0.dist-info/METADATA,sha256=F6znTR7qrj2qoGBYNojmWiaOqa9EAETgphV7i0HW0xc,2391
-pydartdiags-0.5.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-pydartdiags-0.5.0.dist-info/top_level.txt,sha256=LfMoPLnSd0VhhlWev1eeX9t6AzvyASOloag0LO_ppWg,12
-pydartdiags-0.5.0.dist-info/RECORD,,