disdrodb 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff shows the content changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (62)
  1. disdrodb/__init__.py +4 -0
  2. disdrodb/_version.py +2 -2
  3. disdrodb/api/checks.py +70 -47
  4. disdrodb/api/configs.py +0 -2
  5. disdrodb/api/info.py +3 -3
  6. disdrodb/api/io.py +48 -8
  7. disdrodb/api/path.py +116 -133
  8. disdrodb/api/search.py +12 -3
  9. disdrodb/cli/disdrodb_create_summary.py +103 -0
  10. disdrodb/cli/disdrodb_create_summary_station.py +1 -1
  11. disdrodb/cli/disdrodb_run_l0a_station.py +1 -1
  12. disdrodb/cli/disdrodb_run_l0b_station.py +2 -2
  13. disdrodb/cli/disdrodb_run_l0c_station.py +2 -2
  14. disdrodb/cli/disdrodb_run_l1_station.py +2 -2
  15. disdrodb/cli/disdrodb_run_l2e_station.py +2 -2
  16. disdrodb/cli/disdrodb_run_l2m_station.py +2 -2
  17. disdrodb/data_transfer/download_data.py +123 -7
  18. disdrodb/issue/writer.py +2 -0
  19. disdrodb/l0/l0a_processing.py +10 -5
  20. disdrodb/l0/l0b_nc_processing.py +10 -6
  21. disdrodb/l0/l0b_processing.py +26 -61
  22. disdrodb/l0/l0c_processing.py +369 -251
  23. disdrodb/l0/readers/LPM/ARM/ARM_LPM.py +7 -0
  24. disdrodb/l0/readers/PARSIVEL2/ARM/ARM_PARSIVEL2.py +4 -0
  25. disdrodb/l0/readers/PARSIVEL2/CANADA/UQAM_NC.py +69 -0
  26. disdrodb/l0/readers/PARSIVEL2/MPI/BCO_PARSIVEL2.py +136 -0
  27. disdrodb/l0/readers/PARSIVEL2/MPI/BOWTIE.py +220 -0
  28. disdrodb/l0/readers/PARSIVEL2/NASA/LPVEX.py +109 -0
  29. disdrodb/l0/readers/PARSIVEL2/NETHERLANDS/DELFT_NC.py +3 -0
  30. disdrodb/l1/fall_velocity.py +46 -0
  31. disdrodb/l1/processing.py +1 -1
  32. disdrodb/l2/processing.py +1 -1
  33. disdrodb/metadata/checks.py +132 -125
  34. disdrodb/psd/fitting.py +172 -205
  35. disdrodb/psd/models.py +1 -1
  36. disdrodb/routines/__init__.py +54 -0
  37. disdrodb/{l0/routines.py → routines/l0.py} +288 -418
  38. disdrodb/{l1/routines.py → routines/l1.py} +60 -92
  39. disdrodb/{l2/routines.py → routines/l2.py} +249 -462
  40. disdrodb/{routines.py → routines/wrappers.py} +95 -7
  41. disdrodb/scattering/axis_ratio.py +5 -1
  42. disdrodb/scattering/permittivity.py +18 -0
  43. disdrodb/scattering/routines.py +56 -36
  44. disdrodb/summary/routines.py +110 -34
  45. disdrodb/utils/archiving.py +434 -0
  46. disdrodb/utils/cli.py +5 -5
  47. disdrodb/utils/dask.py +62 -1
  48. disdrodb/utils/decorators.py +31 -0
  49. disdrodb/utils/encoding.py +5 -1
  50. disdrodb/{l2 → utils}/event.py +1 -66
  51. disdrodb/utils/logger.py +1 -1
  52. disdrodb/utils/manipulations.py +22 -12
  53. disdrodb/utils/routines.py +166 -0
  54. disdrodb/utils/time.py +3 -291
  55. disdrodb/utils/xarray.py +3 -0
  56. disdrodb/viz/plots.py +85 -14
  57. {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/METADATA +2 -2
  58. {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/RECORD +62 -54
  59. {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/entry_points.txt +1 -0
  60. {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/WHEEL +0 -0
  61. {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/licenses/LICENSE +0 -0
  62. {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/top_level.txt +0 -0
@@ -21,94 +21,31 @@ import logging
 
  import numpy as np
  import pandas as pd
+ import xarray as xr
 
- from disdrodb.api.checks import check_measurement_intervals
- from disdrodb.api.info import get_start_end_time_from_filepaths
+ from disdrodb.api.io import open_netcdf_files
+ from disdrodb.l0.l0b_processing import set_l0b_encodings
  from disdrodb.l1.resampling import add_sample_interval
- from disdrodb.utils.logger import log_warning # , log_info
- from disdrodb.utils.time import (
- ensure_sorted_by_time,
- regularize_timesteps,
- )
+ from disdrodb.utils.attrs import set_disdrodb_attrs
+ from disdrodb.utils.logger import log_info, log_warning
+ from disdrodb.utils.time import ensure_sorted_by_time
 
  logger = logging.getLogger(__name__)
 
+ # L0C processing requires searching for data (per time blocks) into neighbouring files:
+ # - to account for possible trailing seconds in previous/next files
+ # - to get information if at the edges of the time blocks previous/next timesteps are available
+ # - to shift the time to ensure reported L0C time is the start of the measurement interval
+ TOLERANCE_SECONDS = 60 * 3
 
- TOLERANCE_SECONDS = 120
-
-
- def get_files_per_days(filepaths):
- """
- Organize files by the days they cover based on their start and end times.
-
- Parameters
- ----------
- filepaths : list of str
- List of file paths to be processed.
-
- Returns
- -------
- dict
- Dictionary where keys are days (as strings) and values are lists of file paths
- that cover those days.
-
- Notes
- -----
- This function adds a tolerance of 60 seconds to account for imprecise time logging by the sensors.
- """
- # Retrieve file start_time and end_time
- files_start_time, files_end_time = get_start_end_time_from_filepaths(filepaths)
-
- # Add tolerance to account for imprecise time logging by the sensors
- # - Example: timestep 23:59:30 might be 00.00 and goes into the next day file ...
- files_start_time = files_start_time - np.array(TOLERANCE_SECONDS, dtype="m8[s]")
- files_end_time = files_end_time + np.array(TOLERANCE_SECONDS, dtype="m8[s]")
-
- # Retrieve file start day and end day
- start_day = files_start_time.min().astype("M8[D]")
- end_day = files_end_time.max().astype("M8[D]") + np.array(1, dtype="m8[D]")
-
- # Create an array with all days in time period covered by the files
- list_days = np.asanyarray(pd.date_range(start=start_day, end=end_day, freq="D")).astype("M8[D]")
-
- # Expand dimension to match each day using broadcasting
- files_start_time = files_start_time.astype("M8[D]")[:, np.newaxis] # shape (n_files, 1)
- files_end_time = files_end_time.astype("M8[D]")[:, np.newaxis] # shape (n_files, 1)
-
- # Create an array of all days
- # - Expand dimension to match each day using broadcasting
- days = list_days[np.newaxis, :] # shape (1, n_days)
-
- # Use broadcasting to create a boolean matrix indicating which files cover which days
- mask = (files_start_time <= days) & (files_end_time >= days) # shape (n_files, n_days)
-
- # Build a mapping from days to file indices
- # For each day (column), find the indices of files (rows) that cover that day
- dict_days = {}
- filepaths = np.array(filepaths)
- for i, day in enumerate(list_days):
- file_indices = np.where(mask[:, i])[0]
- if file_indices.size > 0:
- dict_days[str(day)] = filepaths[file_indices].tolist()
-
- return dict_days
-
-
- def retrieve_possible_measurement_intervals(metadata):
- """Retrieve list of possible measurements intervals."""
- measurement_intervals = metadata.get("measurement_interval", [])
- return check_measurement_intervals(measurement_intervals)
+ ####---------------------------------------------------------------------------------
+ #### Measurement intervals
 
 
  def drop_timesteps_with_invalid_sample_interval(ds, measurement_intervals, verbose=True, logger=None):
  """Drop timesteps with unexpected sample intervals."""
- # TODO
- # - correct logged sample_interval for trailing seconds. Example (58,59,61,62) converted to 60 s ?
- # - Need to know more how Parsivel software computes sample_interval variable ...
-
- # Retrieve logged sample_interval
- sample_interval = ds["sample_interval"].compute().data
- timesteps = ds["time"].compute().data
+ sample_interval = ds["sample_interval"].to_numpy()
+ timesteps = ds["time"].to_numpy()
  is_valid_sample_interval = np.isin(sample_interval.data, measurement_intervals)
  indices_invalid_sample_interval = np.where(~is_valid_sample_interval)[0]
  if len(indices_invalid_sample_interval) > 0:
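The removed get_files_per_days helper relied on NumPy broadcasting to map each file's (tolerance-padded) time range onto the days it covers; that grouping role is presumably taken over by the new disdrodb/utils/archiving.py listed above. A condensed, self-contained illustration of the broadcasting idea, using synthetic file times rather than package data:

```python
import numpy as np
import pandas as pd

# Synthetic file time ranges (the second file spans midnight)
files_start = np.array(["2024-06-01T00:00:10", "2024-06-01T23:30:00"], dtype="M8[s]")
files_end = np.array(["2024-06-01T23:29:30", "2024-06-02T01:00:00"], dtype="M8[s]")

# All days covered by the files
days = np.asanyarray(
    pd.date_range(files_start.min().astype("M8[D]"), files_end.max().astype("M8[D]"), freq="D"),
).astype("M8[D]")

# Boolean matrix of shape (n_files, n_days): True where file i covers day j
mask = (files_start.astype("M8[D]")[:, None] <= days) & (files_end.astype("M8[D]")[:, None] >= days)

print({str(day): np.where(mask[:, j])[0].tolist() for j, day in enumerate(days)})
# {'2024-06-01': [0, 1], '2024-06-02': [1]}
```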
@@ -124,10 +61,26 @@ def drop_timesteps_with_invalid_sample_interval(ds, measurement_intervals, verbo
  return ds
 
 
- def split_dataset_by_sampling_intervals(ds, measurement_intervals, min_sample_interval=10, min_block_size=5):
+ def split_dataset_by_sampling_intervals(
+ ds,
+ measurement_intervals,
+ min_sample_interval=10,
+ min_block_size=5,
+ time_is_end_interval=True,
+ ):
  """
  Split a dataset into subsets where each subset has a consistent sampling interval.
 
+ Notes
+ -----
+ - Does not modify timesteps (regularization is left to `regularize_timesteps`).
+ - Assumes no duplicated timesteps in the dataset.
+ - If only one measurement interval is specified, no timestep-diff checks are performed.
+ - If multiple measurement intervals are specified:
+ * Raises an error if *none* of the expected intervals appear.
+ * Splits where interval changes.
+ - Segments shorter than `min_block_size` are discarded.
+
  Parameters
  ----------
  ds : xarray.Dataset
@@ -136,30 +89,41 @@ def split_dataset_by_sampling_intervals(ds, measurement_intervals, min_sample_in
  A list of possible primary sampling intervals (in seconds) that the dataset might have.
  min_sample_interval : int, optional
  The minimum expected sampling interval in seconds. Defaults to 10s.
+ This is used to deal with possible trailing seconds errors.
  min_block_size : float, optional
  The minimum number of timesteps with a given sampling interval to be considered.
  Otherwise such portion of data is discarded !
  Defaults to 5 timesteps.
+ time_is_end_interval: bool
+ Whether time refers to the end of the measurement interval.
+ The default is True.
 
  Returns
  -------
- dict
+ dict[int, xr.Dataset]
  A dictionary where keys are the identified sampling intervals (in seconds),
- and values are xarray.Datasets containing only data from those intervals.
+ and values are xarray.Datasets containing only data from those sampling intervals.
  """
  # Define array of possible measurement intervals
  measurement_intervals = np.array(measurement_intervals)
 
+ # Check sorted by time and sort if necessary
+ ds = ensure_sorted_by_time(ds)
+
  # If a single measurement interval expected, return dictionary with input dataset
  if len(measurement_intervals) == 1:
- dict_ds = {measurement_intervals[0]: ds}
+ dict_ds = {int(measurement_intervals[0]): ds}
  return dict_ds
 
- # Check sorted by time and sort if necessary
- ds = ensure_sorted_by_time(ds)
+ # If sample_interval is a dataset variable, use it to define dictionary of datasets
+ if "sample_interval" in ds:
+ return {int(interval): ds.isel(time=ds["sample_interval"] == interval) for interval in measurement_intervals}
+
+ # ---------------------------------------------------------------------------------------.
+ # Otherwise exploit difference between timesteps to identify change point
 
  # Calculate time differences in seconds
- deltadt = np.diff(ds["time"].data).astype("timedelta64[s]").astype(int)
+ deltadt = np.abs(np.diff(ds["time"].data)).astype("timedelta64[s]").astype(int)
 
  # Round each delta to the nearest multiple of 5 (because the smallest possible sample interval is 10 s)
  # - This account for possible trailing seconds of the logger
@@ -175,25 +139,46 @@ def split_dataset_by_sampling_intervals(ds, measurement_intervals, min_sample_in
  if np.all(np.isnan(mapped_intervals)):
  raise ValueError("Impossible to identify timesteps with expected sampling intervals.")
 
+ # Check which measurements intervals are occurring in the dataset
+ uniques = np.unique(mapped_intervals)
+ uniques_intervals = uniques[~np.isnan(uniques)]
+ n_different_intervals_occurring = len(uniques_intervals)
+ if n_different_intervals_occurring == 1:
+ dict_ds = {int(k): ds for k in uniques_intervals}
+ return dict_ds
+
+ # Fill NaNs: decide whether to attach to previous or next interval
+ for i in range(len(mapped_intervals)):
+ if np.isnan(mapped_intervals[i]):
+ # If next exists and is NaN → forward fill
+ if i + 1 < len(mapped_intervals) and np.isnan(mapped_intervals[i + 1]):
+ mapped_intervals[i] = mapped_intervals[i - 1] if i > 0 else mapped_intervals[i + 1]
+ # Otherwise → backward fill (attach to next valid)
+ else:
+ mapped_intervals[i] = (
+ mapped_intervals[i + 1] if i + 1 < len(mapped_intervals) else mapped_intervals[i - 1]
+ )
+
  # Infill np.nan values by using neighbor intervals
  # Forward fill
- for i in range(1, len(mapped_intervals)):
- if np.isnan(mapped_intervals[i]):
- mapped_intervals[i] = mapped_intervals[i - 1]
+ # for i in range(1, len(mapped_intervals)):
+ # if np.isnan(mapped_intervals[i]):
+ # mapped_intervals[i] = mapped_intervals[i - 1]
 
- # Backward fill (in case the first entries were np.nan)
- for i in range(len(mapped_intervals) - 2, -1, -1):
- if np.isnan(mapped_intervals[i]):
- mapped_intervals[i] = mapped_intervals[i + 1]
+ # # Backward fill (in case the first entries were np.nan)
+ # for i in range(len(mapped_intervals) - 2, -1, -1):
+ # if np.isnan(mapped_intervals[i]):
+ # mapped_intervals[i] = mapped_intervals[i + 1]
 
  # Now all intervals are assigned to one of the possible measurement_intervals.
  # Identify boundaries where interval changes
  change_points = np.where(mapped_intervals[:-1] != mapped_intervals[1:])[0] + 1
 
  # Split ds into segments according to change_points
- segments = np.split(np.arange(ds.sizes["time"]), change_points)
+ offset = 1 if time_is_end_interval else 0
+ segments = np.split(np.arange(ds.sizes["time"]), change_points + offset)
 
- # Remove segments with less than 10 points
+ # Remove segments with less than min_block_size elements
  segments = [seg for seg in segments if len(seg) >= min_block_size]
  if len(segments) == 0:
  raise ValueError(
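The new split logic boils down to change-point detection on the per-timestep interval array: find where consecutive mapped intervals differ and split the positional indices there (the `offset` term then decides whether the boundary timestep belongs to the previous or the next segment). A minimal, self-contained illustration with made-up interval values:

```python
import numpy as np

# Mapped sampling interval of each timestep (seconds), after infilling NaNs
mapped_intervals = np.array([30, 30, 30, 30, 60, 60, 60, 30, 30])

# Indices where the interval changes with respect to the previous timestep
change_points = np.where(mapped_intervals[:-1] != mapped_intervals[1:])[0] + 1

# Split the positional index array into contiguous segments of constant interval
segments = np.split(np.arange(mapped_intervals.size), change_points)
print([seg.tolist() for seg in segments])
# [[0, 1, 2, 3], [4, 5, 6], [7, 8]]
```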
@@ -202,23 +187,40 @@ def split_dataset_by_sampling_intervals(ds, measurement_intervals, min_sample_in
  )
  # Define dataset indices for each sampling interva
  dict_sampling_interval_indices = {}
+ used_indices = set()
  for seg in segments:
  # Define the assumed sampling interval of such segment
  start_idx = seg[0]
  segment_sampling_interval = int(mapped_intervals[start_idx])
- if segment_sampling_interval not in dict_sampling_interval_indices:
- dict_sampling_interval_indices[segment_sampling_interval] = [seg]
- else:
- dict_sampling_interval_indices[segment_sampling_interval].append(seg)
+ # Remove any indices that have already been assigned to another interval
+ seg_filtered = seg[~np.isin(seg, list(used_indices))]
+
+ # Only keep segment if it still meets minimum size after filtering
+ if len(seg_filtered) >= min_block_size:
+ if segment_sampling_interval not in dict_sampling_interval_indices:
+ dict_sampling_interval_indices[segment_sampling_interval] = [seg_filtered]
+ else:
+ dict_sampling_interval_indices[segment_sampling_interval].append(seg_filtered)
+
+ # Mark these indices as used
+ used_indices.update(seg_filtered)
+
+ # Concatenate indices for each sampling interval
  dict_sampling_interval_indices = {
- k: np.concatenate(list_indices) for k, list_indices in dict_sampling_interval_indices.items()
+ k: np.concatenate(list_indices)
+ for k, list_indices in dict_sampling_interval_indices.items()
+ if list_indices # Only include if there are valid segments
  }
 
  # Define dictionary of datasets
- dict_ds = {k: ds.isel(time=indices) for k, indices in dict_sampling_interval_indices.items()}
+ dict_ds = {int(k): ds.isel(time=indices) for k, indices in dict_sampling_interval_indices.items()}
  return dict_ds
 
 
+ ####---------------------------------------------------------------------------------
+ #### Timesteps duplicates
+
+
  def has_same_value_over_time(da):
  """
  Check if a DataArray has the same value over all timesteps, considering NaNs as equal.
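Treating NaNs as equal is the subtle part of that check. A small, self-contained sketch of the comparison idea (independent of the package's actual implementation, names invented):

```python
import numpy as np
import xarray as xr

def values_constant_over_time(da: xr.DataArray) -> bool:
    # Compare every timestep against the first one, counting NaN == NaN as a match
    first = da.isel(time=0)
    equal = (da == first) | (da.isnull() & first.isnull())
    return bool(equal.all())

da = xr.DataArray([[1.0, np.nan], [1.0, np.nan]], dims=("time", "x"))
print(values_constant_over_time(da))  # True
```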
@@ -317,6 +319,190 @@ def remove_duplicated_timesteps(ds, ensure_variables_equality=True, logger=None,
  return ds
 
 
+ ####---------------------------------------------------------------------------------
+ #### Timesteps regularization
+
+
+ def get_problematic_timestep_indices(timesteps, sample_interval):
+ """Identify timesteps with missing previous or following timesteps."""
+ previous_time = timesteps - pd.Timedelta(seconds=sample_interval)
+ next_time = timesteps + pd.Timedelta(seconds=sample_interval)
+ idx_previous_missing = np.where(~np.isin(previous_time, timesteps))[0][1:]
+ idx_next_missing = np.where(~np.isin(next_time, timesteps))[0][:-1]
+ idx_isolated_missing = np.intersect1d(idx_previous_missing, idx_next_missing)
+ idx_previous_missing = idx_previous_missing[np.isin(idx_previous_missing, idx_isolated_missing, invert=True)]
+ idx_next_missing = idx_next_missing[np.isin(idx_next_missing, idx_isolated_missing, invert=True)]
+ return idx_previous_missing, idx_next_missing, idx_isolated_missing
+
+
+ def regularize_timesteps(ds, sample_interval, robust=False, add_quality_flag=True, logger=None, verbose=True):
+ """Ensure timesteps match with the sample_interval.
+
+ This function:
+ - drop dataset indices with duplicated timesteps,
+ - but does not add missing timesteps to the dataset.
+ """
+ # Check sorted by time and sort if necessary
+ ds = ensure_sorted_by_time(ds)
+
+ # Convert time to pandas.DatetimeIndex for easier manipulation
+ times = pd.to_datetime(ds["time"].to_numpy())
+
+ # Determine the start and end times
+ start_time = times[0].floor(f"{sample_interval}s")
+ end_time = times[-1].ceil(f"{sample_interval}s")
+
+ # Create the expected time grid
+ expected_times = pd.date_range(start=start_time, end=end_time, freq=f"{sample_interval}s")
+
+ # Convert to numpy arrays
+ times = times.to_numpy(dtype="M8[s]")
+ expected_times = expected_times.to_numpy(dtype="M8[s]")
+
+ # Map original times to the nearest expected times
+ # Calculate the difference between original times and expected times
+ time_deltas = np.abs(times - expected_times[:, None]).astype(int)
+
+ # Find the index of the closest expected time for each original time
+ nearest_indices = np.argmin(time_deltas, axis=0)
+ adjusted_times = expected_times[nearest_indices]
+
+ # Check for duplicates in adjusted times
+ unique_times, counts = np.unique(adjusted_times, return_counts=True)
+ duplicates = unique_times[counts > 1]
+
+ # Initialize time quality flag
+ # - 0 when ok or just rounded to closest 00
+ # - 1 if previous timestep is missing
+ # - 2 if next timestep is missing
+ # - 3 if previous and next timestep is missing
+ # - 4 if solved duplicated timesteps
+ # - 5 if needed to drop duplicated timesteps and select the last
+ flag_previous_missing = 1
+ flag_next_missing = 2
+ flag_isolated_timestep = 3
+ flag_solved_duplicated_timestep = 4
+ flag_dropped_duplicated_timestep = 5
+ qc_flag = np.zeros(adjusted_times.shape)
+
+ # Initialize list with the duplicated timesteps index to drop
+ # - We drop the first occurrence because is likely the shortest interval
+ idx_to_drop = []
+
+ # Attempt to resolve for duplicates
+ if duplicates.size > 0:
+ # Handle duplicates
+ for dup_time in duplicates:
+ # Indices of duplicates
+ dup_indices = np.where(adjusted_times == dup_time)[0]
+ n_duplicates = len(dup_indices)
+ # Define previous and following timestep
+ prev_time = dup_time - pd.Timedelta(seconds=sample_interval)
+ next_time = dup_time + pd.Timedelta(seconds=sample_interval)
+ # Try to find missing slots before and after
+ # - If more than 3 duplicates, impossible to solve !
+ count_solved = 0
+ # If the previous timestep is available, set that one
+ if n_duplicates == 2:
+ if prev_time not in adjusted_times:
+ adjusted_times[dup_indices[0]] = prev_time
+ qc_flag[dup_indices[0]] = flag_solved_duplicated_timestep
+ count_solved += 1
+ elif next_time not in adjusted_times:
+ adjusted_times[dup_indices[-1]] = next_time
+ qc_flag[dup_indices[-1]] = flag_solved_duplicated_timestep
+ count_solved += 1
+ else:
+ pass
+ elif n_duplicates == 3:
+ if prev_time not in adjusted_times:
+ adjusted_times[dup_indices[0]] = prev_time
+ qc_flag[dup_indices[0]] = flag_solved_duplicated_timestep
+ count_solved += 1
+ if next_time not in adjusted_times:
+ adjusted_times[dup_indices[-1]] = next_time
+ qc_flag[dup_indices[-1]] = flag_solved_duplicated_timestep
+ count_solved += 1
+ if count_solved != n_duplicates - 1:
+ idx_to_drop = np.append(idx_to_drop, dup_indices[0:-1])
+ qc_flag[dup_indices[-1]] = flag_dropped_duplicated_timestep
+ msg = (
+ f"Cannot resolve {n_duplicates} duplicated timesteps "
+ f"(after trailing seconds correction) around {dup_time}."
+ )
+ log_warning(logger=logger, msg=msg, verbose=verbose)
+ if robust:
+ raise ValueError(msg)
+
+ # Update the time coordinate (Convert to ns for xarray compatibility)
+ ds = ds.assign_coords({"time": adjusted_times.astype("datetime64[ns]")})
+
+ # Update quality flag values for next and previous timestep is missing
+ if add_quality_flag:
+ idx_previous_missing, idx_next_missing, idx_isolated_missing = get_problematic_timestep_indices(
+ adjusted_times,
+ sample_interval,
+ )
+ qc_flag[idx_previous_missing] = np.maximum(qc_flag[idx_previous_missing], flag_previous_missing)
+ qc_flag[idx_next_missing] = np.maximum(qc_flag[idx_next_missing], flag_next_missing)
+ qc_flag[idx_isolated_missing] = np.maximum(qc_flag[idx_isolated_missing], flag_isolated_timestep)
+
+ # If the first timestep is at 00:00 and currently flagged as previous missing (1), reset to 0
+ # first_time = pd.to_datetime(adjusted_times[0]).time()
+ # first_expected_time = pd.Timestamp("00:00:00").time()
+ # if first_time == first_expected_time and qc_flag[0] == flag_previous_missing:
+ # qc_flag[0] = 0
+
+ # # If the last timestep is flagged and currently flagged as next missing (2), reset it to 0
+ # last_time = pd.to_datetime(adjusted_times[-1]).time()
+ # last_time_expected = (pd.Timestamp("00:00:00") - pd.Timedelta(30, unit="seconds")).time()
+ # # Check if adding one interval would go beyond the end_time
+ # if last_time == last_time_expected and qc_flag[-1] == flag_next_missing:
+ # qc_flag[-1] = 0
+
+ # Assign time quality flag coordinate
+ ds["time_qc"] = xr.DataArray(qc_flag, dims="time")
+ ds = ds.set_coords("time_qc")
+
+ # Add CF attributes for time_qc
+ ds["time_qc"].attrs = {
+ "long_name": "time quality flag",
+ "standard_name": "status_flag",
+ "units": "1",
+ "valid_range": [0, 5],
+ "flag_values": [0, 1, 2, 3, 4, 5],
+ "flag_meanings": (
+ "good_data "
+ "previous_timestep_missing "
+ "next_timestep_missing "
+ "isolated_timestep "
+ "solved_duplicated_timestep "
+ "dropped_duplicated_timestep"
+ ),
+ "comment": (
+ "Quality flag for time coordinate. "
+ "Flag 0: data is good or just rounded to nearest sampling interval. "
+ "Flag 1: previous timestep is missing in the time series. "
+ "Flag 2: next timestep is missing in the time series. "
+ "Flag 3: both previous and next timesteps are missing (isolated timestep). "
+ "Flag 4: timestep was moved from duplicate to fill missing timestep. "
+ "Flag 5: duplicate timestep was dropped, keeping the last occurrence."
+ ),
+ }
+
+ # Drop duplicated timesteps
+ # - Using ds = ds.drop_isel({"time": idx_to_drop.astype(int)}) raise:
+ # --> pandas.errors.InvalidIndexError: Reindexing only valid with uniquely valued Index objects
+ # --> https://github.com/pydata/xarray/issues/6605
+ if len(idx_to_drop) > 0:
+ idx_to_drop = idx_to_drop.astype(int)
+ idx_valid_timesteps = np.arange(0, ds["time"].size)
+ idx_valid_timesteps = np.delete(idx_valid_timesteps, idx_to_drop)
+ ds = ds.isel(time=idx_valid_timesteps)
+ # Return dataset
+ return ds
+
+
  def check_timesteps_regularity(ds, sample_interval, verbose=False, logger=None):
  """Check for the regularity of timesteps."""
  # Check sorted by time and sort if necessary
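At its core, the newly added regularize_timesteps snaps each observed timestep onto a regular grid derived from the sample interval before handling duplicates and assigning the time_qc flag. A self-contained illustration of that nearest-grid assignment with toy timestamps:

```python
import numpy as np
import pandas as pd

sample_interval = 30  # seconds
times = pd.to_datetime(["2024-06-01 00:00:29", "2024-06-01 00:01:01", "2024-06-01 00:01:58"])

# Expected regular grid spanning the observations
start = times[0].floor(f"{sample_interval}s")
end = times[-1].ceil(f"{sample_interval}s")
expected = pd.date_range(start, end, freq=f"{sample_interval}s").to_numpy(dtype="M8[s]")

# Snap each observed time to its nearest grid point
obs = times.to_numpy(dtype="M8[s]")
deltas = np.abs(obs - expected[:, None]).astype(int)  # shape (n_grid, n_obs)
adjusted = expected[np.argmin(deltas, axis=0)]
print(adjusted)
# ['2024-06-01T00:00:30' '2024-06-01T00:01:00' '2024-06-01T00:02:00']
```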
@@ -339,12 +525,14 @@ def check_timesteps_regularity(ds, sample_interval, verbose=False, logger=None):
  fractions = np.round(counts / len(deltadt) * 100, 2)
 
  # Compute stats about expected deltadt
- sample_interval_counts = counts[unique_deltadt == sample_interval].item()
- sample_interval_fraction = fractions[unique_deltadt == sample_interval].item()
+ mask = unique_deltadt == sample_interval
+ sample_interval_counts = counts[mask].item() if mask.any() else 0
+ sample_interval_fraction = fractions[mask].item() if mask.any() else 0.0
 
  # Compute stats about most frequent deltadt
- most_frequent_deltadt_counts = counts[unique_deltadt == most_frequent_deltadt].item()
- most_frequent_deltadt_fraction = fractions[unique_deltadt == most_frequent_deltadt].item()
+ mask = unique_deltadt == most_frequent_deltadt
+ most_frequent_deltadt_counts = counts[mask].item() if mask.any() else 0
+ most_frequent_deltadt_fraction = fractions[mask].item() if mask.any() else 0.0
 
  # Compute stats about unexpected deltadt
  unexpected_intervals = unique_deltadt[unique_deltadt != sample_interval]
@@ -352,13 +540,14 @@ def check_timesteps_regularity(ds, sample_interval, verbose=False, logger=None):
  unexpected_intervals_fractions = fractions[unique_deltadt != sample_interval]
  frequent_unexpected_intervals = unexpected_intervals[unexpected_intervals_fractions > 5]
 
- # Report warning if the samplin_interval deltadt occurs less often than 60 % of times
+ # Report warning if the sampling_interval deltadt occurs less often than 60 % of times
  # -> TODO: maybe only report in stations where the disdro does not log only data when rainy
  if sample_interval_fraction < 60:
  msg = (
  f"The expected (sampling) interval between observations occurs only "
  f"{sample_interval_counts}/{n} times ({sample_interval_fraction} %)."
  )
+ log_warning(logger=logger, msg=msg, verbose=verbose)
 
  # Report warning if a deltadt occurs more often then the sampling interval
  if most_frequent_deltadt != sample_interval:
@@ -372,14 +561,7 @@ def check_timesteps_regularity(ds, sample_interval, verbose=False, logger=None):
 
  # Report with a warning all unexpected deltadt with frequency larger than 5 %
  if len(frequent_unexpected_intervals) > 0:
- msg_parts = ["The following unexpected intervals occur frequently:"]
- for interval in frequent_unexpected_intervals:
- c = unexpected_intervals_counts[unexpected_intervals == interval].item()
- f = unexpected_intervals_fractions[unexpected_intervals == interval].item()
- msg_parts.append(f" {interval} ({f}%) ({c}/{n}) | ")
- msg = " ".join(msg_parts)
-
- msg = "The following time intervals between observations occurs often: "
+ msg = "The following time intervals between observations occur frequently: "
  for interval in frequent_unexpected_intervals:
  c = unexpected_intervals_counts[unexpected_intervals == interval].item()
  f = unexpected_intervals_fractions[unexpected_intervals == interval].item()
@@ -388,7 +570,11 @@ def check_timesteps_regularity(ds, sample_interval, verbose=False, logger=None):
  return ds
 
 
- def finalize_l0c_dataset(ds, sample_interval, verbose=True, logger=None):
+ ####----------------------------------------------------------------------------------------------.
+ #### Wrapper
+
+
+ def _finalize_l0c_dataset(ds, sample_interval, sensor_name, verbose=True, logger=None):
  """Finalize a L0C dataset with unique sampling interval.
 
  It adds the sampling_interval coordinate and it regularizes the timesteps for trailing seconds.
@@ -407,26 +593,45 @@ def finalize_l0c_dataset(ds, sample_interval, verbose=True, logger=None):
  )
 
  # Performs checks about timesteps regularity
+ # - Do not discard anything
+ # - Just log warnings in the log file
  ds = check_timesteps_regularity(ds=ds, sample_interval=sample_interval, verbose=verbose, logger=logger)
+
+ # Shift timesteps to ensure time correspond to start of measurement interval
+ # TODO as function of sensor name
+
+ # Set netCDF dimension order
+ # --> Required for correct encoding !
+ ds = ds.transpose("time", "diameter_bin_center", ...)
+
+ # Set encodings
+ ds = set_l0b_encodings(ds=ds, sensor_name=sensor_name)
+
+ # Update global attributes
+ ds = set_disdrodb_attrs(ds, product="L0C")
  return ds
 
 
- def create_daily_file(day, filepaths, measurement_intervals, ensure_variables_equality=True, logger=None, verbose=True):
+ def create_l0c_datasets(
+ event_info,
+ measurement_intervals,
+ sensor_name,
+ ensure_variables_equality=True,
+ logger=None,
+ verbose=True,
+ ):
  """
- Create a daily file by merging and processing data from multiple filepaths.
+ Create a single dataset by merging and processing data from multiple filepaths.
 
  Parameters
  ----------
- day : str or numpy.datetime64
- The day for which the daily file is to be created.
- Should be in a format that can be converted to numpy.datetime64.
- filepaths : list of str
- List of filepaths to the data files to be processed.
+ event_info : dict
+ Dictionary with start_time, end_time and filepaths keys.
 
  Returns
  -------
- xarray.Dataset
- The processed dataset containing data for the specified day.
+ dict
+ A dictionary with an xarray.Dataset for each measurement interval.
 
  Raises
  ------
@@ -435,50 +640,39 @@ def create_daily_file(day, filepaths, measurement_intervals, ensure_variables_eq
 
  Notes
  -----
- - The function adds a tolerance for searching timesteps
- before and after 00:00 to account for imprecise logging times.
- - It checks that duplicated timesteps have the same raw drop number values.
- - The function infers the sample interval and
- regularizes timesteps to handle trailing seconds.
- - The data is loaded into memory and connections to source files
- are closed before returning the dataset.
+ - Data is loaded into memory and connections to source files are closed before returning the dataset.
+ - Tolerance in input files is used around expected dataset start_time and end_time to account for
+ imprecise logging times and ensuring correct definition of qc_time at files boundaries (e.g. 00:00).
+ - Duplicated timesteps with different raw drop number values are dropped
+ - First occurrence of duplicated timesteps with equal raw drop number values is kept.
+ - Regularizes timesteps to handle trailing seconds.
  """
- import xarray as xr # Load in each process when function is called !
-
  # ---------------------------------------------------------------------------------------.
- # Define start day and end of day
- start_day = np.array(day).astype("M8[D]")
- end_day = start_day + np.array(1, dtype="m8[D]") - np.array(1, dtype="m8[s]") # avoid 00:00 of next day !
+ # Retrieve information
+ start_time = np.array(event_info["start_time"], dtype="M8[s]")
+ end_time = np.array(event_info["end_time"], dtype="M8[s]")
+ filepaths = event_info["filepaths"]
 
- # Add tolerance for searching timesteps before and after 00:00 to account for imprecise logging time
- # - Example: timestep 23:59:30 that should be 00.00 goes into the next day ...
- start_day_tol = start_day - np.array(TOLERANCE_SECONDS, dtype="m8[s]")
- end_day_tol = end_day + np.array(TOLERANCE_SECONDS, dtype="m8[s]")
+ # Define expected dataset time coverage
+ start_time_tol = start_time - np.array(TOLERANCE_SECONDS, dtype="m8[s]")
+ end_time_tol = end_time + np.array(TOLERANCE_SECONDS, dtype="m8[s]")
 
  # ---------------------------------------------------------------------------------------.
  # Open files with data within the provided day and concatenate them
- list_ds = [
- xr.open_dataset(filepath, decode_timedelta=False, chunks=-1, cache=False).sortby("time")
- for filepath in filepaths
- ]
- list_ds = [ds.sel({"time": slice(start_day_tol, end_day_tol)}) for ds in list_ds]
- if len(list_ds) > 1:
- # Concatenate dataset
- # - If some variable are missing in one file, it is filled with NaN. This should not occur anyway.
- # - The resulting dataset can have duplicated timesteps !
- ds = xr.concat(list_ds, dim="time", join="outer", compat="no_conflicts", combine_attrs="override").sortby(
- "time",
- )
- else:
- ds = list_ds[0]
-
- # Compute data
- ds = ds.compute()
+ ds = open_netcdf_files(
+ filepaths,
+ start_time=start_time_tol,
+ end_time=end_time_tol,
+ chunks={},
+ parallel=False,
+ compute=True,
+ )
 
- # Close connection to source files
- _ = [ds.close() for ds in list_ds]
- ds.close()
- del list_ds
+ # If not data for that time block, return empty dictionary
+ # - Can occur when raw files are already by block of months and e.g. here saving to daily blocks !
+ if ds.sizes["time"] == 0:
+ log_info(logger=logger, msg=f"No data between {start_time} and {end_time}.", verbose=verbose)
+ return {}
 
  # ---------------------------------------------------------------------------------------.
  # If sample interval is a dataset variable, drop timesteps with unexpected measurement intervals !
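The refactored function no longer opens and concatenates files by hand; it delegates to open_netcdf_files from disdrodb.api.io, passing the tolerance-padded window. A hedged usage sketch of that call, with the keyword arguments taken from the diff above and hypothetical file paths:

```python
import numpy as np
from disdrodb.api.io import open_netcdf_files

TOLERANCE_SECONDS = 60 * 3

# Illustrative event description; paths and times are invented
event_info = {
    "start_time": "2024-06-01 00:00:00",
    "end_time": "2024-06-01 23:59:59",
    "filepaths": ["/data/L0B/station_20240531.nc", "/data/L0B/station_20240601.nc"],
}

start_time = np.array(event_info["start_time"], dtype="M8[s]")
end_time = np.array(event_info["end_time"], dtype="M8[s]")

# Pad the requested window so trailing-second records near the block edges are included
ds = open_netcdf_files(
    event_info["filepaths"],
    start_time=start_time - np.array(TOLERANCE_SECONDS, dtype="m8[s]"),
    end_time=end_time + np.array(TOLERANCE_SECONDS, dtype="m8[s]"),
    chunks={},
    parallel=False,
    compute=True,
)
```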
@@ -489,9 +683,16 @@ def create_daily_file(day, filepaths, measurement_intervals, ensure_variables_eq
  verbose=verbose,
  logger=logger,
  )
+ n_timesteps = len(ds["time"])
+ if n_timesteps < 3:
+ raise ValueError(f"Only {n_timesteps} timesteps left after removing those with unexpected sample interval.")
 
  # ---------------------------------------------------------------------------------------.
- # Remove duplicated timesteps
+ # Remove duplicated timesteps (before correcting for trailing seconds)
+ # - It checks that duplicated timesteps have the same raw_drop_number values
+ # - If duplicated timesteps have different raw_drop_number values:
+ # --> warning is raised
+ # --> timesteps are dropped
  ds = remove_duplicated_timesteps(
  ds,
  ensure_variables_equality=ensure_variables_equality,
@@ -502,7 +703,7 @@ def create_daily_file(day, filepaths, measurement_intervals, ensure_variables_eq
  # Raise error if less than 3 timesteps left
  n_timesteps = len(ds["time"])
  if n_timesteps < 3:
- raise ValueError(f"{n_timesteps} timesteps left after removing duplicated timesteps.")
+ raise ValueError(f"{n_timesteps} timesteps left after removing duplicated.")
 
  # ---------------------------------------------------------------------------------------.
  # Split dataset by sampling intervals
@@ -513,107 +714,24 @@ def create_daily_file(day, filepaths, measurement_intervals, ensure_variables_eq
  min_block_size=5,
  )
 
- # Log a warning if two sampling intervals are present within a given day
+ # Log a warning if two sampling intervals are present within a given time block
  if len(dict_ds) > 1:
  occuring_sampling_intervals = list(dict_ds)
- msg = f"The dataset contains both sampling intervals {occuring_sampling_intervals}."
+ msg = f"The input files contains these sampling intervals: {occuring_sampling_intervals}."
  log_warning(logger=logger, msg=msg, verbose=verbose)
 
  # ---------------------------------------------------------------------------------------.
  # Finalize L0C datasets
- # - Add sample_interval coordinate
+ # - Add and ensure sample_interval coordinate has just 1 value (not varying with time)
  # - Regularize timesteps for trailing seconds
  dict_ds = {
- sample_interval: finalize_l0c_dataset(
+ sample_interval: _finalize_l0c_dataset(
  ds=ds,
  sample_interval=sample_interval,
+ sensor_name=sensor_name,
  verbose=verbose,
  logger=logger,
- ).sel({"time": slice(start_day, end_day)})
+ ).sel({"time": slice(start_time, end_time)})
  for sample_interval, ds in dict_ds.items()
  }
  return dict_ds
-
-
- # ---------------------------------------------------------------------------------------.
- #### DEPRECATED CODE
-
-
- # def copy_l0b_to_l0c_directory(filepath):
- # """Copy L0B file to L0C directory."""
- # import netCDF4
-
- # # Copy file
- # l0c_filepath = filepath.replace("L0B", "L0C")
- # _ = shutil.copy(filepath, l0c_filepath)
-
- # # Edit DISDRODB product attribute
- # with netCDF4.Dataset(l0c_filepath, mode="a") as nc_file:
- # # Modify the global attribute
- # nc_file.setncattr("disdrodb_product", "L0C")
-
- # def find_isel_common_time(da1, da2):
- # """
- # Find the indices of common time steps between two data arrays.
- #
- # Parameters
- # ----------
- # da1 : xarray.DataArray
- # The first data array with a time coordinate.
- # da2 : xarray.DataArray
- # The second data array with a time coordinate.
- #
- # Returns
- # -------
- # da1_isel : numpy.ndarray
- # Indices of the common time steps in the first data array.
- # da2_isel : numpy.ndarray
- # Indices of the common time steps in the second data array.
- #
- # Notes
- # -----
- # This function assumes that both input data arrays have a "time" coordinate.
- # The function finds the intersection of the time steps in both data arrays
- # and returns the indices of these common time steps for each data array.
- # """
- # intersecting_timesteps = np.intersect1d(da1["time"], da2["time"])
- # da1_isel = np.where(np.isin(da1["time"], intersecting_timesteps))[0]
- # da2_isel = np.where(np.isin(da2["time"], intersecting_timesteps))[0]
- # return da1_isel, da2_isel
-
-
- # def check_same_raw_drop_number_values(list_ds, filepaths):
- # """
- # Check if the 'raw_drop_number' values are the same across multiple datasets.
-
- # This function compares the 'raw_drop_number' values of multiple datasets to ensure they are identical
- # at common timesteps.

- # If any discrepancies are found, a ValueError is raised indicating which files
- # have differing values.

- # Parameters
- # ----------
- # list_ds : list of xarray.Dataset
- # A list of xarray Datasets to be compared.
- # filepaths : list of str
- # A list of file paths corresponding to the datasets in `list_ds`.

- # Raises
- # ------
- # ValueError
- # If 'raw_drop_number' values differ at any common timestep between any two datasets.
- # """
- # # Retrieve variable to compare
- # list_drop_number = [ds["raw_drop_number"].compute() for ds in list_ds]
- # # Compare values
- # combos = list(itertools.combinations(range(len(list_drop_number)), 2))
- # for i, j in combos:
- # da1 = list_drop_number[i]
- # da2 = list_drop_number[j]
- # da1_isel, da2_isel = find_isel_common_time(da1=da1, da2=da2)
- # if not np.all(da1.isel(time=da1_isel).data == da2.isel(time=da2_isel).data):
- # file1 = filepaths[i]
- # file2 = filepaths[i]
- # msg = f"Duplicated timesteps have different values between file {file1} and {file2}"
- # raise ValueError(msg)
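Finally, the renamed entry point create_l0c_datasets consumes an event_info dictionary (start_time, end_time, filepaths) and returns one dataset per identified sampling interval. A hedged usage sketch; paths, sensor name and interval values are illustrative only:

```python
# Hedged usage sketch of the refactored L0C entry point.
# File paths, sensor name and measurement intervals are invented for illustration.
from disdrodb.l0.l0c_processing import create_l0c_datasets

event_info = {
    "start_time": "2024-06-01 00:00:00",
    "end_time": "2024-06-01 23:59:59",
    "filepaths": ["/data/L0B/station_20240531.nc", "/data/L0B/station_20240601.nc"],
}

dict_ds = create_l0c_datasets(
    event_info=event_info,
    measurement_intervals=[30, 60],  # possible sampling intervals from station metadata
    sensor_name="PARSIVEL2",
    ensure_variables_equality=True,
    verbose=True,
)

# One finalized dataset per sampling interval found in the input files
for sample_interval, ds in dict_ds.items():
    print(sample_interval, ds.sizes["time"])
```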