disdrodb 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. disdrodb/__init__.py +4 -0
  2. disdrodb/_version.py +2 -2
  3. disdrodb/api/checks.py +70 -47
  4. disdrodb/api/configs.py +0 -2
  5. disdrodb/api/info.py +3 -3
  6. disdrodb/api/io.py +48 -8
  7. disdrodb/api/path.py +116 -133
  8. disdrodb/api/search.py +12 -3
  9. disdrodb/cli/disdrodb_create_summary.py +103 -0
  10. disdrodb/cli/disdrodb_create_summary_station.py +1 -1
  11. disdrodb/cli/disdrodb_run_l0a_station.py +1 -1
  12. disdrodb/cli/disdrodb_run_l0b_station.py +2 -2
  13. disdrodb/cli/disdrodb_run_l0c_station.py +2 -2
  14. disdrodb/cli/disdrodb_run_l1_station.py +2 -2
  15. disdrodb/cli/disdrodb_run_l2e_station.py +2 -2
  16. disdrodb/cli/disdrodb_run_l2m_station.py +2 -2
  17. disdrodb/data_transfer/download_data.py +123 -7
  18. disdrodb/issue/writer.py +2 -0
  19. disdrodb/l0/l0a_processing.py +10 -5
  20. disdrodb/l0/l0b_nc_processing.py +10 -6
  21. disdrodb/l0/l0b_processing.py +26 -61
  22. disdrodb/l0/l0c_processing.py +369 -251
  23. disdrodb/l0/readers/LPM/ARM/ARM_LPM.py +7 -0
  24. disdrodb/l0/readers/PARSIVEL2/ARM/ARM_PARSIVEL2.py +4 -0
  25. disdrodb/l0/readers/PARSIVEL2/CANADA/UQAM_NC.py +69 -0
  26. disdrodb/l0/readers/PARSIVEL2/MPI/BCO_PARSIVEL2.py +136 -0
  27. disdrodb/l0/readers/PARSIVEL2/MPI/BOWTIE.py +220 -0
  28. disdrodb/l0/readers/PARSIVEL2/NASA/LPVEX.py +109 -0
  29. disdrodb/l0/readers/PARSIVEL2/NETHERLANDS/DELFT_NC.py +3 -0
  30. disdrodb/l1/fall_velocity.py +46 -0
  31. disdrodb/l1/processing.py +1 -1
  32. disdrodb/l2/processing.py +1 -1
  33. disdrodb/metadata/checks.py +132 -125
  34. disdrodb/psd/fitting.py +172 -205
  35. disdrodb/psd/models.py +1 -1
  36. disdrodb/routines/__init__.py +54 -0
  37. disdrodb/{l0/routines.py → routines/l0.py} +288 -418
  38. disdrodb/{l1/routines.py → routines/l1.py} +60 -92
  39. disdrodb/{l2/routines.py → routines/l2.py} +249 -462
  40. disdrodb/{routines.py → routines/wrappers.py} +95 -7
  41. disdrodb/scattering/axis_ratio.py +5 -1
  42. disdrodb/scattering/permittivity.py +18 -0
  43. disdrodb/scattering/routines.py +56 -36
  44. disdrodb/summary/routines.py +110 -34
  45. disdrodb/utils/archiving.py +434 -0
  46. disdrodb/utils/cli.py +5 -5
  47. disdrodb/utils/dask.py +62 -1
  48. disdrodb/utils/decorators.py +31 -0
  49. disdrodb/utils/encoding.py +5 -1
  50. disdrodb/{l2 → utils}/event.py +1 -66
  51. disdrodb/utils/logger.py +1 -1
  52. disdrodb/utils/manipulations.py +22 -12
  53. disdrodb/utils/routines.py +166 -0
  54. disdrodb/utils/time.py +3 -291
  55. disdrodb/utils/xarray.py +3 -0
  56. disdrodb/viz/plots.py +85 -14
  57. {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/METADATA +2 -2
  58. {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/RECORD +62 -54
  59. {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/entry_points.txt +1 -0
  60. {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/WHEEL +0 -0
  61. {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/licenses/LICENSE +0 -0
  62. {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,434 @@
1
+ # -----------------------------------------------------------------------------.
2
+ # Copyright (c) 2021-2023 DISDRODB developers
3
+ #
4
+ # This program is free software: you can redistribute it and/or modify
5
+ # it under the terms of the GNU General Public License as published by
6
+ # the Free Software Foundation, either version 3 of the License, or
7
+ # (at your option) any later version.
8
+ #
9
+ # This program is distributed in the hope that it will be useful,
10
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
+ # GNU General Public License for more details.
13
+ #
14
+ # You should have received a copy of the GNU General Public License
15
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
16
+ # -----------------------------------------------------------------------------.
17
+ """Utility function for DISDRODB product archiving."""
18
+ import datetime
19
+
20
+ import numpy as np
21
+ import pandas as pd
22
+
23
+ from disdrodb.api.info import get_start_end_time_from_filepaths
24
+ from disdrodb.api.io import open_netcdf_files
25
+ from disdrodb.utils.event import group_timesteps_into_event
26
+ from disdrodb.utils.time import (
27
+ ensure_sorted_by_time,
28
+ ensure_timedelta_seconds,
29
+ )
30
+
31
+ ####---------------------------------------------------------------------------------
32
+ #### Time blocks
33
+
34
+
35
def check_freq(freq: str) -> str:
    """Validate the ``freq`` argument and return it unchanged.

    Parameters
    ----------
    freq : str
        Time block frequency. Must be one of
        'none', 'year', 'season', 'quarter', 'month', 'day', 'hour'.

    Returns
    -------
    str
        The validated ``freq`` value.

    Raises
    ------
    TypeError
        If ``freq`` is not a string.
    ValueError
        If ``freq`` is not one of the accepted values.
    """
    valid_freq = ["none", "year", "season", "quarter", "month", "day", "hour"]
    if not isinstance(freq, str):
        raise TypeError("'freq' must be a string.")
    if freq not in valid_freq:
        raise ValueError(
            f"'freq' '{freq}' is not possible. Must be one of: {valid_freq}.",
        )
    return freq
45
+
46
+
47
def generate_time_blocks(
    start_time: np.datetime64,
    end_time: np.datetime64,
    freq: str,
    inclusive_end_time: bool = True,
) -> np.ndarray:
    """Generate time blocks between `start_time` and `end_time` for a given frequency.

    Parameters
    ----------
    start_time : numpy.datetime64
        Inclusive start of the overall time range.
    end_time : numpy.datetime64
        End of the overall time range. Inclusive by default (see inclusive_end_time argument).
    freq : str
        Frequency specifier. Accepted values are:
        - 'none'    : return a single block [start_time, end_time]
        - 'hour'    : split into hourly blocks
        - 'day'     : split into daily blocks
        - 'month'   : split into calendar months
        - 'quarter' : split into calendar quarters
        - 'year'    : split into calendar years
        - 'season'  : split into meteorological seasons (MAM, JJA, SON, DJF)
    inclusive_end_time: bool
        The default is True.
        If False and the last block starts exactly at the input end_time, such block is removed.
        This argument has no effect when ``freq='none'``.

    Returns
    -------
    numpy.ndarray
        Array of shape (n, 2) with dtype datetime64[s], where each row is [block_start, block_end].
        If no period fits the requested range, an empty array of shape (0, 2) is returned.

    """
    freq = check_freq(freq)
    if freq == "none":
        return np.array([[start_time, end_time]], dtype="datetime64[s]")

    # Mapping from our custom freq to pandas period frequency codes
    # - Day uses the uppercase "D" alias: lowercase "d" is deprecated in recent pandas releases
    freq_map = {
        "hour": "h",
        "day": "D",
        "month": "M",
        "quarter": "Q",
        "year": "Y",
        "season": "Q-FEB",  # seasons DJF, MAM, JJA, SON
    }

    # Define periods
    periods = pd.period_range(start=start_time, end=end_time, freq=freq_map[freq])

    # Create time blocks
    # - period.end_time has sub-second resolution (e.g. 23:59:59.999999999),
    #   so it is floored to the second before conversion, whatever the frequency
    blocks = [
        [
            period.start_time.to_datetime64().astype("datetime64[s]"),
            period.end_time.floor("s").to_datetime64().astype("datetime64[s]"),
        ]
        for period in periods
    ]
    # - reshape guarantees the documented (n, 2) shape also when there are no periods
    blocks = np.array(blocks, dtype="datetime64[s]").reshape(-1, 2)

    # Optionally drop the last block if it starts exactly at end_time
    if not inclusive_end_time and len(blocks) > 0 and blocks[-1, 0] == end_time:
        blocks = blocks[:-1]
    return blocks
110
+
111
+
112
+ ####----------------------------------------------------------------------------
113
+ #### Event/Time partitioning
114
def identify_events(
    filepaths,
    parallel=False,
    min_drops=5,
    neighbor_min_size=2,
    neighbor_time_interval="5MIN",
    event_max_time_gap="6H",
    event_min_duration="5MIN",
    event_min_size=3,
):
    """Identify rainy events from the timesteps of a set of files.

    The variables ``time`` and ``N`` are loaded from the files, and the timesteps
    with ``N > min_drops`` are selected as candidate rainy timesteps.
    The candidates are then passed to ``group_timesteps_into_event``, which is
    responsible for discarding isolated timesteps, grouping the remaining
    timesteps into events and filtering out events that are too short or too small.

    Parameters
    ----------
    filepaths: list
        List of file paths to open. The files must provide the ``time`` and ``N`` variables.
    parallel: bool
        Whether to load the files in parallel.
        Set parallel=True only in a multiprocessing environment.
        The default is False.
    min_drops : int, optional
        Threshold on ``N``: only timesteps with ``N`` strictly larger than
        ``min_drops`` are considered. The default is 5.
    neighbor_min_size : int, optional
        The minimum number of neighboring timesteps required within `neighbor_time_interval`
        for a timestep to be considered non-isolated. Isolated timesteps are removed.
        - If `neighbor_min_size=0`, no timestep is considered isolated and no filtering occurs.
        - If `neighbor_min_size=1`, the timestep must have at least one neighbor.
        - If `neighbor_min_size=2`, the timestep must have at least two neighbors.
        The default is 2.
    neighbor_time_interval : str
        The time interval around a given timestep defining its neighborhood.
        The default is "5MIN".
    event_max_time_gap: str
        The maximum time interval between two timesteps to be considered part of the same event.
        The default is "6H".
    event_min_duration : str
        The minimum duration an event must span. Shorter events are discarded.
        The default is "5MIN".
    event_min_size : int, optional
        The minimum number of valid timesteps required for an event. The default is 3.

    Returns
    -------
    list of dict
        The output of ``group_timesteps_into_event``: a list of events, where each
        event is a dictionary with keys:
        - "start_time": np.datetime64, start time of the event
        - "end_time": np.datetime64, end time of the event
        - "duration": np.timedelta64, duration of the event
        - "n_timesteps": int, number of valid timesteps in the event
    """
    # Load only the variables required to flag rainy timesteps
    ds = open_netcdf_files(filepaths, variables=["time", "N"], parallel=parallel, compute=True)
    ds = ensure_sorted_by_time(ds)

    # Select the timesteps where the number of drops exceeds the threshold
    is_rainy = ds["N"].to_numpy() > min_drops
    rainy_timesteps = ds["time"].to_numpy()[is_rainy]

    # The dataset is not needed anymore
    del ds

    # Group the candidate timesteps into events
    return group_timesteps_into_event(
        timesteps=rainy_timesteps,
        neighbor_min_size=neighbor_min_size,
        neighbor_time_interval=neighbor_time_interval,
        event_max_time_gap=event_max_time_gap,
        event_min_duration=event_min_duration,
        event_min_size=event_min_size,
    )
185
+
186
+
187
def identify_time_partitions(start_times, end_times, freq: str) -> list[dict]:
    """Return the time blocks of a given frequency that are covered by at least one file.

    Candidate blocks are generated with ``generate_time_blocks`` between the earliest
    file start time and the latest file end time. Only the blocks overlapping
    the time interval of at least one file are kept.

    'start_times' and 'end_times' can be derived using get_start_end_time_from_filepaths.

    Parameters
    ----------
    start_times : numpy.ndarray of datetime64[s]
        Array of inclusive start times for each file.
    end_times : numpy.ndarray of datetime64[s]
        Array of inclusive end times for each file.
    freq : {'none', 'hour', 'day', 'month', 'quarter', 'season', 'year'}
        Frequency determining the granularity of candidate blocks.
        See `generate_time_blocks` for more details.

    Returns
    -------
    list of dict
        A list of dictionaries, each containing:

        - `start_time` (numpy.datetime64[s])
            Inclusive start of a time block.
        - `end_time` (numpy.datetime64[s])
            Inclusive end of a time block.

        The list is sorted by `start_time` and contains no duplicate blocks.
    """
    # Overall time coverage of the files
    coverage_start = start_times.min()
    coverage_end = end_times.max()

    # Candidate blocks spanning the overall coverage
    candidates = generate_time_blocks(coverage_start, coverage_end, freq=freq)

    # Overlap matrix of shape (n_blocks, n_files):
    # a block overlaps a file if it starts before the file ends and ends after the file starts
    candidate_starts = candidates[:, 0][:, None]
    candidate_ends = candidates[:, 1][:, None]
    overlaps = (candidate_starts <= end_times) & (candidate_ends >= start_times)
    selected = candidates[overlaps.any(axis=1)]

    # Sort by block start and drop duplicated blocks
    selected = selected[np.argsort(selected[:, 0])]
    selected = np.unique(selected, axis=0)

    # One dictionary per block
    return [{"start_time": block_start, "end_time": block_end} for block_start, block_end in selected]
233
+
234
+
235
def define_temporal_partitions(filepaths, strategy, parallel, strategy_options):
    """Define the temporal partitions used to process a set of files.

    Parameters
    ----------
    filepaths : list
        List of files paths to be processed.

    strategy : str
        Which partitioning strategy to apply:

        - ``'time_block'`` defines fixed time intervals (e.g. monthly) covering input files.
        - ``'event'`` detects clusters of precipitation ("events").

    parallel : bool
        If True, parallel data loading is used to identify events.
        Only used by the ``'event'`` strategy.

    strategy_options : dict
        Dictionary with strategy-specific parameters.
        It is unpacked as keyword arguments of the function implementing the strategy.

        If ``strategy == 'time_block'``, the options are forwarded to
        ``identify_time_partitions``. Supported options are:

        - ``freq``: Time unit for blocks. See ``generate_time_blocks`` for the accepted values.

        If ``strategy == 'event'``, the options are forwarded to ``identify_events``.
        Supported options are:

        - ``min_drops`` : int
          Minimum number of drops to consider a timestep.
        - ``neighbor_min_size`` : int
          Minimum number of neighboring timesteps for a timestep to be kept.
        - ``neighbor_time_interval`` : str
          Time window (e.g. "5MIN") defining the neighborhood of a timestep.
        - ``event_max_time_gap`` : str
          Maximum allowed gap (e.g. "6H") within a single event.
        - ``event_min_duration`` : str
          Minimum total duration (e.g. "5MIN") of an event.
        - ``event_min_size`` : int
          Minimum number of timesteps in an event.

    Returns
    -------
    list
        A list of dictionaries, each containing at least:

        - ``start_time`` (numpy.datetime64[s])
            Inclusive start of an event or time block.
        - ``end_time`` (numpy.datetime64[s])
            Inclusive end of an event or time block.

    Raises
    ------
    ValueError
        If ``strategy`` is neither ``'time_block'`` nor ``'event'``.

    Notes
    -----
    - The ``'event'`` strategy opens the files to identify the events.
    - The ``'time_block'`` strategy only relies on the start and end time
      derived from the file paths.
    """
    # Event-based partitioning requires reading the data
    if strategy == "event":
        return identify_events(filepaths, parallel=parallel, **strategy_options)

    # Time-block partitioning only needs the time coverage of each file
    if strategy == "time_block":
        start_times, end_times = get_start_end_time_from_filepaths(filepaths)
        return identify_time_partitions(start_times=start_times, end_times=end_times, **strategy_options)

    raise ValueError(f"Unknown strategy: {strategy!r}. Must be 'time_block' or 'event'.")
302
+
303
+
304
+ ####----------------------------------------------------------------------------
305
+ #### Filepaths partitioning
306
+
307
+
308
+ def _map_files_to_blocks(files_start_time, files_end_time, filepaths, block_starts, block_ends):
309
+ """Map each block start_time to list of overlapping filepaths."""
310
+ # Use broadcasting to create a boolean matrix indicating which files cover which time block
311
+ # Broadcasting: (n_files, n_blocks)
312
+ mask = (files_start_time[:, None] <= block_ends[None, :]) & (files_end_time[:, None] >= block_starts[None, :])
313
+ # Create a list with the a dictionary for each block
314
+ filepaths = np.array(filepaths)
315
+ results = []
316
+ for i, (start, end) in enumerate(zip(block_starts, block_ends)):
317
+ indices = np.where(mask[:, i])[0]
318
+ if indices.size > 0:
319
+ results.append(
320
+ {
321
+ "start_time": start.astype(datetime.datetime),
322
+ "end_time": end.astype(datetime.datetime),
323
+ "filepaths": filepaths[indices].tolist(),
324
+ },
325
+ )
326
+ return results
327
+
328
+
329
def get_files_partitions(list_partitions, filepaths, sample_interval, accumulation_interval, rolling):  # noqa: ARG001
    """
    Provide the list of files required to process each partition.

    For each partition in `list_partitions`, this function identifies the file paths
    from `filepaths` that overlap with the partition period. When `accumulation_interval`
    differs from `sample_interval`, the partition end time is extended forward by
    `accumulation_interval` before searching for overlapping files.

    Parameters
    ----------
    list_partitions : list of dict
        List of partitions (events or time blocks), where each partition is a dictionary
        containing at least 'start_time' and 'end_time' keys with `numpy.datetime64` values.
    filepaths : list of str
        List of file paths corresponding to data files.
    sample_interval : numpy.timedelta64 or int
        The sample interval of the input dataset.
        It is converted with ``ensure_timedelta_seconds``.
    accumulation_interval : numpy.timedelta64 or int
        Time interval used to extend the partition end time.
        It is converted with ``ensure_timedelta_seconds``.
    rolling : bool
        Currently unused by this function.

    Returns
    -------
    list of dict
        A list where each element is a dictionary containing:
        - 'start_time': Start time of the partition (`datetime.datetime`).
        - 'end_time': End time of the partition, including the optional offset (`datetime.datetime`).
        - 'filepaths': List of file paths overlapping with the partition period.
        Partitions without overlapping files are not included.
        An empty list is returned if `filepaths` or `list_partitions` is empty.

    """
    # Nothing to map if there are no files or no partitions
    if len(filepaths) == 0 or len(list_partitions) == 0:
        return []

    # Ensure accumulation_interval and sample_interval are timedelta objects
    accumulation_interval = ensure_timedelta_seconds(accumulation_interval)
    sample_interval = ensure_timedelta_seconds(sample_interval)

    # Define the offset to add to the partitions end time
    # - No offset is applied when no temporal resampling is required
    if sample_interval != accumulation_interval:
        offset = accumulation_interval
    else:
        offset = ensure_timedelta_seconds(0)

    # Retrieve the time coverage of each file
    files_start_time, files_end_time = get_start_end_time_from_filepaths(filepaths)

    # Retrieve the partitions start and end time arrays
    block_starts = np.array([partition["start_time"] for partition in list_partitions]).astype("M8[s]")
    block_ends = np.array([partition["end_time"] for partition in list_partitions]).astype("M8[s]")

    # Add optional offset for resampling
    # TODO: expanding partition time should be done only at L1 stage when resampling
    # In disdrodb, the time reported is time at the start of the accumulation period !
    # If sensors report time at the end of measurement interval, we might being reporting time
    # with an inaccuracy equals to the sensor measurement interval.
    # We could correct for that at L0C stage already !
    block_ends = block_ends + offset

    # Map filepaths to corresponding partitions
    return _map_files_to_blocks(files_start_time, files_end_time, filepaths, block_starts, block_ends)
390
+
391
+
392
def get_files_per_time_block(filepaths, freq="day", tolerance_seconds=120):
    """
    Organize files by the time blocks they cover based on their start and end times.

    Parameters
    ----------
    filepaths : list of str
        List of file paths to be processed.
    freq : str, optional
        Frequency of the time blocks. The default is "day".
        See ``generate_time_blocks`` for the accepted values.
    tolerance_seconds : int, optional
        Number of seconds by which the time coverage of each file is extended
        (before the start time and after the end time). The default is 120.

    Returns
    -------
    list of dict
        A list with one dictionary per time block covered by at least one file.
        Each dictionary contains the keys 'start_time', 'end_time' and 'filepaths'
        (the list of files overlapping the time block).
        An empty list is returned if `filepaths` is empty.

    Notes
    -----
    The tolerance accounts for imprecise time logging by the sensors.
    """
    # Without files there are no time blocks
    if len(filepaths) == 0:
        return []

    # Retrieve the time coverage of each file
    files_start_time, files_end_time = get_start_end_time_from_filepaths(filepaths)

    # Extend the files time coverage to account for imprecise time logging by the sensors
    # - Example: timestep 23:59:30 might be 00.00 and goes into the next day file ...
    tolerance = np.array(tolerance_seconds, dtype="m8[s]")
    files_start_time = files_start_time - tolerance
    files_end_time = files_end_time + tolerance

    # Identify the time blocks covered by the files
    list_partitions = identify_time_partitions(
        start_times=files_start_time,
        end_times=files_end_time,
        freq=freq,
    )
    block_starts = np.array([block["start_time"] for block in list_partitions]).astype("M8[s]")
    block_ends = np.array([block["end_time"] for block in list_partitions]).astype("M8[s]")

    # Map filepaths to corresponding time blocks
    return _map_files_to_blocks(files_start_time, files_end_time, filepaths, block_starts, block_ends)
disdrodb/utils/cli.py CHANGED
@@ -21,7 +21,7 @@
21
21
  import click
22
22
 
23
23
 
24
- def _execute_cmd(cmd, raise_error=False):
24
+ def execute_cmd(cmd, raise_error=False):
25
25
  """Execute command in the terminal, streaming output in python console."""
26
26
  from subprocess import PIPE, CalledProcessError, Popen
27
27
 
@@ -34,7 +34,7 @@ def _execute_cmd(cmd, raise_error=False):
34
34
  raise CalledProcessError(p.returncode, p.args)
35
35
 
36
36
 
37
- def _parse_empty_string_and_none(args):
37
+ def parse_empty_string_and_none(args):
38
38
  """Utility to parse argument passed from the command line.
39
39
 
40
40
  If ``args = ''``, returns None.
@@ -58,7 +58,7 @@ def parse_arg_to_list(args):
58
58
  If ``args = 'variable1 variable2'`` returns ``[variable1, variable2]``.
59
59
  """
60
60
  # If '' or 'None' --> Set to None
61
- args = _parse_empty_string_and_none(args)
61
+ args = parse_empty_string_and_none(args)
62
62
  # - If multiple arguments, split by space
63
63
  if isinstance(args, str):
64
64
  # - Split by space
@@ -75,7 +75,7 @@ def parse_archive_dir(archive_dir: str):
75
75
  If ``archive_dir = ''`` returns ``None``.
76
76
  """
77
77
  # If '', set to 'None'
78
- return _parse_empty_string_and_none(archive_dir)
78
+ return parse_empty_string_and_none(archive_dir)
79
79
 
80
80
 
81
81
  def click_station_arguments(function: object):
@@ -86,7 +86,7 @@ def click_station_arguments(function: object):
86
86
  function : object
87
87
  Function.
88
88
  """
89
- function = click.argument("station_name", metavar="<station>")(function)
89
+ function = click.argument("station_name", metavar="<STATION_NAME>")(function)
90
90
  function = click.argument("campaign_name", metavar="<CAMPAIGN_NAME>")(function)
91
91
  function = click.argument("data_source", metavar="<DATA_SOURCE>")(function)
92
92
  return function
disdrodb/utils/dask.py CHANGED
@@ -90,7 +90,7 @@ def initialize_dask_cluster(minimum_memory=None):
90
90
  n_workers=num_workers,
91
91
  threads_per_worker=1,
92
92
  processes=True,
93
- # memory_limit='8GB',
93
+ memory_limit=0, # this avoid flexible dask memory management
94
94
  silence_logs=logging.ERROR,
95
95
  )
96
96
  client = Client(cluster)
@@ -111,3 +111,64 @@ def close_dask_cluster(cluster, client):
111
111
  finally:
112
112
  # Restore the original log level
113
113
  logger.setLevel(original_level)
114
+
115
+
116
def execute_tasks_safely(list_tasks, parallel: bool, logs_dir: str):
    """
    Execute Dask tasks and skip failed ones.

    Parameters
    ----------
    list_tasks : list
        List of dask delayed objects (if ``parallel=True``) or
        already computed results (if ``parallel=False``).
    parallel : bool
        Whether to execute in parallel with Dask or not.
        If False, ``list_tasks`` is returned as it is.
    logs_dir : str
        Directory where to store FAILED_DASK_TASKS.log.
        The directory is created if it does not exist.

    Returns
    -------
    list_logs : list
        List of task results. Results of failed tasks are skipped.
        If at least one task failed, the path to FAILED_DASK_TASKS.log
        is appended to the list.

    Raises
    ------
    ValueError
        If ``parallel=True`` and no Dask Distributed Client is available.
    """
    # Ensure logs_dir exists
    os.makedirs(logs_dir, exist_ok=True)

    # Non-parallel mode: the tasks have already been executed, just return the results
    if not parallel:
        return list_tasks

    # Import dask.distributed only when it is actually required (parallel mode)
    from dask.distributed import get_client

    # Ensure we have a Dask client
    try:
        client = get_client()
    except ValueError as err:
        # Keep the original exception as cause to ease debugging
        raise ValueError("No Dask Distributed Client found.") from err

    # Compute tasks (all concurrently)
    # - Runs tasks == num_workers * threads_per_worker (which is 1 for DISDRODB)
    # - If errors occurs in some, skip it
    futures = client.compute(list_tasks)
    results = client.gather(futures, errors="skip")

    # Collect failed futures
    failed_futures = [future for future in futures if future.status != "finished"]

    # If no tasks failed, return results
    if not failed_futures:
        return results

    # Otherwise define log file listing failed tasks
    failed_log_path = os.path.join(logs_dir, "FAILED_DASK_TASKS.log")
    with open(failed_log_path, "w", encoding="utf-8") as log_file:
        for future in failed_futures:
            err = future.exception()
            log_file.write(f"ERROR - DASK TASK FAILURE - Task {future.key} failed: {err}\n")

    # Append to list of log filepaths (results) the dask failing log
    results.append(failed_log_path)
    return results
@@ -19,10 +19,34 @@
19
19
  """DISDRODB decorators."""
20
20
  import functools
21
21
  import importlib
22
+ import uuid
22
23
 
23
24
  import dask
24
25
 
25
26
 
27
+ def create_dask_task_name(function_name: str, name=None) -> str | None:
28
+ """
29
+ Create a custom dask task name.
30
+
31
+ Parameters
32
+ ----------
33
+ function_name : str
34
+ Name of the function being delayed.
35
+ name : str, optional
36
+ Custom name for the task (e.g., filepath or ID).
37
+ If None, returns None so that Dask generates is own default name.
38
+
39
+ Returns
40
+ -------
41
+ str | None
42
+ Custom dask task name string if `name` is given,
43
+ otherwise None (use Dask's default naming).
44
+ """
45
+ if name is None:
46
+ return None
47
+ return f"{function_name}.{name}-{uuid.uuid4()}"
48
+
49
+
26
50
  def delayed_if_parallel(function):
27
51
  """Decorator to make the function delayed if its ``parallel`` argument is ``True``."""
28
52
 
@@ -34,6 +58,13 @@ def delayed_if_parallel(function):
34
58
  if parallel:
35
59
  # Enforce verbose to be False
36
60
  kwargs["verbose"] = False
61
+ # Define custom dask task name
62
+ if "logs_filename" in kwargs:
63
+ kwargs["dask_key_name"] = create_dask_task_name(
64
+ function_name=function.__name__,
65
+ name=kwargs["logs_filename"],
66
+ )
67
+
37
68
  # Define the delayed task
38
69
  result = dask.delayed(function)(*args, **kwargs)
39
70
  else:
@@ -50,6 +50,9 @@ def set_encodings(ds: xr.Dataset, encodings_dict: dict) -> xr.Dataset:
50
50
  xarray.Dataset
51
51
  Output xarray dataset.
52
52
  """
53
+ # TODO: CHANGE CHUNKSIZES SPECIFICATION USING {<DIM>: <CHUNKSIZE>} INSTEAD OF LIST
54
+ # --> Then unwrap to list of chunksizes here
55
+
53
56
  # Subset encoding dictionary
54
57
  # - Here below encodings_dict contains only keys (variables) within the dataset
55
58
  encodings_dict = {var: encodings_dict[var] for var in ds.data_vars if var in encodings_dict}
@@ -119,11 +122,12 @@ def rechunk_dataset(ds: xr.Dataset, encodings_dict: dict) -> xr.Dataset:
119
122
  """
120
123
  for var in ds.data_vars:
121
124
  if var in encodings_dict:
122
- chunks = encodings_dict[var].pop("chunksizes", None)
125
+ chunks = encodings_dict[var].get("chunksizes", None) # .pop("chunksizes", None)
123
126
  if chunks is not None:
124
127
  dims = list(ds[var].dims)
125
128
  chunks_dict = dict(zip(dims, chunks))
126
129
  ds[var] = ds[var].chunk(chunks_dict)
130
+ ds[var].encoding["chunksizes"] = chunks
127
131
  return ds
128
132
 
129
133