disdrodb 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- disdrodb/__init__.py +4 -0
- disdrodb/_version.py +2 -2
- disdrodb/api/checks.py +70 -47
- disdrodb/api/configs.py +0 -2
- disdrodb/api/create_directories.py +0 -2
- disdrodb/api/info.py +3 -3
- disdrodb/api/io.py +48 -8
- disdrodb/api/path.py +116 -133
- disdrodb/api/search.py +12 -3
- disdrodb/cli/disdrodb_create_summary.py +113 -0
- disdrodb/cli/disdrodb_create_summary_station.py +11 -1
- disdrodb/cli/disdrodb_run_l0a_station.py +1 -1
- disdrodb/cli/disdrodb_run_l0b_station.py +2 -2
- disdrodb/cli/disdrodb_run_l0c_station.py +2 -2
- disdrodb/cli/disdrodb_run_l1_station.py +2 -2
- disdrodb/cli/disdrodb_run_l2e_station.py +2 -2
- disdrodb/cli/disdrodb_run_l2m_station.py +2 -2
- disdrodb/constants.py +1 -1
- disdrodb/data_transfer/download_data.py +123 -7
- disdrodb/etc/products/L1/global.yaml +1 -1
- disdrodb/etc/products/L2E/5MIN.yaml +1 -0
- disdrodb/etc/products/L2E/global.yaml +1 -1
- disdrodb/etc/products/L2M/GAMMA_GS_ND_MAE.yaml +6 -0
- disdrodb/etc/products/L2M/GAMMA_ML.yaml +1 -1
- disdrodb/etc/products/L2M/LOGNORMAL_GS_LOG_ND_MAE.yaml +6 -0
- disdrodb/etc/products/L2M/LOGNORMAL_GS_ND_MAE.yaml +6 -0
- disdrodb/etc/products/L2M/LOGNORMAL_ML.yaml +8 -0
- disdrodb/etc/products/L2M/global.yaml +11 -3
- disdrodb/issue/writer.py +2 -0
- disdrodb/l0/check_configs.py +49 -16
- disdrodb/l0/configs/LPM/l0a_encodings.yml +2 -2
- disdrodb/l0/configs/LPM/l0b_cf_attrs.yml +2 -2
- disdrodb/l0/configs/LPM/l0b_encodings.yml +2 -2
- disdrodb/l0/configs/LPM/raw_data_format.yml +2 -2
- disdrodb/l0/configs/PWS100/l0b_encodings.yml +1 -0
- disdrodb/l0/configs/SWS250/bins_diameter.yml +108 -0
- disdrodb/l0/configs/SWS250/bins_velocity.yml +83 -0
- disdrodb/l0/configs/SWS250/l0a_encodings.yml +18 -0
- disdrodb/l0/configs/SWS250/l0b_cf_attrs.yml +72 -0
- disdrodb/l0/configs/SWS250/l0b_encodings.yml +155 -0
- disdrodb/l0/configs/SWS250/raw_data_format.yml +148 -0
- disdrodb/l0/l0a_processing.py +10 -5
- disdrodb/l0/l0b_nc_processing.py +10 -6
- disdrodb/l0/l0b_processing.py +92 -72
- disdrodb/l0/l0c_processing.py +369 -251
- disdrodb/l0/readers/LPM/ARM/ARM_LPM.py +8 -1
- disdrodb/l0/readers/LPM/AUSTRALIA/MELBOURNE_2007_LPM.py +2 -2
- disdrodb/l0/readers/LPM/BELGIUM/ULIEGE.py +256 -0
- disdrodb/l0/readers/LPM/BRAZIL/CHUVA_LPM.py +2 -2
- disdrodb/l0/readers/LPM/BRAZIL/GOAMAZON_LPM.py +2 -2
- disdrodb/l0/readers/LPM/GERMANY/DWD.py +491 -0
- disdrodb/l0/readers/LPM/ITALY/GID_LPM.py +2 -2
- disdrodb/l0/readers/LPM/ITALY/GID_LPM_W.py +2 -2
- disdrodb/l0/readers/LPM/KIT/CHWALA.py +2 -2
- disdrodb/l0/readers/LPM/SLOVENIA/ARSO.py +107 -12
- disdrodb/l0/readers/LPM/SLOVENIA/UL.py +3 -3
- disdrodb/l0/readers/LPM/SWITZERLAND/INNERERIZ_LPM.py +2 -2
- disdrodb/l0/readers/PARSIVEL/NCAR/VORTEX2_2010.py +5 -14
- disdrodb/l0/readers/PARSIVEL/NCAR/VORTEX2_2010_UF.py +5 -14
- disdrodb/l0/readers/PARSIVEL/SLOVENIA/UL.py +117 -8
- disdrodb/l0/readers/PARSIVEL2/ARM/ARM_PARSIVEL2.py +4 -0
- disdrodb/l0/readers/PARSIVEL2/BRAZIL/CHUVA_PARSIVEL2.py +10 -14
- disdrodb/l0/readers/PARSIVEL2/BRAZIL/GOAMAZON_PARSIVEL2.py +10 -14
- disdrodb/l0/readers/PARSIVEL2/CANADA/UQAM_NC.py +69 -0
- disdrodb/l0/readers/PARSIVEL2/DENMARK/DTU.py +8 -14
- disdrodb/l0/readers/PARSIVEL2/DENMARK/EROSION_raw.py +382 -0
- disdrodb/l0/readers/PARSIVEL2/FINLAND/FMI_PARSIVEL2.py +4 -0
- disdrodb/l0/readers/PARSIVEL2/FRANCE/OSUG.py +1 -1
- disdrodb/l0/readers/PARSIVEL2/GREECE/NOA.py +127 -0
- disdrodb/l0/readers/PARSIVEL2/ITALY/HYDROX.py +239 -0
- disdrodb/l0/readers/PARSIVEL2/MPI/BCO_PARSIVEL2.py +136 -0
- disdrodb/l0/readers/PARSIVEL2/MPI/BOWTIE.py +220 -0
- disdrodb/l0/readers/PARSIVEL2/NASA/LPVEX.py +109 -0
- disdrodb/l0/readers/PARSIVEL2/NCAR/FARM_PARSIVEL2.py +5 -11
- disdrodb/l0/readers/PARSIVEL2/NCAR/PERILS_MIPS.py +4 -17
- disdrodb/l0/readers/PARSIVEL2/NCAR/RELAMPAGO_PARSIVEL2.py +5 -14
- disdrodb/l0/readers/PARSIVEL2/NCAR/SNOWIE_PJ.py +10 -13
- disdrodb/l0/readers/PARSIVEL2/NCAR/SNOWIE_SB.py +10 -13
- disdrodb/l0/readers/PARSIVEL2/NETHERLANDS/DELFT_NC.py +3 -0
- disdrodb/l0/readers/PARSIVEL2/PHILIPPINES/PANGASA.py +232 -0
- disdrodb/l0/readers/PARSIVEL2/SPAIN/CENER.py +6 -18
- disdrodb/l0/readers/PARSIVEL2/SPAIN/GRANADA.py +120 -0
- disdrodb/l0/readers/PARSIVEL2/USA/C3WE.py +7 -25
- disdrodb/l0/readers/PWS100/AUSTRIA/HOAL.py +321 -0
- disdrodb/l0/readers/SW250/BELGIUM/KMI.py +239 -0
- disdrodb/l1/beard_model.py +31 -129
- disdrodb/l1/fall_velocity.py +156 -57
- disdrodb/l1/filters.py +25 -28
- disdrodb/l1/processing.py +12 -14
- disdrodb/l1_env/routines.py +46 -17
- disdrodb/l2/empirical_dsd.py +6 -0
- disdrodb/l2/processing.py +3 -3
- disdrodb/metadata/checks.py +132 -125
- disdrodb/metadata/geolocation.py +0 -2
- disdrodb/psd/fitting.py +180 -210
- disdrodb/psd/models.py +1 -1
- disdrodb/routines/__init__.py +54 -0
- disdrodb/{l0/routines.py → routines/l0.py} +288 -418
- disdrodb/{l1/routines.py → routines/l1.py} +60 -92
- disdrodb/{l2/routines.py → routines/l2.py} +284 -485
- disdrodb/{routines.py → routines/wrappers.py} +100 -7
- disdrodb/scattering/axis_ratio.py +95 -85
- disdrodb/scattering/permittivity.py +24 -0
- disdrodb/scattering/routines.py +56 -36
- disdrodb/summary/routines.py +147 -45
- disdrodb/utils/archiving.py +434 -0
- disdrodb/utils/attrs.py +2 -0
- disdrodb/utils/cli.py +5 -5
- disdrodb/utils/dask.py +62 -1
- disdrodb/utils/decorators.py +31 -0
- disdrodb/utils/encoding.py +10 -1
- disdrodb/{l2 → utils}/event.py +1 -66
- disdrodb/utils/logger.py +1 -1
- disdrodb/utils/manipulations.py +22 -12
- disdrodb/utils/routines.py +166 -0
- disdrodb/utils/time.py +5 -293
- disdrodb/utils/xarray.py +3 -0
- disdrodb/viz/plots.py +109 -15
- {disdrodb-0.1.3.dist-info → disdrodb-0.1.5.dist-info}/METADATA +3 -2
- {disdrodb-0.1.3.dist-info → disdrodb-0.1.5.dist-info}/RECORD +124 -96
- {disdrodb-0.1.3.dist-info → disdrodb-0.1.5.dist-info}/entry_points.txt +1 -0
- {disdrodb-0.1.3.dist-info → disdrodb-0.1.5.dist-info}/WHEEL +0 -0
- {disdrodb-0.1.3.dist-info → disdrodb-0.1.5.dist-info}/licenses/LICENSE +0 -0
- {disdrodb-0.1.3.dist-info → disdrodb-0.1.5.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,434 @@
|
|
|
1
|
+
# -----------------------------------------------------------------------------.
|
|
2
|
+
# Copyright (c) 2021-2023 DISDRODB developers
|
|
3
|
+
#
|
|
4
|
+
# This program is free software: you can redistribute it and/or modify
|
|
5
|
+
# it under the terms of the GNU General Public License as published by
|
|
6
|
+
# the Free Software Foundation, either version 3 of the License, or
|
|
7
|
+
# (at your option) any later version.
|
|
8
|
+
#
|
|
9
|
+
# This program is distributed in the hope that it will be useful,
|
|
10
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
11
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
12
|
+
# GNU General Public License for more details.
|
|
13
|
+
#
|
|
14
|
+
# You should have received a copy of the GNU General Public License
|
|
15
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
16
|
+
# -----------------------------------------------------------------------------.
|
|
17
|
+
"""Utility function for DISDRODB product archiving."""
|
|
18
|
+
import datetime
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
import pandas as pd
|
|
22
|
+
|
|
23
|
+
from disdrodb.api.info import get_start_end_time_from_filepaths
|
|
24
|
+
from disdrodb.api.io import open_netcdf_files
|
|
25
|
+
from disdrodb.utils.event import group_timesteps_into_event
|
|
26
|
+
from disdrodb.utils.time import (
|
|
27
|
+
ensure_sorted_by_time,
|
|
28
|
+
ensure_timedelta_seconds,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
####---------------------------------------------------------------------------------
|
|
32
|
+
#### Time blocks
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def check_freq(freq: str) -> str:
    """Check the validity of the ``freq`` argument and return it unchanged.

    Parameters
    ----------
    freq : str
        Time block frequency. Must be one of ``'none'``, ``'year'``, ``'season'``,
        ``'quarter'``, ``'month'``, ``'day'`` or ``'hour'``.

    Returns
    -------
    str
        The validated ``freq`` value.

    Raises
    ------
    TypeError
        If ``freq`` is not a string.
    ValueError
        If ``freq`` is not one of the accepted values.
    """
    valid_freq = ["none", "year", "season", "quarter", "month", "day", "hour"]
    if not isinstance(freq, str):
        raise TypeError("'freq' must be a string.")
    if freq not in valid_freq:
        raise ValueError(
            f"'freq' '{freq}' is not possible. Must be one of: {valid_freq}.",
        )
    return freq
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def generate_time_blocks(
    start_time: np.datetime64,
    end_time: np.datetime64,
    freq: str,
    inclusive_end_time: bool = True,
) -> np.ndarray:
    """Generate time blocks between `start_time` and `end_time` for a given frequency.

    Parameters
    ----------
    start_time : numpy.datetime64
        Inclusive start of the overall time range.
    end_time : numpy.datetime64
        End of the overall time range. Inclusive by default (see inclusive_end_time argument).
    freq : str
        Frequency specifier. Accepted values are:

        - 'none'    : return a single block [start_time, end_time]
        - 'hour'    : split into hourly blocks
        - 'day'     : split into daily blocks
        - 'month'   : split into calendar months
        - 'quarter' : split into calendar quarters
        - 'year'    : split into calendar years
        - 'season'  : split into meteorological seasons (MAM, JJA, SON, DJF)
    inclusive_end_time : bool
        The default is True.
        If False and the last block starts exactly at `end_time`, that block is removed
        (i.e. `end_time` is treated as an exclusive bound).
        This option has no effect when ``freq='none'``.

    Returns
    -------
    numpy.ndarray
        Array of shape (n, 2) with dtype datetime64[s], where each row is [block_start, block_end].
        If no period lies between `start_time` and `end_time`, an empty array of shape (0, 2)
        is returned.

    Raises
    ------
    TypeError
        If `freq` is not a string.
    ValueError
        If `freq` is not one of the accepted values.
    """
    freq = check_freq(freq)
    if freq == "none":
        return np.array([[start_time, end_time]], dtype="datetime64[s]")

    # Mapping from our custom freq to pandas period frequency codes
    # - "D" is used instead of "d" because lowercase day aliases are deprecated in recent pandas
    freq_map = {
        "hour": "h",
        "day": "D",
        "month": "M",
        "quarter": "Q",
        "year": "Y",
        "season": "Q-FEB",  # seasons DJF, MAM, JJA, SON
    }

    # Define periods
    periods = pd.period_range(start=start_time, end=end_time, freq=freq_map[freq])

    # Create time blocks
    # - A period end_time has sub-second precision (e.g. 23:59:59.999999999),
    #   so it is explicitly floored to the second before changing resolution
    blocks = [
        [
            period.start_time.to_datetime64().astype("datetime64[s]"),
            period.end_time.floor("s").to_datetime64().astype("datetime64[s]"),
        ]
        for period in periods
    ]

    # Keep the documented (n, 2) shape also when there are no periods
    if not blocks:
        return np.empty((0, 2), dtype="datetime64[s]")
    blocks = np.array(blocks, dtype="datetime64[s]")

    # With an exclusive end_time, discard the block that only starts at end_time
    if not inclusive_end_time and blocks[-1, 0] == end_time:
        blocks = blocks[:-1]
    return blocks
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
####----------------------------------------------------------------------------
|
|
113
|
+
#### Event/Time partitioning
|
|
114
|
+
def identify_events(
    filepaths,
    parallel=False,
    min_drops=5,
    neighbor_min_size=2,
    neighbor_time_interval="5MIN",
    event_max_time_gap="6H",
    event_min_duration="5MIN",
    event_min_size=3,
):
    """Return a list of rainy events.

    Rainy timesteps are the timesteps where N > min_drops.
    Isolated rainy timesteps (based on the neighborhood criteria) are removed.
    Then, consecutive rainy timesteps are grouped into the same event if the time gap
    between them does not exceed `event_max_time_gap`.
    Finally, events that do not meet the minimum size or duration requirements are filtered out.

    Parameters
    ----------
    filepaths : list
        List of L1C file paths.
    parallel : bool
        Whether to load the files in parallel.
        Set parallel=True only in a multiprocessing environment.
        The default is False.
    min_drops : int, optional
        Drop count threshold. Only timesteps with N strictly larger than `min_drops`
        are candidates for events. Defaults to 5.
    neighbor_min_size : int, optional
        The minimum number of neighboring timesteps required within `neighbor_time_interval` for a
        timestep to be considered non-isolated. Isolated timesteps are removed !

        - If `neighbor_min_size=0`, then no timestep is considered isolated and no filtering occurs.
        - If `neighbor_min_size=1`, the timestep must have at least one neighbor within `neighbor_time_interval`.
        - If `neighbor_min_size=2`, the timestep must have at least two timesteps within `neighbor_time_interval`.

        Defaults to 2.
    neighbor_time_interval : str
        The time interval around a given timestep defining the neighborhood.
        Only timesteps that fall within this time interval before or after a timestep are considered neighbors.
        Defaults to "5MIN".
    event_max_time_gap : str
        The maximum time interval between two timesteps to be considered part of the same event.
        This parameter is used to group timesteps into events ! Defaults to "6H".
    event_min_duration : str
        The minimum duration an event must span. Events shorter than this duration are discarded.
        Defaults to "5MIN".
    event_min_size : int, optional
        The minimum number of valid timesteps required for an event. Defaults to 3.

    Returns
    -------
    list of dict
        The output of ``group_timesteps_into_event``: a list of events, where each event
        is represented as a dictionary with keys:

        - "start_time": np.datetime64, start time of the event
        - "end_time": np.datetime64, end time of the event
        - "duration": np.timedelta64, duration of the event
        - "n_timesteps": int, number of valid timesteps in the event
    """
    # Load in memory only the variables required to detect the events
    ds = open_netcdf_files(filepaths, variables=["time", "N"], parallel=parallel, compute=True)
    ds = ensure_sorted_by_time(ds)

    # Retain the timesteps with enough drops
    drop_counts = ds["N"].to_numpy()
    all_timesteps = ds["time"].to_numpy()
    rainy_timesteps = all_timesteps[drop_counts > min_drops]

    # Group the retained timesteps into events
    return group_timesteps_into_event(
        timesteps=rainy_timesteps,
        neighbor_min_size=neighbor_min_size,
        neighbor_time_interval=neighbor_time_interval,
        event_max_time_gap=event_max_time_gap,
        event_min_duration=event_min_duration,
        event_min_size=event_min_size,
    )
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def identify_time_partitions(start_times, end_times, freq: str) -> list[dict]:
    """Identify the set of time blocks covered by files.

    Candidate blocks are generated between the earliest file start time and the
    latest file end time, and only the blocks overlapping at least one file are kept.
    `start_times` and `end_times` can be derived using get_start_end_time_from_filepaths.

    Parameters
    ----------
    start_times : numpy.ndarray of datetime64[s]
        Array of inclusive start times for each file.
    end_times : numpy.ndarray of datetime64[s]
        Array of inclusive end times for each file.
    freq : {'none', 'hour', 'day', 'month', 'quarter', 'season', 'year'}
        Frequency determining the granularity of candidate blocks.
        See `generate_time_blocks` for more details.

    Returns
    -------
    list of dict
        A list of dictionaries, each containing:

        - `start_time` (numpy.datetime64[s])
            Inclusive start of a time block.
        - `end_time` (numpy.datetime64[s])
            Inclusive end of a time block.

        The list is sorted by `start_time` and contains no duplicate blocks.
    """
    # Overall period covered by the files
    coverage_start = start_times.min()
    coverage_end = end_times.max()

    # Candidate blocks spanning the whole coverage period
    candidates = generate_time_blocks(coverage_start, coverage_end, freq=freq)
    candidate_starts = candidates[:, 0][:, None]
    candidate_ends = candidates[:, 1][:, None]

    # Boolean matrix (n_blocks, n_files): True where a block and a file overlap
    overlaps = (candidate_starts <= end_times) & (candidate_ends >= start_times)
    selected = candidates[overlaps.any(axis=1)]

    # Sort by block start and drop duplicated blocks
    selected = selected[np.argsort(selected[:, 0])]
    selected = np.unique(selected, axis=0)

    # One dictionary per retained block
    return [{"start_time": block[0], "end_time": block[1]} for block in selected]
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def define_temporal_partitions(filepaths, strategy, parallel, strategy_options):
    """Define temporal file processing partitions.

    Parameters
    ----------
    filepaths : list
        List of files paths to be processed.

    strategy : str
        Which partitioning strategy to apply:

        - ``'time_block'`` defines fixed time intervals (e.g. monthly) covering input files.
        - ``'event'`` detect clusters of precipitation ("events").

    parallel : bool
        If True, parallel data loading is used to identify events.
        It is only used by the ``'event'`` strategy.

    strategy_options : dict
        Dictionary with strategy-specific parameters.
        The dictionary is unpacked as keyword arguments of the strategy function.

        If ``strategy == 'time_block'``, supported options are:

        - ``freq``: Time unit for blocks.
          One of {'none', 'year', 'season', 'quarter', 'month', 'day', 'hour'}.

        See identify_time_partitions for more information.

        If ``strategy == 'event'``, supported options are:

        - ``min_drops`` : int
          Minimum number of drops to consider a timestep.
        - ``neighbor_min_size`` : int
          Minimum number of neighboring timesteps for a timestep not to be discarded as isolated.
        - ``neighbor_time_interval`` : str
          Time window (e.g. "5MIN") defining the neighborhood of a timestep.
        - ``event_max_time_gap`` : str
          Maximum allowed gap (e.g. "6H") within a single event.
        - ``event_min_duration`` : str
          Minimum total duration (e.g. "5MIN") of an event.
        - ``event_min_size`` : int
          Minimum number of records in an event.

        See identify_events for more information.

    Returns
    -------
    list
        A list of dictionaries, each containing:

        - ``start_time`` (numpy.datetime64[s])
            Inclusive start of an event or time block.
        - ``end_time`` (numpy.datetime64[s])
            Inclusive end of an event or time block.

    Raises
    ------
    ValueError
        If ``strategy`` is neither ``'time_block'`` nor ``'event'``.

    Notes
    -----
    - The ``'event'`` strategy requires loading data into memory to identify clusters.
    - The ``'time_block'`` strategy only uses the start and end times derived from the file paths.
    - The ``'event'`` strategy implicitly performs data selection on which files to process !
    - The ``'time_block'`` strategy does not perform data selection on which files to process !
    """
    # Event-based partitions: the file content is needed to detect the events
    if strategy == "event":
        return identify_events(filepaths, parallel=parallel, **strategy_options)

    # Fixed time blocks: only the files time coverage is needed
    if strategy == "time_block":
        start_times, end_times = get_start_end_time_from_filepaths(filepaths)
        return identify_time_partitions(start_times=start_times, end_times=end_times, **strategy_options)

    raise ValueError(f"Unknown strategy: {strategy!r}. Must be 'time_block' or 'event'.")
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
####----------------------------------------------------------------------------
|
|
305
|
+
#### Filepaths partitioning
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def _map_files_to_blocks(files_start_time, files_end_time, filepaths, block_starts, block_ends):
    """Associate each time block with the files overlapping it.

    A file overlaps a block when it starts no later than the block end and
    ends no earlier than the block start. Blocks without any overlapping file
    are not included in the output.

    Parameters
    ----------
    files_start_time, files_end_time : numpy.ndarray
        Start and end time of each file (one element per file path).
    filepaths : list
        File paths, in the same order as the files time arrays.
    block_starts, block_ends : numpy.ndarray
        Start and end time of each time block.

    Returns
    -------
    list of dict
        One dictionary per block with at least one overlapping file, with keys
        ``start_time`` and ``end_time`` (block bounds converted with
        ``astype(datetime.datetime)``) and ``filepaths`` (list of overlapping file paths).
    """
    # Broadcast to a boolean matrix of shape (n_files, n_blocks)
    starts_before_block_end = files_start_time[:, None] <= block_ends[None, :]
    ends_after_block_start = files_end_time[:, None] >= block_starts[None, :]
    overlap = starts_before_block_end & ends_after_block_start

    # Each row of the transposed matrix flags the files covering one block
    paths = np.array(filepaths)
    return [
        {
            "start_time": block_start.astype(datetime.datetime),
            "end_time": block_end.astype(datetime.datetime),
            "filepaths": paths[covering].tolist(),
        }
        for block_start, block_end, covering in zip(block_starts, block_ends, overlap.T)
        if covering.any()
    ]
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def get_files_partitions(list_partitions, filepaths, sample_interval, accumulation_interval, rolling):  # noqa: ARG001
    """
    Provide information about the required files for each partition.

    For each partition in `list_partitions`, this function identifies the file paths from
    `filepaths` that overlap with the partition period. When `accumulation_interval` differs
    from `sample_interval`, the partition end time is extended forward by `accumulation_interval`.

    Parameters
    ----------
    list_partitions : list of dict
        List of partitions (events or time blocks), where each partition is a dictionary
        containing at least 'start_time' and 'end_time' keys with `numpy.datetime64` values.
    filepaths : list of str
        List of file paths corresponding to data files.
    sample_interval : numpy.timedelta64 or int
        The sample interval of the input dataset.
        It is converted with `ensure_timedelta_seconds`.
    accumulation_interval : numpy.timedelta64 or int
        Time interval used to extend the partition end time.
        It is converted with `ensure_timedelta_seconds`.
    rolling : bool
        Currently unused: the partition end time is extended regardless of its value.

    Returns
    -------
    list of dict
        An empty list if `filepaths` or `list_partitions` is empty.
        Otherwise a list where each element is a dictionary containing:

        - 'start_time': Start time of the partition (`datetime.datetime`).
        - 'end_time': Possibly extended end time of the partition (`datetime.datetime`).
        - 'filepaths': List of file paths overlapping with the partition period.

        Partitions without overlapping files are not included.

    """
    # Nothing to map if there are no files or no partitions
    if len(filepaths) == 0 or len(list_partitions) == 0:
        return []

    # Ensure sample_interval and accumulation_interval is numpy.timedelta64
    accumulation_interval = ensure_timedelta_seconds(accumulation_interval)
    sample_interval = ensure_timedelta_seconds(sample_interval)

    # Define offset on the partition end_time
    # - No offset is applied when no temporal aggregation is required
    if sample_interval != accumulation_interval:
        offset = accumulation_interval
    else:
        offset = ensure_timedelta_seconds(0)

    # Retrieve file start_time and end_time
    files_start_time, files_end_time = get_start_end_time_from_filepaths(filepaths)

    # Retrieve partitions start and end time arrays
    partition_starts = [partition["start_time"] for partition in list_partitions]
    partition_ends = [partition["end_time"] for partition in list_partitions]
    block_starts = np.array(partition_starts).astype("M8[s]")
    block_ends = np.array(partition_ends).astype("M8[s]")

    # Add optional offset for resampling
    # TODO: expanding partition time should be done only at L1 stage when resampling
    # In disdrodb, the time reported is time at the start of the accumulation period !
    # If sensors report time at the end of measurement interval, we might being reporting time
    # with an inaccuracy equals to the sensor measurement interval.
    # We could correct for that at L0C stage already !
    block_ends = block_ends + offset

    # Map filepaths to corresponding time blocks
    return _map_files_to_blocks(files_start_time, files_end_time, filepaths, block_starts, block_ends)
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def get_files_per_time_block(filepaths, freq="day", tolerance_seconds=120):
    """
    Organize files by the time blocks they cover based on their start and end times.

    Parameters
    ----------
    filepaths : list of str
        List of file paths to be processed.
    freq : str, optional
        Frequency of the time blocks. The default is ``"day"``.
        See `identify_time_partitions` for the accepted values.
    tolerance_seconds : int, optional
        Number of seconds subtracted from each file start time and added to each file
        end time before searching for overlapping time blocks. The default is 120.

    Returns
    -------
    list of dict
        An empty list if `filepaths` is empty.
        Otherwise a list with one dictionary per time block covered by at least one file,
        with keys 'start_time', 'end_time' and 'filepaths'.

    Notes
    -----
    The tolerance accounts for imprecise time logging by the sensors.
    """
    # Without files there are no time blocks
    if len(filepaths) == 0:
        return []

    # Retrieve file start_time and end_time
    files_start_time, files_end_time = get_start_end_time_from_filepaths(filepaths)

    # Add tolerance to account for imprecise time logging by the sensors
    # - Example: timestep 23:59:30 might be 00.00 and goes into the next day file ...
    tolerance = np.array(tolerance_seconds, dtype="m8[s]")
    files_start_time = files_start_time - tolerance
    files_end_time = files_end_time + tolerance

    # Identify candidate blocks
    list_partitions = identify_time_partitions(
        start_times=files_start_time,
        end_times=files_end_time,
        freq=freq,
    )
    partition_starts = [partition["start_time"] for partition in list_partitions]
    partition_ends = [partition["end_time"] for partition in list_partitions]
    block_starts = np.array(partition_starts).astype("M8[s]")
    block_ends = np.array(partition_ends).astype("M8[s]")

    # Map filepaths to corresponding time blocks
    return _map_files_to_blocks(files_start_time, files_end_time, filepaths, block_starts, block_ends)
|
disdrodb/utils/attrs.py
CHANGED
|
@@ -95,6 +95,8 @@ def update_disdrodb_attrs(ds, product: str):
|
|
|
95
95
|
# ----------------------------------------------
|
|
96
96
|
# Add time_coverage_start and time_coverage_end
|
|
97
97
|
if "time" in ds.dims:
|
|
98
|
+
ds["time"] = ds["time"].dt.floor("s") # ensure no sub-second values
|
|
99
|
+
ds["time"] = ds["time"].astype("datetime64[s]")
|
|
98
100
|
attrs["time_coverage_start"] = str(ds["time"].data[0])
|
|
99
101
|
attrs["time_coverage_end"] = str(ds["time"].data[-1])
|
|
100
102
|
|
disdrodb/utils/cli.py
CHANGED
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
import click
|
|
22
22
|
|
|
23
23
|
|
|
24
|
-
def
|
|
24
|
+
def execute_cmd(cmd, raise_error=False):
|
|
25
25
|
"""Execute command in the terminal, streaming output in python console."""
|
|
26
26
|
from subprocess import PIPE, CalledProcessError, Popen
|
|
27
27
|
|
|
@@ -34,7 +34,7 @@ def _execute_cmd(cmd, raise_error=False):
|
|
|
34
34
|
raise CalledProcessError(p.returncode, p.args)
|
|
35
35
|
|
|
36
36
|
|
|
37
|
-
def
|
|
37
|
+
def parse_empty_string_and_none(args):
|
|
38
38
|
"""Utility to parse argument passed from the command line.
|
|
39
39
|
|
|
40
40
|
If ``args = ''``, returns None.
|
|
@@ -58,7 +58,7 @@ def parse_arg_to_list(args):
|
|
|
58
58
|
If ``args = 'variable1 variable2'`` returns ``[variable1, variable2]``.
|
|
59
59
|
"""
|
|
60
60
|
# If '' or 'None' --> Set to None
|
|
61
|
-
args =
|
|
61
|
+
args = parse_empty_string_and_none(args)
|
|
62
62
|
# - If multiple arguments, split by space
|
|
63
63
|
if isinstance(args, str):
|
|
64
64
|
# - Split by space
|
|
@@ -75,7 +75,7 @@ def parse_archive_dir(archive_dir: str):
|
|
|
75
75
|
If ``archive_dir = ''`` returns ``None``.
|
|
76
76
|
"""
|
|
77
77
|
# If '', set to 'None'
|
|
78
|
-
return
|
|
78
|
+
return parse_empty_string_and_none(archive_dir)
|
|
79
79
|
|
|
80
80
|
|
|
81
81
|
def click_station_arguments(function: object):
|
|
@@ -86,7 +86,7 @@ def click_station_arguments(function: object):
|
|
|
86
86
|
function : object
|
|
87
87
|
Function.
|
|
88
88
|
"""
|
|
89
|
-
function = click.argument("station_name", metavar="<
|
|
89
|
+
function = click.argument("station_name", metavar="<STATION_NAME>")(function)
|
|
90
90
|
function = click.argument("campaign_name", metavar="<CAMPAIGN_NAME>")(function)
|
|
91
91
|
function = click.argument("data_source", metavar="<DATA_SOURCE>")(function)
|
|
92
92
|
return function
|
disdrodb/utils/dask.py
CHANGED
|
@@ -90,7 +90,7 @@ def initialize_dask_cluster(minimum_memory=None):
|
|
|
90
90
|
n_workers=num_workers,
|
|
91
91
|
threads_per_worker=1,
|
|
92
92
|
processes=True,
|
|
93
|
-
|
|
93
|
+
memory_limit=0, # this avoid flexible dask memory management
|
|
94
94
|
silence_logs=logging.ERROR,
|
|
95
95
|
)
|
|
96
96
|
client = Client(cluster)
|
|
@@ -111,3 +111,64 @@ def close_dask_cluster(cluster, client):
|
|
|
111
111
|
finally:
|
|
112
112
|
# Restore the original log level
|
|
113
113
|
logger.setLevel(original_level)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def execute_tasks_safely(list_tasks, parallel: bool, logs_dir: str):
    """
    Execute Dask tasks and skip failed ones.

    Parameters
    ----------
    list_tasks : list
        List of dask delayed objects or results.
    parallel : bool
        Whether to execute in parallel with Dask or not.
        If False, ``list_tasks`` is returned as is.
    logs_dir : str
        Directory to store FAILED_DASK_TASKS.log.
        The directory is created if it does not exist.

    Returns
    -------
    list_logs : list
        List of task results. Results of failed tasks are skipped.
        If at least one task failed, the path to FAILED_DASK_TASKS.log
        is appended to the list.

    Raises
    ------
    ValueError
        If ``parallel=True`` and no Dask Distributed Client is found.
    """
    # Ensure logs_dir exists
    os.makedirs(logs_dir, exist_ok=True)

    if not parallel:
        # Non-parallel mode: just return results directly
        return list_tasks

    # Imported here because only the parallel mode requires dask.distributed
    from dask.distributed import get_client

    # Define file name where to log failed dask tasks
    failed_log_path = os.path.join(logs_dir, "FAILED_DASK_TASKS.log")

    # Ensure we have a Dask client
    try:
        client = get_client()
    except ValueError as err:
        raise ValueError("No Dask Distributed Client found.") from err

    # Compute tasks (all concurrently)
    # - Runs tasks == num_workers * threads_per_worker (which is 1 for DISDRODB)
    # - If errors occurs in some, skip it
    futures = client.compute(list_tasks)
    results = client.gather(futures, errors="skip")

    # Collect failed futures
    failed_futures = [future for future in futures if future.status != "finished"]  # "error"

    # If no tasks failed, return results
    if not failed_futures:
        return results

    # Otherwise define log file listing failed tasks
    with open(failed_log_path, "w", encoding="utf-8") as log_file:
        log_file.writelines(
            f"ERROR - DASK TASK FAILURE - Task {future.key} failed: {future.exception()}\n"
            for future in failed_futures
        )

    # Append to list of log filepaths (results) the dask failing log
    results.append(failed_log_path)
    return results
|
disdrodb/utils/decorators.py
CHANGED
|
@@ -19,10 +19,34 @@
|
|
|
19
19
|
"""DISDRODB decorators."""
|
|
20
20
|
import functools
|
|
21
21
|
import importlib
|
|
22
|
+
import uuid
|
|
22
23
|
|
|
23
24
|
import dask
|
|
24
25
|
|
|
25
26
|
|
|
27
|
+
def create_dask_task_name(function_name: str, name=None) -> str | None:
|
|
28
|
+
"""
|
|
29
|
+
Create a custom dask task name.
|
|
30
|
+
|
|
31
|
+
Parameters
|
|
32
|
+
----------
|
|
33
|
+
function_name : str
|
|
34
|
+
Name of the function being delayed.
|
|
35
|
+
name : str, optional
|
|
36
|
+
Custom name for the task (e.g., filepath or ID).
|
|
37
|
+
If None, returns None so that Dask generates is own default name.
|
|
38
|
+
|
|
39
|
+
Returns
|
|
40
|
+
-------
|
|
41
|
+
str | None
|
|
42
|
+
Custom dask task name string if `name` is given,
|
|
43
|
+
otherwise None (use Dask's default naming).
|
|
44
|
+
"""
|
|
45
|
+
if name is None:
|
|
46
|
+
return None
|
|
47
|
+
return f"{function_name}.{name}-{uuid.uuid4()}"
|
|
48
|
+
|
|
49
|
+
|
|
26
50
|
def delayed_if_parallel(function):
|
|
27
51
|
"""Decorator to make the function delayed if its ``parallel`` argument is ``True``."""
|
|
28
52
|
|
|
@@ -34,6 +58,13 @@ def delayed_if_parallel(function):
|
|
|
34
58
|
if parallel:
|
|
35
59
|
# Enforce verbose to be False
|
|
36
60
|
kwargs["verbose"] = False
|
|
61
|
+
# Define custom dask task name
|
|
62
|
+
if "logs_filename" in kwargs:
|
|
63
|
+
kwargs["dask_key_name"] = create_dask_task_name(
|
|
64
|
+
function_name=function.__name__,
|
|
65
|
+
name=kwargs["logs_filename"],
|
|
66
|
+
)
|
|
67
|
+
|
|
37
68
|
# Define the delayed task
|
|
38
69
|
result = dask.delayed(function)(*args, **kwargs)
|
|
39
70
|
else:
|
disdrodb/utils/encoding.py
CHANGED
|
@@ -19,6 +19,7 @@
|
|
|
19
19
|
"""DISDRODB netCDF4 encoding utilities."""
|
|
20
20
|
import os
|
|
21
21
|
|
|
22
|
+
import numpy as np
|
|
22
23
|
import xarray as xr
|
|
23
24
|
|
|
24
25
|
from disdrodb.utils.yaml import read_yaml
|
|
@@ -50,6 +51,9 @@ def set_encodings(ds: xr.Dataset, encodings_dict: dict) -> xr.Dataset:
|
|
|
50
51
|
xarray.Dataset
|
|
51
52
|
Output xarray dataset.
|
|
52
53
|
"""
|
|
54
|
+
# TODO: CHANGE CHUNKSIZES SPECIFICATION USING {<DIM>: <CHUNKSIZE>} INSTEAD OF LIST
|
|
55
|
+
# --> Then unwrap to list of chunksizes here
|
|
56
|
+
|
|
53
57
|
# Subset encoding dictionary
|
|
54
58
|
# - Here below encodings_dict contains only keys (variables) within the dataset
|
|
55
59
|
encodings_dict = {var: encodings_dict[var] for var in ds.data_vars if var in encodings_dict}
|
|
@@ -63,6 +67,8 @@ def set_encodings(ds: xr.Dataset, encodings_dict: dict) -> xr.Dataset:
|
|
|
63
67
|
|
|
64
68
|
# Set time encoding
|
|
65
69
|
if "time" in ds:
|
|
70
|
+
ds["time"] = ds["time"].dt.floor("s") # ensure no sub-second values
|
|
71
|
+
ds["time"] = ds["time"].astype("datetime64[s]")
|
|
66
72
|
ds["time"].encoding.update(get_time_encoding())
|
|
67
73
|
|
|
68
74
|
# Set the variable encodings
|
|
@@ -119,11 +125,12 @@ def rechunk_dataset(ds: xr.Dataset, encodings_dict: dict) -> xr.Dataset:
|
|
|
119
125
|
"""
|
|
120
126
|
for var in ds.data_vars:
|
|
121
127
|
if var in encodings_dict:
|
|
122
|
-
chunks = encodings_dict[var].pop("chunksizes", None)
|
|
128
|
+
chunks = encodings_dict[var].get("chunksizes", None) # .pop("chunksizes", None)
|
|
123
129
|
if chunks is not None:
|
|
124
130
|
dims = list(ds[var].dims)
|
|
125
131
|
chunks_dict = dict(zip(dims, chunks))
|
|
126
132
|
ds[var] = ds[var].chunk(chunks_dict)
|
|
133
|
+
ds[var].encoding["chunksizes"] = chunks
|
|
127
134
|
return ds
|
|
128
135
|
|
|
129
136
|
|
|
@@ -136,6 +143,8 @@ def get_time_encoding() -> dict:
|
|
|
136
143
|
Time encoding.
|
|
137
144
|
"""
|
|
138
145
|
encoding = {}
|
|
146
|
+
encoding["dtype"] = "int64" # if float trailing sub-seconds values
|
|
147
|
+
encoding["fillvalue"] = np.iinfo(np.int64).max
|
|
139
148
|
encoding["units"] = EPOCH
|
|
140
149
|
encoding["calendar"] = "proleptic_gregorian"
|
|
141
150
|
return encoding
|