disdrodb 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- disdrodb/__init__.py +4 -0
- disdrodb/_version.py +2 -2
- disdrodb/api/checks.py +70 -47
- disdrodb/api/configs.py +0 -2
- disdrodb/api/info.py +3 -3
- disdrodb/api/io.py +48 -8
- disdrodb/api/path.py +116 -133
- disdrodb/api/search.py +12 -3
- disdrodb/cli/disdrodb_create_summary.py +103 -0
- disdrodb/cli/disdrodb_create_summary_station.py +1 -1
- disdrodb/cli/disdrodb_run_l0a_station.py +1 -1
- disdrodb/cli/disdrodb_run_l0b_station.py +2 -2
- disdrodb/cli/disdrodb_run_l0c_station.py +2 -2
- disdrodb/cli/disdrodb_run_l1_station.py +2 -2
- disdrodb/cli/disdrodb_run_l2e_station.py +2 -2
- disdrodb/cli/disdrodb_run_l2m_station.py +2 -2
- disdrodb/data_transfer/download_data.py +123 -7
- disdrodb/issue/writer.py +2 -0
- disdrodb/l0/l0a_processing.py +10 -5
- disdrodb/l0/l0b_nc_processing.py +10 -6
- disdrodb/l0/l0b_processing.py +26 -61
- disdrodb/l0/l0c_processing.py +369 -251
- disdrodb/l0/readers/LPM/ARM/ARM_LPM.py +7 -0
- disdrodb/l0/readers/PARSIVEL2/ARM/ARM_PARSIVEL2.py +4 -0
- disdrodb/l0/readers/PARSIVEL2/CANADA/UQAM_NC.py +69 -0
- disdrodb/l0/readers/PARSIVEL2/MPI/BCO_PARSIVEL2.py +136 -0
- disdrodb/l0/readers/PARSIVEL2/MPI/BOWTIE.py +220 -0
- disdrodb/l0/readers/PARSIVEL2/NASA/LPVEX.py +109 -0
- disdrodb/l0/readers/PARSIVEL2/NETHERLANDS/DELFT_NC.py +3 -0
- disdrodb/l1/fall_velocity.py +46 -0
- disdrodb/l1/processing.py +1 -1
- disdrodb/l2/processing.py +1 -1
- disdrodb/metadata/checks.py +132 -125
- disdrodb/psd/fitting.py +172 -205
- disdrodb/psd/models.py +1 -1
- disdrodb/routines/__init__.py +54 -0
- disdrodb/{l0/routines.py → routines/l0.py} +288 -418
- disdrodb/{l1/routines.py → routines/l1.py} +60 -92
- disdrodb/{l2/routines.py → routines/l2.py} +249 -462
- disdrodb/{routines.py → routines/wrappers.py} +95 -7
- disdrodb/scattering/axis_ratio.py +5 -1
- disdrodb/scattering/permittivity.py +18 -0
- disdrodb/scattering/routines.py +56 -36
- disdrodb/summary/routines.py +110 -34
- disdrodb/utils/archiving.py +434 -0
- disdrodb/utils/cli.py +5 -5
- disdrodb/utils/dask.py +62 -1
- disdrodb/utils/decorators.py +31 -0
- disdrodb/utils/encoding.py +5 -1
- disdrodb/{l2 → utils}/event.py +1 -66
- disdrodb/utils/logger.py +1 -1
- disdrodb/utils/manipulations.py +22 -12
- disdrodb/utils/routines.py +166 -0
- disdrodb/utils/time.py +3 -291
- disdrodb/utils/xarray.py +3 -0
- disdrodb/viz/plots.py +85 -14
- {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/METADATA +2 -2
- {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/RECORD +62 -54
- {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/entry_points.txt +1 -0
- {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/WHEEL +0 -0
- {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/top_level.txt +0 -0
disdrodb/l0/l0c_processing.py
CHANGED
@@ -21,94 +21,31 @@ import logging
 
 import numpy as np
 import pandas as pd
+import xarray as xr
 
-from disdrodb.api.
-from disdrodb.
+from disdrodb.api.io import open_netcdf_files
+from disdrodb.l0.l0b_processing import set_l0b_encodings
 from disdrodb.l1.resampling import add_sample_interval
-from disdrodb.utils.
-from disdrodb.utils.
-
-    regularize_timesteps,
-)
+from disdrodb.utils.attrs import set_disdrodb_attrs
+from disdrodb.utils.logger import log_info, log_warning
+from disdrodb.utils.time import ensure_sorted_by_time
 
 logger = logging.getLogger(__name__)
 
+# L0C processing requires searching for data (per time blocks) into neighbouring files:
+# - to account for possible trailing seconds in previous/next files
+# - to get information if at the edges of the time blocks previous/next timesteps are available
+# - to shift the time to ensure reported L0C time is the start of the measurement interval
+TOLERANCE_SECONDS = 60 * 3
 
-
-
-
-def get_files_per_days(filepaths):
-    """
-    Organize files by the days they cover based on their start and end times.
-
-    Parameters
-    ----------
-    filepaths : list of str
-        List of file paths to be processed.
-
-    Returns
-    -------
-    dict
-        Dictionary where keys are days (as strings) and values are lists of file paths
-        that cover those days.
-
-    Notes
-    -----
-    This function adds a tolerance of 60 seconds to account for imprecise time logging by the sensors.
-    """
-    # Retrieve file start_time and end_time
-    files_start_time, files_end_time = get_start_end_time_from_filepaths(filepaths)
-
-    # Add tolerance to account for imprecise time logging by the sensors
-    # - Example: timestep 23:59:30 might be 00.00 and goes into the next day file ...
-    files_start_time = files_start_time - np.array(TOLERANCE_SECONDS, dtype="m8[s]")
-    files_end_time = files_end_time + np.array(TOLERANCE_SECONDS, dtype="m8[s]")
-
-    # Retrieve file start day and end day
-    start_day = files_start_time.min().astype("M8[D]")
-    end_day = files_end_time.max().astype("M8[D]") + np.array(1, dtype="m8[D]")
-
-    # Create an array with all days in time period covered by the files
-    list_days = np.asanyarray(pd.date_range(start=start_day, end=end_day, freq="D")).astype("M8[D]")
-
-    # Expand dimension to match each day using broadcasting
-    files_start_time = files_start_time.astype("M8[D]")[:, np.newaxis]  # shape (n_files, 1)
-    files_end_time = files_end_time.astype("M8[D]")[:, np.newaxis]  # shape (n_files, 1)
-
-    # Create an array of all days
-    # - Expand dimension to match each day using broadcasting
-    days = list_days[np.newaxis, :]  # shape (1, n_days)
-
-    # Use broadcasting to create a boolean matrix indicating which files cover which days
-    mask = (files_start_time <= days) & (files_end_time >= days)  # shape (n_files, n_days)
-
-    # Build a mapping from days to file indices
-    # For each day (column), find the indices of files (rows) that cover that day
-    dict_days = {}
-    filepaths = np.array(filepaths)
-    for i, day in enumerate(list_days):
-        file_indices = np.where(mask[:, i])[0]
-        if file_indices.size > 0:
-            dict_days[str(day)] = filepaths[file_indices].tolist()
-
-    return dict_days
-
-
-def retrieve_possible_measurement_intervals(metadata):
-    """Retrieve list of possible measurements intervals."""
-    measurement_intervals = metadata.get("measurement_interval", [])
-    return check_measurement_intervals(measurement_intervals)
+####---------------------------------------------------------------------------------
+#### Measurement intervals
 
 
 def drop_timesteps_with_invalid_sample_interval(ds, measurement_intervals, verbose=True, logger=None):
     """Drop timesteps with unexpected sample intervals."""
-
-
-    # - Need to know more how Parsivel software computes sample_interval variable ...
-
-    # Retrieve logged sample_interval
-    sample_interval = ds["sample_interval"].compute().data
-    timesteps = ds["time"].compute().data
+    sample_interval = ds["sample_interval"].to_numpy()
+    timesteps = ds["time"].to_numpy()
     is_valid_sample_interval = np.isin(sample_interval.data, measurement_intervals)
     indices_invalid_sample_interval = np.where(~is_valid_sample_interval)[0]
     if len(indices_invalid_sample_interval) > 0:
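For context, a minimal sketch of how a tolerance like TOLERANCE_SECONDS (3 minutes) can be applied with plain numpy datetime arithmetic when widening a time block; the timestamps below are made up for illustration:

    import numpy as np

    TOLERANCE_SECONDS = 60 * 3  # same value as in the diff

    # Nominal block boundaries (e.g. one day of data)
    start_time = np.datetime64("2024-01-01T00:00:00", "s")
    end_time = np.datetime64("2024-01-02T00:00:00", "s")

    # Pad both ends so that timesteps logged a few seconds early or late
    # (trailing seconds) still fall inside the searched window.
    tol = np.timedelta64(TOLERANCE_SECONDS, "s")
    start_time_tol = start_time - tol
    end_time_tol = end_time + tol

    print(start_time_tol, end_time_tol)  # 2023-12-31T23:57:00 2024-01-02T00:03:00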
@@ -124,10 +61,26 @@ def drop_timesteps_with_invalid_sample_interval(ds, measurement_intervals, verbo
     return ds
 
 
-def split_dataset_by_sampling_intervals(
+def split_dataset_by_sampling_intervals(
+    ds,
+    measurement_intervals,
+    min_sample_interval=10,
+    min_block_size=5,
+    time_is_end_interval=True,
+):
     """
     Split a dataset into subsets where each subset has a consistent sampling interval.
 
+    Notes
+    -----
+    - Does not modify timesteps (regularization is left to `regularize_timesteps`).
+    - Assumes no duplicated timesteps in the dataset.
+    - If only one measurement interval is specified, no timestep-diff checks are performed.
+    - If multiple measurement intervals are specified:
+      * Raises an error if *none* of the expected intervals appear.
+      * Splits where interval changes.
+    - Segments shorter than `min_block_size` are discarded.
+
     Parameters
     ----------
     ds : xarray.Dataset
@@ -136,30 +89,41 @@ def split_dataset_by_sampling_intervals(ds, measurement_intervals, min_sample_in
         A list of possible primary sampling intervals (in seconds) that the dataset might have.
     min_sample_interval : int, optional
         The minimum expected sampling interval in seconds. Defaults to 10s.
+        This is used to deal with possible trailing seconds errors.
     min_block_size : float, optional
         The minimum number of timesteps with a given sampling interval to be considered.
         Otherwise such portion of data is discarded !
         Defaults to 5 timesteps.
+    time_is_end_interval: bool
+        Whether time refers to the end of the measurement interval.
+        The default is True.
 
     Returns
    -------
-    dict
+    dict[int, xr.Dataset]
         A dictionary where keys are the identified sampling intervals (in seconds),
-        and values are xarray.Datasets containing only data from those intervals.
+        and values are xarray.Datasets containing only data from those sampling intervals.
     """
     # Define array of possible measurement intervals
     measurement_intervals = np.array(measurement_intervals)
 
+    # Check sorted by time and sort if necessary
+    ds = ensure_sorted_by_time(ds)
+
     # If a single measurement interval expected, return dictionary with input dataset
     if len(measurement_intervals) == 1:
-        dict_ds = {measurement_intervals[0]: ds}
+        dict_ds = {int(measurement_intervals[0]): ds}
         return dict_ds
 
-    #
-
+    # If sample_interval is a dataset variable, use it to define dictionary of datasets
+    if "sample_interval" in ds:
+        return {int(interval): ds.isel(time=ds["sample_interval"] == interval) for interval in measurement_intervals}
+
+    # ---------------------------------------------------------------------------------------.
+    # Otherwise exploit difference between timesteps to identify change point
 
     # Calculate time differences in seconds
-    deltadt = np.diff(ds["time"].data).astype("timedelta64[s]").astype(int)
+    deltadt = np.abs(np.diff(ds["time"].data)).astype("timedelta64[s]").astype(int)
 
     # Round each delta to the nearest multiple of 5 (because the smallest possible sample interval is 10 s)
     # - This account for possible trailing seconds of the logger
@@ -175,25 +139,46 @@ def split_dataset_by_sampling_intervals(ds, measurement_intervals, min_sample_in
     if np.all(np.isnan(mapped_intervals)):
         raise ValueError("Impossible to identify timesteps with expected sampling intervals.")
 
+    # Check which measurements intervals are occurring in the dataset
+    uniques = np.unique(mapped_intervals)
+    uniques_intervals = uniques[~np.isnan(uniques)]
+    n_different_intervals_occurring = len(uniques_intervals)
+    if n_different_intervals_occurring == 1:
+        dict_ds = {int(k): ds for k in uniques_intervals}
+        return dict_ds
+
+    # Fill NaNs: decide whether to attach to previous or next interval
+    for i in range(len(mapped_intervals)):
+        if np.isnan(mapped_intervals[i]):
+            # If next exists and is NaN → forward fill
+            if i + 1 < len(mapped_intervals) and np.isnan(mapped_intervals[i + 1]):
+                mapped_intervals[i] = mapped_intervals[i - 1] if i > 0 else mapped_intervals[i + 1]
+            # Otherwise → backward fill (attach to next valid)
+            else:
+                mapped_intervals[i] = (
+                    mapped_intervals[i + 1] if i + 1 < len(mapped_intervals) else mapped_intervals[i - 1]
+                )
+
     # Infill np.nan values by using neighbor intervals
     # Forward fill
-    for i in range(1, len(mapped_intervals)):
-
-
+    # for i in range(1, len(mapped_intervals)):
+    #     if np.isnan(mapped_intervals[i]):
+    #         mapped_intervals[i] = mapped_intervals[i - 1]
 
-    # Backward fill (in case the first entries were np.nan)
-    for i in range(len(mapped_intervals) - 2, -1, -1):
-
-
+    # # Backward fill (in case the first entries were np.nan)
+    # for i in range(len(mapped_intervals) - 2, -1, -1):
+    #     if np.isnan(mapped_intervals[i]):
+    #         mapped_intervals[i] = mapped_intervals[i + 1]
 
     # Now all intervals are assigned to one of the possible measurement_intervals.
     # Identify boundaries where interval changes
     change_points = np.where(mapped_intervals[:-1] != mapped_intervals[1:])[0] + 1
 
     # Split ds into segments according to change_points
-
+    offset = 1 if time_is_end_interval else 0
+    segments = np.split(np.arange(ds.sizes["time"]), change_points + offset)
 
-    # Remove segments with less than
+    # Remove segments with less than min_block_size elements
     segments = [seg for seg in segments if len(seg) >= min_block_size]
     if len(segments) == 0:
         raise ValueError(
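A self-contained numpy sketch of the change-point idea described in the comments above (simplified: no NaN infilling and no minimum-block filtering; the timesteps are synthetic):

    import numpy as np

    # Fake timesteps: a block sampled every 30 s followed by a block sampled every 60 s
    times = np.concatenate([
        np.arange(0, 180, 30),    # 30 s sampling
        np.arange(180, 480, 60),  # 60 s sampling
    ]).astype("datetime64[s]")

    deltadt = np.diff(times).astype("timedelta64[s]").astype(int)

    # Round to the nearest multiple of 5 s to absorb trailing-second jitter
    deltadt = np.round(deltadt / 5) * 5

    # Split the index range wherever the interval changes
    # (offset of 1 because each time marks the end of its measurement interval)
    change_points = np.where(deltadt[:-1] != deltadt[1:])[0] + 1
    segments = np.split(np.arange(times.size), change_points + 1)
    print([len(seg) for seg in segments])  # [7, 4]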
@@ -202,23 +187,40 @@ def split_dataset_by_sampling_intervals(ds, measurement_intervals, min_sample_in
 
     # Define dataset indices for each sampling interva
     dict_sampling_interval_indices = {}
+    used_indices = set()
     for seg in segments:
         # Define the assumed sampling interval of such segment
         start_idx = seg[0]
         segment_sampling_interval = int(mapped_intervals[start_idx])
-
-
-
-
+        # Remove any indices that have already been assigned to another interval
+        seg_filtered = seg[~np.isin(seg, list(used_indices))]
+
+        # Only keep segment if it still meets minimum size after filtering
+        if len(seg_filtered) >= min_block_size:
+            if segment_sampling_interval not in dict_sampling_interval_indices:
+                dict_sampling_interval_indices[segment_sampling_interval] = [seg_filtered]
+            else:
+                dict_sampling_interval_indices[segment_sampling_interval].append(seg_filtered)
+
+            # Mark these indices as used
+            used_indices.update(seg_filtered)
+
+    # Concatenate indices for each sampling interval
     dict_sampling_interval_indices = {
-        k: np.concatenate(list_indices)
+        k: np.concatenate(list_indices)
+        for k, list_indices in dict_sampling_interval_indices.items()
+        if list_indices  # Only include if there are valid segments
     }
 
     # Define dictionary of datasets
-    dict_ds = {k: ds.isel(time=indices) for k, indices in dict_sampling_interval_indices.items()}
+    dict_ds = {int(k): ds.isel(time=indices) for k, indices in dict_sampling_interval_indices.items()}
     return dict_ds
 
 
+####---------------------------------------------------------------------------------
+#### Timesteps duplicates
+
+
 def has_same_value_over_time(da):
     """
     Check if a DataArray has the same value over all timesteps, considering NaNs as equal.
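A hypothetical call of the reworked signature, assuming the module layout shown in this diff and a synthetic single-interval dataset:

    import numpy as np
    import pandas as pd
    import xarray as xr

    from disdrodb.l0.l0c_processing import split_dataset_by_sampling_intervals

    # Synthetic dataset: 10 timesteps logged every 60 s
    time = pd.date_range("2024-01-01", periods=10, freq="60s")
    ds = xr.Dataset({"raw_drop_number": ("time", np.ones(10))}, coords={"time": time})

    dict_ds = split_dataset_by_sampling_intervals(
        ds,
        measurement_intervals=[60],
        min_sample_interval=10,
        min_block_size=5,
        time_is_end_interval=True,
    )
    print(list(dict_ds))  # keys are plain ints: [60]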
@@ -317,6 +319,190 @@ def remove_duplicated_timesteps(ds, ensure_variables_equality=True, logger=None,
     return ds
 
 
+####---------------------------------------------------------------------------------
+#### Timesteps regularization
+
+
+def get_problematic_timestep_indices(timesteps, sample_interval):
+    """Identify timesteps with missing previous or following timesteps."""
+    previous_time = timesteps - pd.Timedelta(seconds=sample_interval)
+    next_time = timesteps + pd.Timedelta(seconds=sample_interval)
+    idx_previous_missing = np.where(~np.isin(previous_time, timesteps))[0][1:]
+    idx_next_missing = np.where(~np.isin(next_time, timesteps))[0][:-1]
+    idx_isolated_missing = np.intersect1d(idx_previous_missing, idx_next_missing)
+    idx_previous_missing = idx_previous_missing[np.isin(idx_previous_missing, idx_isolated_missing, invert=True)]
+    idx_next_missing = idx_next_missing[np.isin(idx_next_missing, idx_isolated_missing, invert=True)]
+    return idx_previous_missing, idx_next_missing, idx_isolated_missing
+
+
+def regularize_timesteps(ds, sample_interval, robust=False, add_quality_flag=True, logger=None, verbose=True):
+    """Ensure timesteps match with the sample_interval.
+
+    This function:
+    - drop dataset indices with duplicated timesteps,
+    - but does not add missing timesteps to the dataset.
+    """
+    # Check sorted by time and sort if necessary
+    ds = ensure_sorted_by_time(ds)
+
+    # Convert time to pandas.DatetimeIndex for easier manipulation
+    times = pd.to_datetime(ds["time"].to_numpy())
+
+    # Determine the start and end times
+    start_time = times[0].floor(f"{sample_interval}s")
+    end_time = times[-1].ceil(f"{sample_interval}s")
+
+    # Create the expected time grid
+    expected_times = pd.date_range(start=start_time, end=end_time, freq=f"{sample_interval}s")
+
+    # Convert to numpy arrays
+    times = times.to_numpy(dtype="M8[s]")
+    expected_times = expected_times.to_numpy(dtype="M8[s]")
+
+    # Map original times to the nearest expected times
+    # Calculate the difference between original times and expected times
+    time_deltas = np.abs(times - expected_times[:, None]).astype(int)
+
+    # Find the index of the closest expected time for each original time
+    nearest_indices = np.argmin(time_deltas, axis=0)
+    adjusted_times = expected_times[nearest_indices]
+
+    # Check for duplicates in adjusted times
+    unique_times, counts = np.unique(adjusted_times, return_counts=True)
+    duplicates = unique_times[counts > 1]
+
+    # Initialize time quality flag
+    # - 0 when ok or just rounded to closest 00
+    # - 1 if previous timestep is missing
+    # - 2 if next timestep is missing
+    # - 3 if previous and next timestep is missing
+    # - 4 if solved duplicated timesteps
+    # - 5 if needed to drop duplicated timesteps and select the last
+    flag_previous_missing = 1
+    flag_next_missing = 2
+    flag_isolated_timestep = 3
+    flag_solved_duplicated_timestep = 4
+    flag_dropped_duplicated_timestep = 5
+    qc_flag = np.zeros(adjusted_times.shape)
+
+    # Initialize list with the duplicated timesteps index to drop
+    # - We drop the first occurrence because is likely the shortest interval
+    idx_to_drop = []
+
+    # Attempt to resolve for duplicates
+    if duplicates.size > 0:
+        # Handle duplicates
+        for dup_time in duplicates:
+            # Indices of duplicates
+            dup_indices = np.where(adjusted_times == dup_time)[0]
+            n_duplicates = len(dup_indices)
+            # Define previous and following timestep
+            prev_time = dup_time - pd.Timedelta(seconds=sample_interval)
+            next_time = dup_time + pd.Timedelta(seconds=sample_interval)
+            # Try to find missing slots before and after
+            # - If more than 3 duplicates, impossible to solve !
+            count_solved = 0
+            # If the previous timestep is available, set that one
+            if n_duplicates == 2:
+                if prev_time not in adjusted_times:
+                    adjusted_times[dup_indices[0]] = prev_time
+                    qc_flag[dup_indices[0]] = flag_solved_duplicated_timestep
+                    count_solved += 1
+                elif next_time not in adjusted_times:
+                    adjusted_times[dup_indices[-1]] = next_time
+                    qc_flag[dup_indices[-1]] = flag_solved_duplicated_timestep
+                    count_solved += 1
+                else:
+                    pass
+            elif n_duplicates == 3:
+                if prev_time not in adjusted_times:
+                    adjusted_times[dup_indices[0]] = prev_time
+                    qc_flag[dup_indices[0]] = flag_solved_duplicated_timestep
+                    count_solved += 1
+                if next_time not in adjusted_times:
+                    adjusted_times[dup_indices[-1]] = next_time
+                    qc_flag[dup_indices[-1]] = flag_solved_duplicated_timestep
+                    count_solved += 1
+            if count_solved != n_duplicates - 1:
+                idx_to_drop = np.append(idx_to_drop, dup_indices[0:-1])
+                qc_flag[dup_indices[-1]] = flag_dropped_duplicated_timestep
+                msg = (
+                    f"Cannot resolve {n_duplicates} duplicated timesteps "
+                    f"(after trailing seconds correction) around {dup_time}."
+                )
+                log_warning(logger=logger, msg=msg, verbose=verbose)
+                if robust:
+                    raise ValueError(msg)
+
+    # Update the time coordinate (Convert to ns for xarray compatibility)
+    ds = ds.assign_coords({"time": adjusted_times.astype("datetime64[ns]")})
+
+    # Update quality flag values for next and previous timestep is missing
+    if add_quality_flag:
+        idx_previous_missing, idx_next_missing, idx_isolated_missing = get_problematic_timestep_indices(
+            adjusted_times,
+            sample_interval,
+        )
+        qc_flag[idx_previous_missing] = np.maximum(qc_flag[idx_previous_missing], flag_previous_missing)
+        qc_flag[idx_next_missing] = np.maximum(qc_flag[idx_next_missing], flag_next_missing)
+        qc_flag[idx_isolated_missing] = np.maximum(qc_flag[idx_isolated_missing], flag_isolated_timestep)
+
+        # If the first timestep is at 00:00 and currently flagged as previous missing (1), reset to 0
+        # first_time = pd.to_datetime(adjusted_times[0]).time()
+        # first_expected_time = pd.Timestamp("00:00:00").time()
+        # if first_time == first_expected_time and qc_flag[0] == flag_previous_missing:
+        #     qc_flag[0] = 0
+
+        # # If the last timestep is flagged and currently flagged as next missing (2), reset it to 0
+        # last_time = pd.to_datetime(adjusted_times[-1]).time()
+        # last_time_expected = (pd.Timestamp("00:00:00") - pd.Timedelta(30, unit="seconds")).time()
+        # # Check if adding one interval would go beyond the end_time
+        # if last_time == last_time_expected and qc_flag[-1] == flag_next_missing:
+        #     qc_flag[-1] = 0
+
+        # Assign time quality flag coordinate
+        ds["time_qc"] = xr.DataArray(qc_flag, dims="time")
+        ds = ds.set_coords("time_qc")
+
+        # Add CF attributes for time_qc
+        ds["time_qc"].attrs = {
+            "long_name": "time quality flag",
+            "standard_name": "status_flag",
+            "units": "1",
+            "valid_range": [0, 5],
+            "flag_values": [0, 1, 2, 3, 4, 5],
+            "flag_meanings": (
+                "good_data "
+                "previous_timestep_missing "
+                "next_timestep_missing "
+                "isolated_timestep "
+                "solved_duplicated_timestep "
+                "dropped_duplicated_timestep"
+            ),
+            "comment": (
+                "Quality flag for time coordinate. "
+                "Flag 0: data is good or just rounded to nearest sampling interval. "
+                "Flag 1: previous timestep is missing in the time series. "
+                "Flag 2: next timestep is missing in the time series. "
+                "Flag 3: both previous and next timesteps are missing (isolated timestep). "
+                "Flag 4: timestep was moved from duplicate to fill missing timestep. "
+                "Flag 5: duplicate timestep was dropped, keeping the last occurrence."
+            ),
+        }
+
+    # Drop duplicated timesteps
+    # - Using ds = ds.drop_isel({"time": idx_to_drop.astype(int)}) raise:
+    # --> pandas.errors.InvalidIndexError: Reindexing only valid with uniquely valued Index objects
+    # --> https://github.com/pydata/xarray/issues/6605
+    if len(idx_to_drop) > 0:
+        idx_to_drop = idx_to_drop.astype(int)
+        idx_valid_timesteps = np.arange(0, ds["time"].size)
+        idx_valid_timesteps = np.delete(idx_valid_timesteps, idx_to_drop)
+        ds = ds.isel(time=idx_valid_timesteps)
+    # Return dataset
+    return ds
+
+
 def check_timesteps_regularity(ds, sample_interval, verbose=False, logger=None):
     """Check for the regularity of timesteps."""
     # Check sorted by time and sort if necessary
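A condensed sketch of the nearest-grid mapping step that regularize_timesteps builds on (illustrative timestamps only, without the duplicate handling and quality flagging):

    import numpy as np
    import pandas as pd

    sample_interval = 60  # seconds
    times = pd.to_datetime(["2024-01-01 00:00:58", "2024-01-01 00:02:01", "2024-01-01 00:03:02"])

    start = times[0].floor(f"{sample_interval}s")
    end = times[-1].ceil(f"{sample_interval}s")
    expected = pd.date_range(start=start, end=end, freq=f"{sample_interval}s").to_numpy(dtype="M8[s]")

    raw = times.to_numpy(dtype="M8[s]")
    # Distance of every raw time to every expected grid time (n_expected x n_raw)
    deltas = np.abs(raw - expected[:, None]).astype(int)
    adjusted = expected[np.argmin(deltas, axis=0)]
    print(adjusted)  # ['2024-01-01T00:01:00' '2024-01-01T00:02:00' '2024-01-01T00:03:00']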
@@ -339,12 +525,14 @@ def check_timesteps_regularity(ds, sample_interval, verbose=False, logger=None):
     fractions = np.round(counts / len(deltadt) * 100, 2)
 
     # Compute stats about expected deltadt
-
-
+    mask = unique_deltadt == sample_interval
+    sample_interval_counts = counts[mask].item() if mask.any() else 0
+    sample_interval_fraction = fractions[mask].item() if mask.any() else 0.0
 
     # Compute stats about most frequent deltadt
-
-
+    mask = unique_deltadt == most_frequent_deltadt
+    most_frequent_deltadt_counts = counts[mask].item() if mask.any() else 0
+    most_frequent_deltadt_fraction = fractions[mask].item() if mask.any() else 0.0
 
     # Compute stats about unexpected deltadt
     unexpected_intervals = unique_deltadt[unique_deltadt != sample_interval]
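The counting pattern behind these statistics, shown on made-up time differences:

    import numpy as np

    sample_interval = 60
    deltadt = np.array([60, 60, 60, 120, 60, 30, 60])  # seconds between consecutive timesteps

    unique_deltadt, counts = np.unique(deltadt, return_counts=True)
    fractions = np.round(counts / len(deltadt) * 100, 2)

    mask = unique_deltadt == sample_interval
    sample_interval_fraction = fractions[mask].item() if mask.any() else 0.0
    print(sample_interval_fraction)  # 71.43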
@@ -352,13 +540,14 @@ def check_timesteps_regularity(ds, sample_interval, verbose=False, logger=None):
     unexpected_intervals_fractions = fractions[unique_deltadt != sample_interval]
     frequent_unexpected_intervals = unexpected_intervals[unexpected_intervals_fractions > 5]
 
-    # Report warning if the
+    # Report warning if the sampling_interval deltadt occurs less often than 60 % of times
     # -> TODO: maybe only report in stations where the disdro does not log only data when rainy
     if sample_interval_fraction < 60:
         msg = (
             f"The expected (sampling) interval between observations occurs only "
             f"{sample_interval_counts}/{n} times ({sample_interval_fraction} %)."
         )
+        log_warning(logger=logger, msg=msg, verbose=verbose)
 
     # Report warning if a deltadt occurs more often then the sampling interval
     if most_frequent_deltadt != sample_interval:
@@ -372,14 +561,7 @@ def check_timesteps_regularity(ds, sample_interval, verbose=False, logger=None):
 
     # Report with a warning all unexpected deltadt with frequency larger than 5 %
     if len(frequent_unexpected_intervals) > 0:
-
-        for interval in frequent_unexpected_intervals:
-            c = unexpected_intervals_counts[unexpected_intervals == interval].item()
-            f = unexpected_intervals_fractions[unexpected_intervals == interval].item()
-            msg_parts.append(f" {interval} ({f}%) ({c}/{n}) | ")
-        msg = " ".join(msg_parts)
-
-        msg = "The following time intervals between observations occurs often: "
+        msg = "The following time intervals between observations occur frequently: "
         for interval in frequent_unexpected_intervals:
             c = unexpected_intervals_counts[unexpected_intervals == interval].item()
             f = unexpected_intervals_fractions[unexpected_intervals == interval].item()
@@ -388,7 +570,11 @@ def check_timesteps_regularity(ds, sample_interval, verbose=False, logger=None):
     return ds
 
 
-
+####----------------------------------------------------------------------------------------------.
+#### Wrapper
+
+
+def _finalize_l0c_dataset(ds, sample_interval, sensor_name, verbose=True, logger=None):
     """Finalize a L0C dataset with unique sampling interval.
 
     It adds the sampling_interval coordinate and it regularizes the timesteps for trailing seconds.
@@ -407,26 +593,45 @@ def finalize_l0c_dataset(ds, sample_interval, verbose=True, logger=None):
     )
 
     # Performs checks about timesteps regularity
+    # - Do not discard anything
+    # - Just log warnings in the log file
     ds = check_timesteps_regularity(ds=ds, sample_interval=sample_interval, verbose=verbose, logger=logger)
+
+    # Shift timesteps to ensure time correspond to start of measurement interval
+    # TODO as function of sensor name
+
+    # Set netCDF dimension order
+    # --> Required for correct encoding !
+    ds = ds.transpose("time", "diameter_bin_center", ...)
+
+    # Set encodings
+    ds = set_l0b_encodings(ds=ds, sensor_name=sensor_name)
+
+    # Update global attributes
+    ds = set_disdrodb_attrs(ds, product="L0C")
     return ds
 
 
-def
+def create_l0c_datasets(
+    event_info,
+    measurement_intervals,
+    sensor_name,
+    ensure_variables_equality=True,
+    logger=None,
+    verbose=True,
+):
     """
-    Create a
+    Create a single dataset by merging and processing data from multiple filepaths.
 
     Parameters
     ----------
-
-
-        Should be in a format that can be converted to numpy.datetime64.
-    filepaths : list of str
-        List of filepaths to the data files to be processed.
+    event_info : dict
+        Dictionary with start_time, end_time and filepaths keys.
 
     Returns
     -------
-
-
+    dict
+        A dictionary with an xarray.Dataset for each measurement interval.
 
     Raises
     ------
@@ -435,50 +640,39 @@ def create_daily_file(day, filepaths, measurement_intervals, ensure_variables_eq
 
     Notes
     -----
-    -
-
-
-    -
-
-    -
-    are closed before returning the dataset.
+    - Data is loaded into memory and connections to source files are closed before returning the dataset.
+    - Tolerance in input files is used around expected dataset start_time and end_time to account for
+      imprecise logging times and ensuring correct definition of qc_time at files boundaries (e.g. 00:00).
+    - Duplicated timesteps with different raw drop number values are dropped
+    - First occurrence of duplicated timesteps with equal raw drop number values is kept.
+    - Regularizes timesteps to handle trailing seconds.
     """
-    import xarray as xr  # Load in each process when function is called !
-
     # ---------------------------------------------------------------------------------------.
-    #
-
-
+    # Retrieve information
+    start_time = np.array(event_info["start_time"], dtype="M8[s]")
+    end_time = np.array(event_info["end_time"], dtype="M8[s]")
+    filepaths = event_info["filepaths"]
 
-    #
-
-
-    end_day_tol = end_day + np.array(TOLERANCE_SECONDS, dtype="m8[s]")
+    # Define expected dataset time coverage
+    start_time_tol = start_time - np.array(TOLERANCE_SECONDS, dtype="m8[s]")
+    end_time_tol = end_time + np.array(TOLERANCE_SECONDS, dtype="m8[s]")
 
     # ---------------------------------------------------------------------------------------.
     # Open files with data within the provided day and concatenate them
-
-
-
-
-
-
-
-
-        # - The resulting dataset can have duplicated timesteps !
-        ds = xr.concat(list_ds, dim="time", join="outer", compat="no_conflicts", combine_attrs="override").sortby(
-            "time",
-        )
-    else:
-        ds = list_ds[0]
-
-    # Compute data
-    ds = ds.compute()
+    ds = open_netcdf_files(
+        filepaths,
+        start_time=start_time_tol,
+        end_time=end_time_tol,
+        chunks={},
+        parallel=False,
+        compute=True,
+    )
 
-    #
-
-    ds.
-
+    # If not data for that time block, return empty dictionary
+    # - Can occur when raw files are already by block of months and e.g. here saving to daily blocks !
+    if ds.sizes["time"] == 0:
+        log_info(logger=logger, msg=f"No data between {start_time} and {end_time}.", verbose=verbose)
+        return {}
 
     # ---------------------------------------------------------------------------------------.
     # If sample interval is a dataset variable, drop timesteps with unexpected measurement intervals !
@@ -489,9 +683,16 @@ def create_daily_file(day, filepaths, measurement_intervals, ensure_variables_eq
         verbose=verbose,
         logger=logger,
     )
+    n_timesteps = len(ds["time"])
+    if n_timesteps < 3:
+        raise ValueError(f"Only {n_timesteps} timesteps left after removing those with unexpected sample interval.")
 
     # ---------------------------------------------------------------------------------------.
-    # Remove duplicated timesteps
+    # Remove duplicated timesteps (before correcting for trailing seconds)
+    # - It checks that duplicated timesteps have the same raw_drop_number values
+    # - If duplicated timesteps have different raw_drop_number values:
+    #   --> warning is raised
+    #   --> timesteps are dropped
     ds = remove_duplicated_timesteps(
         ds,
         ensure_variables_equality=ensure_variables_equality,
@@ -502,7 +703,7 @@ def create_daily_file(day, filepaths, measurement_intervals, ensure_variables_eq
     # Raise error if less than 3 timesteps left
     n_timesteps = len(ds["time"])
     if n_timesteps < 3:
-        raise ValueError(f"{n_timesteps} timesteps left after removing duplicated
+        raise ValueError(f"{n_timesteps} timesteps left after removing duplicated.")
 
     # ---------------------------------------------------------------------------------------.
     # Split dataset by sampling intervals
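A hypothetical end-to-end call of create_l0c_datasets as documented above, assuming it is imported from disdrodb.l0.l0c_processing; the file paths, times and sensor name are placeholders:

    import numpy as np

    from disdrodb.l0.l0c_processing import create_l0c_datasets

    event_info = {
        "start_time": np.datetime64("2024-01-01T00:00:00"),
        "end_time": np.datetime64("2024-01-02T00:00:00"),
        "filepaths": ["/path/to/L0B_file_1.nc", "/path/to/L0B_file_2.nc"],  # placeholders
    }
    dict_ds = create_l0c_datasets(
        event_info=event_info,
        measurement_intervals=[60],
        sensor_name="PARSIVEL2",
        ensure_variables_equality=True,
        verbose=True,
    )
    for sample_interval, ds in dict_ds.items():
        # If the time_qc coordinate is present, 0 marks a regular, well-surrounded timestep
        if "time_qc" in ds.coords:
            print(sample_interval, int((ds["time_qc"] != 0).sum()))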
@@ -513,107 +714,24 @@ def create_daily_file(day, filepaths, measurement_intervals, ensure_variables_eq
         min_block_size=5,
     )
 
-    # Log a warning if two sampling intervals are present within a given
+    # Log a warning if two sampling intervals are present within a given time block
     if len(dict_ds) > 1:
         occuring_sampling_intervals = list(dict_ds)
-        msg = f"The
+        msg = f"The input files contains these sampling intervals: {occuring_sampling_intervals}."
         log_warning(logger=logger, msg=msg, verbose=verbose)
 
     # ---------------------------------------------------------------------------------------.
     # Finalize L0C datasets
-    # - Add sample_interval coordinate
+    # - Add and ensure sample_interval coordinate has just 1 value (not varying with time)
     # - Regularize timesteps for trailing seconds
     dict_ds = {
-        sample_interval:
+        sample_interval: _finalize_l0c_dataset(
             ds=ds,
             sample_interval=sample_interval,
+            sensor_name=sensor_name,
             verbose=verbose,
             logger=logger,
-        ).sel({"time": slice(
+        ).sel({"time": slice(start_time, end_time)})
         for sample_interval, ds in dict_ds.items()
     }
     return dict_ds
-
-
-# ---------------------------------------------------------------------------------------.
-#### DEPRECATED CODE
-
-
-# def copy_l0b_to_l0c_directory(filepath):
-#     """Copy L0B file to L0C directory."""
-#     import netCDF4
-
-#     # Copy file
-#     l0c_filepath = filepath.replace("L0B", "L0C")
-#     _ = shutil.copy(filepath, l0c_filepath)
-
-#     # Edit DISDRODB product attribute
-#     with netCDF4.Dataset(l0c_filepath, mode="a") as nc_file:
-#         # Modify the global attribute
-#         nc_file.setncattr("disdrodb_product", "L0C")
-
-# def find_isel_common_time(da1, da2):
-#     """
-#     Find the indices of common time steps between two data arrays.
-
-#     Parameters
-#     ----------
-#     da1 : xarray.DataArray
-#         The first data array with a time coordinate.
-#     da2 : xarray.DataArray
-#         The second data array with a time coordinate.
-
-#     Returns
-#     -------
-#     da1_isel : numpy.ndarray
-#         Indices of the common time steps in the first data array.
-#     da2_isel : numpy.ndarray
-#         Indices of the common time steps in the second data array.
-
-#     Notes
-#     -----
-#     This function assumes that both input data arrays have a "time" coordinate.
-#     The function finds the intersection of the time steps in both data arrays
-#     and returns the indices of these common time steps for each data array.
-#     """
-#     intersecting_timesteps = np.intersect1d(da1["time"], da2["time"])
-#     da1_isel = np.where(np.isin(da1["time"], intersecting_timesteps))[0]
-#     da2_isel = np.where(np.isin(da2["time"], intersecting_timesteps))[0]
-#     return da1_isel, da2_isel
-
-
-# def check_same_raw_drop_number_values(list_ds, filepaths):
-#     """
-#     Check if the 'raw_drop_number' values are the same across multiple datasets.
-
-#     This function compares the 'raw_drop_number' values of multiple datasets to ensure they are identical
-#     at common timesteps.
-
-#     If any discrepancies are found, a ValueError is raised indicating which files
-#     have differing values.
-
-#     Parameters
-#     ----------
-#     list_ds : list of xarray.Dataset
-#         A list of xarray Datasets to be compared.
-#     filepaths : list of str
-#         A list of file paths corresponding to the datasets in `list_ds`.
-
-#     Raises
-#     ------
-#     ValueError
-#         If 'raw_drop_number' values differ at any common timestep between any two datasets.
-#     """
-#     # Retrieve variable to compare
-#     list_drop_number = [ds["raw_drop_number"].compute() for ds in list_ds]
-#     # Compare values
-#     combos = list(itertools.combinations(range(len(list_drop_number)), 2))
-#     for i, j in combos:
-#         da1 = list_drop_number[i]
-#         da2 = list_drop_number[j]
-#         da1_isel, da2_isel = find_isel_common_time(da1=da1, da2=da2)
-#         if not np.all(da1.isel(time=da1_isel).data == da2.isel(time=da2_isel).data):
-#             file1 = filepaths[i]
-#             file2 = filepaths[i]
-#             msg = f"Duplicated timesteps have different values between file {file1} and {file2}"
-#             raise ValueError(msg)