disdrodb 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff shows the content changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (62)
  1. disdrodb/__init__.py +4 -0
  2. disdrodb/_version.py +2 -2
  3. disdrodb/api/checks.py +70 -47
  4. disdrodb/api/configs.py +0 -2
  5. disdrodb/api/info.py +3 -3
  6. disdrodb/api/io.py +48 -8
  7. disdrodb/api/path.py +116 -133
  8. disdrodb/api/search.py +12 -3
  9. disdrodb/cli/disdrodb_create_summary.py +103 -0
  10. disdrodb/cli/disdrodb_create_summary_station.py +1 -1
  11. disdrodb/cli/disdrodb_run_l0a_station.py +1 -1
  12. disdrodb/cli/disdrodb_run_l0b_station.py +2 -2
  13. disdrodb/cli/disdrodb_run_l0c_station.py +2 -2
  14. disdrodb/cli/disdrodb_run_l1_station.py +2 -2
  15. disdrodb/cli/disdrodb_run_l2e_station.py +2 -2
  16. disdrodb/cli/disdrodb_run_l2m_station.py +2 -2
  17. disdrodb/data_transfer/download_data.py +123 -7
  18. disdrodb/issue/writer.py +2 -0
  19. disdrodb/l0/l0a_processing.py +10 -5
  20. disdrodb/l0/l0b_nc_processing.py +10 -6
  21. disdrodb/l0/l0b_processing.py +26 -61
  22. disdrodb/l0/l0c_processing.py +369 -251
  23. disdrodb/l0/readers/LPM/ARM/ARM_LPM.py +7 -0
  24. disdrodb/l0/readers/PARSIVEL2/ARM/ARM_PARSIVEL2.py +4 -0
  25. disdrodb/l0/readers/PARSIVEL2/CANADA/UQAM_NC.py +69 -0
  26. disdrodb/l0/readers/PARSIVEL2/MPI/BCO_PARSIVEL2.py +136 -0
  27. disdrodb/l0/readers/PARSIVEL2/MPI/BOWTIE.py +220 -0
  28. disdrodb/l0/readers/PARSIVEL2/NASA/LPVEX.py +109 -0
  29. disdrodb/l0/readers/PARSIVEL2/NETHERLANDS/DELFT_NC.py +3 -0
  30. disdrodb/l1/fall_velocity.py +46 -0
  31. disdrodb/l1/processing.py +1 -1
  32. disdrodb/l2/processing.py +1 -1
  33. disdrodb/metadata/checks.py +132 -125
  34. disdrodb/psd/fitting.py +172 -205
  35. disdrodb/psd/models.py +1 -1
  36. disdrodb/routines/__init__.py +54 -0
  37. disdrodb/{l0/routines.py → routines/l0.py} +288 -418
  38. disdrodb/{l1/routines.py → routines/l1.py} +60 -92
  39. disdrodb/{l2/routines.py → routines/l2.py} +249 -462
  40. disdrodb/{routines.py → routines/wrappers.py} +95 -7
  41. disdrodb/scattering/axis_ratio.py +5 -1
  42. disdrodb/scattering/permittivity.py +18 -0
  43. disdrodb/scattering/routines.py +56 -36
  44. disdrodb/summary/routines.py +110 -34
  45. disdrodb/utils/archiving.py +434 -0
  46. disdrodb/utils/cli.py +5 -5
  47. disdrodb/utils/dask.py +62 -1
  48. disdrodb/utils/decorators.py +31 -0
  49. disdrodb/utils/encoding.py +5 -1
  50. disdrodb/{l2 → utils}/event.py +1 -66
  51. disdrodb/utils/logger.py +1 -1
  52. disdrodb/utils/manipulations.py +22 -12
  53. disdrodb/utils/routines.py +166 -0
  54. disdrodb/utils/time.py +3 -291
  55. disdrodb/utils/xarray.py +3 -0
  56. disdrodb/viz/plots.py +85 -14
  57. {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/METADATA +2 -2
  58. {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/RECORD +62 -54
  59. {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/entry_points.txt +1 -0
  60. {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/WHEEL +0 -0
  61. {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/licenses/LICENSE +0 -0
  62. {disdrodb-0.1.3.dist-info → disdrodb-0.1.4.dist-info}/top_level.txt +0 -0
@@ -21,94 +21,31 @@ import logging
 
  import numpy as np
  import pandas as pd
+ import xarray as xr
 
- from disdrodb.api.checks import check_measurement_intervals
- from disdrodb.api.info import get_start_end_time_from_filepaths
+ from disdrodb.api.io import open_netcdf_files
+ from disdrodb.l0.l0b_processing import set_l0b_encodings
  from disdrodb.l1.resampling import add_sample_interval
- from disdrodb.utils.logger import log_warning # , log_info
- from disdrodb.utils.time import (
- ensure_sorted_by_time,
- regularize_timesteps,
- )
+ from disdrodb.utils.attrs import set_disdrodb_attrs
+ from disdrodb.utils.logger import log_info, log_warning
+ from disdrodb.utils.time import ensure_sorted_by_time
 
  logger = logging.getLogger(__name__)
 
+ # L0C processing requires searching for data (per time blocks) into neighbouring files:
+ # - to account for possible trailing seconds in previous/next files
+ # - to get information if at the edges of the time blocks previous/next timesteps are available
+ # - to shift the time to ensure reported L0C time is the start of the measurement interval
+ TOLERANCE_SECONDS = 60 * 3
 
- TOLERANCE_SECONDS = 120
-
-
- def get_files_per_days(filepaths):
- """
- Organize files by the days they cover based on their start and end times.
-
- Parameters
- ----------
- filepaths : list of str
- List of file paths to be processed.
-
- Returns
- -------
- dict
- Dictionary where keys are days (as strings) and values are lists of file paths
- that cover those days.
-
- Notes
- -----
- This function adds a tolerance of 60 seconds to account for imprecise time logging by the sensors.
- """
- # Retrieve file start_time and end_time
- files_start_time, files_end_time = get_start_end_time_from_filepaths(filepaths)
-
- # Add tolerance to account for imprecise time logging by the sensors
- # - Example: timestep 23:59:30 might be 00.00 and goes into the next day file ...
- files_start_time = files_start_time - np.array(TOLERANCE_SECONDS, dtype="m8[s]")
- files_end_time = files_end_time + np.array(TOLERANCE_SECONDS, dtype="m8[s]")
-
- # Retrieve file start day and end day
- start_day = files_start_time.min().astype("M8[D]")
- end_day = files_end_time.max().astype("M8[D]") + np.array(1, dtype="m8[D]")
-
- # Create an array with all days in time period covered by the files
- list_days = np.asanyarray(pd.date_range(start=start_day, end=end_day, freq="D")).astype("M8[D]")
-
- # Expand dimension to match each day using broadcasting
- files_start_time = files_start_time.astype("M8[D]")[:, np.newaxis] # shape (n_files, 1)
- files_end_time = files_end_time.astype("M8[D]")[:, np.newaxis] # shape (n_files, 1)
-
- # Create an array of all days
- # - Expand dimension to match each day using broadcasting
- days = list_days[np.newaxis, :] # shape (1, n_days)
-
- # Use broadcasting to create a boolean matrix indicating which files cover which days
- mask = (files_start_time <= days) & (files_end_time >= days) # shape (n_files, n_days)
-
- # Build a mapping from days to file indices
- # For each day (column), find the indices of files (rows) that cover that day
- dict_days = {}
- filepaths = np.array(filepaths)
- for i, day in enumerate(list_days):
- file_indices = np.where(mask[:, i])[0]
- if file_indices.size > 0:
- dict_days[str(day)] = filepaths[file_indices].tolist()
-
- return dict_days
-
-
- def retrieve_possible_measurement_intervals(metadata):
- """Retrieve list of possible measurements intervals."""
- measurement_intervals = metadata.get("measurement_interval", [])
- return check_measurement_intervals(measurement_intervals)
+ ####---------------------------------------------------------------------------------
+ #### Measurement intervals
 
 
  def drop_timesteps_with_invalid_sample_interval(ds, measurement_intervals, verbose=True, logger=None):
  """Drop timesteps with unexpected sample intervals."""
- # TODO
- # - correct logged sample_interval for trailing seconds. Example (58,59,61,62) converted to 60 s ?
- # - Need to know more how Parsivel software computes sample_interval variable ...
-
- # Retrieve logged sample_interval
- sample_interval = ds["sample_interval"].compute().data
- timesteps = ds["time"].compute().data
+ sample_interval = ds["sample_interval"].to_numpy()
+ timesteps = ds["time"].to_numpy()
  is_valid_sample_interval = np.isin(sample_interval.data, measurement_intervals)
  indices_invalid_sample_interval = np.where(~is_valid_sample_interval)[0]
  if len(indices_invalid_sample_interval) > 0:
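The removed get_files_per_days helper relied on NumPy broadcasting to map each file's (tolerance-padded) time range onto the days it covers; that grouping role is presumably taken over by the new disdrodb/utils/archiving.py listed above. A condensed, self-contained illustration of the broadcasting idea, using synthetic file times rather than package data:

```python
import numpy as np
import pandas as pd

# Synthetic file time ranges (the second file spans midnight)
files_start = np.array(["2024-06-01T00:00:10", "2024-06-01T23:30:00"], dtype="M8[s]")
files_end = np.array(["2024-06-01T23:29:30", "2024-06-02T01:00:00"], dtype="M8[s]")

# All days covered by the files
days = np.asanyarray(
    pd.date_range(files_start.min().astype("M8[D]"), files_end.max().astype("M8[D]"), freq="D"),
).astype("M8[D]")

# Boolean matrix of shape (n_files, n_days): True where file i covers day j
mask = (files_start.astype("M8[D]")[:, None] <= days) & (files_end.astype("M8[D]")[:, None] >= days)

print({str(day): np.where(mask[:, j])[0].tolist() for j, day in enumerate(days)})
# {'2024-06-01': [0, 1], '2024-06-02': [1]}
```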
@@ -124,10 +61,26 @@ def drop_timesteps_with_invalid_sample_interval(ds, measurement_intervals, verbo
  return ds
 
 
- def split_dataset_by_sampling_intervals(ds, measurement_intervals, min_sample_interval=10, min_block_size=5):
+ def split_dataset_by_sampling_intervals(
+ ds,
+ measurement_intervals,
+ min_sample_interval=10,
+ min_block_size=5,
+ time_is_end_interval=True,
+ ):
  """
  Split a dataset into subsets where each subset has a consistent sampling interval.
 
+ Notes
+ -----
+ - Does not modify timesteps (regularization is left to `regularize_timesteps`).
+ - Assumes no duplicated timesteps in the dataset.
+ - If only one measurement interval is specified, no timestep-diff checks are performed.
+ - If multiple measurement intervals are specified:
+ * Raises an error if *none* of the expected intervals appear.
+ * Splits where interval changes.
+ - Segments shorter than `min_block_size` are discarded.
+
  Parameters
  ----------
  ds : xarray.Dataset
@@ -136,30 +89,41 @@ def split_dataset_by_sampling_intervals(ds, measurement_intervals, min_sample_in
  A list of possible primary sampling intervals (in seconds) that the dataset might have.
  min_sample_interval : int, optional
  The minimum expected sampling interval in seconds. Defaults to 10s.
+ This is used to deal with possible trailing seconds errors.
  min_block_size : float, optional
  The minimum number of timesteps with a given sampling interval to be considered.
  Otherwise such portion of data is discarded !
  Defaults to 5 timesteps.
+ time_is_end_interval: bool
+ Whether time refers to the end of the measurement interval.
+ The default is True.
 
  Returns
  -------
- dict
+ dict[int, xr.Dataset]
  A dictionary where keys are the identified sampling intervals (in seconds),
- and values are xarray.Datasets containing only data from those intervals.
+ and values are xarray.Datasets containing only data from those sampling intervals.
  """
  # Define array of possible measurement intervals
  measurement_intervals = np.array(measurement_intervals)
 
+ # Check sorted by time and sort if necessary
+ ds = ensure_sorted_by_time(ds)
+
  # If a single measurement interval expected, return dictionary with input dataset
  if len(measurement_intervals) == 1:
- dict_ds = {measurement_intervals[0]: ds}
+ dict_ds = {int(measurement_intervals[0]): ds}
  return dict_ds
 
- # Check sorted by time and sort if necessary
- ds = ensure_sorted_by_time(ds)
+ # If sample_interval is a dataset variable, use it to define dictionary of datasets
+ if "sample_interval" in ds:
+ return {int(interval): ds.isel(time=ds["sample_interval"] == interval) for interval in measurement_intervals}
+
+ # ---------------------------------------------------------------------------------------.
+ # Otherwise exploit difference between timesteps to identify change point
 
  # Calculate time differences in seconds
- deltadt = np.diff(ds["time"].data).astype("timedelta64[s]").astype(int)
+ deltadt = np.abs(np.diff(ds["time"].data)).astype("timedelta64[s]").astype(int)
 
  # Round each delta to the nearest multiple of 5 (because the smallest possible sample interval is 10 s)
  # - This account for possible trailing seconds of the logger
@@ -175,25 +139,46 @@ def split_dataset_by_sampling_intervals(ds, measurement_intervals, min_sample_in
  if np.all(np.isnan(mapped_intervals)):
  raise ValueError("Impossible to identify timesteps with expected sampling intervals.")
 
+ # Check which measurements intervals are occurring in the dataset
+ uniques = np.unique(mapped_intervals)
+ uniques_intervals = uniques[~np.isnan(uniques)]
+ n_different_intervals_occurring = len(uniques_intervals)
+ if n_different_intervals_occurring == 1:
+ dict_ds = {int(k): ds for k in uniques_intervals}
+ return dict_ds
+
+ # Fill NaNs: decide whether to attach to previous or next interval
+ for i in range(len(mapped_intervals)):
+ if np.isnan(mapped_intervals[i]):
+ # If next exists and is NaN → forward fill
+ if i + 1 < len(mapped_intervals) and np.isnan(mapped_intervals[i + 1]):
+ mapped_intervals[i] = mapped_intervals[i - 1] if i > 0 else mapped_intervals[i + 1]
+ # Otherwise → backward fill (attach to next valid)
+ else:
+ mapped_intervals[i] = (
+ mapped_intervals[i + 1] if i + 1 < len(mapped_intervals) else mapped_intervals[i - 1]
+ )
+
  # Infill np.nan values by using neighbor intervals
  # Forward fill
- for i in range(1, len(mapped_intervals)):
- if np.isnan(mapped_intervals[i]):
- mapped_intervals[i] = mapped_intervals[i - 1]
+ # for i in range(1, len(mapped_intervals)):
+ # if np.isnan(mapped_intervals[i]):
+ # mapped_intervals[i] = mapped_intervals[i - 1]
 
- # Backward fill (in case the first entries were np.nan)
- for i in range(len(mapped_intervals) - 2, -1, -1):
- if np.isnan(mapped_intervals[i]):
- mapped_intervals[i] = mapped_intervals[i + 1]
+ # # Backward fill (in case the first entries were np.nan)
+ # for i in range(len(mapped_intervals) - 2, -1, -1):
+ # if np.isnan(mapped_intervals[i]):
+ # mapped_intervals[i] = mapped_intervals[i + 1]
 
  # Now all intervals are assigned to one of the possible measurement_intervals.
  # Identify boundaries where interval changes
  change_points = np.where(mapped_intervals[:-1] != mapped_intervals[1:])[0] + 1
 
  # Split ds into segments according to change_points
- segments = np.split(np.arange(ds.sizes["time"]), change_points)
+ offset = 1 if time_is_end_interval else 0
+ segments = np.split(np.arange(ds.sizes["time"]), change_points + offset)
 
- # Remove segments with less than 10 points
+ # Remove segments with less than min_block_size elements
  segments = [seg for seg in segments if len(seg) >= min_block_size]
  if len(segments) == 0:
  raise ValueError(
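The new split logic boils down to change-point detection on the per-timestep interval array: find where consecutive mapped intervals differ and split the positional indices there (the `offset` term then decides whether the boundary timestep belongs to the previous or the next segment). A minimal, self-contained illustration with made-up interval values:

```python
import numpy as np

# Mapped sampling interval of each timestep (seconds), after infilling NaNs
mapped_intervals = np.array([30, 30, 30, 30, 60, 60, 60, 30, 30])

# Indices where the interval changes with respect to the previous timestep
change_points = np.where(mapped_intervals[:-1] != mapped_intervals[1:])[0] + 1

# Split the positional index array into contiguous segments of constant interval
segments = np.split(np.arange(mapped_intervals.size), change_points)
print([seg.tolist() for seg in segments])
# [[0, 1, 2, 3], [4, 5, 6], [7, 8]]
```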
@@ -202,23 +187,40 @@ def split_dataset_by_sampling_intervals(ds, measurement_intervals, min_sample_in
  )
  # Define dataset indices for each sampling interva
  dict_sampling_interval_indices = {}
+ used_indices = set()
  for seg in segments:
  # Define the assumed sampling interval of such segment
  start_idx = seg[0]
  segment_sampling_interval = int(mapped_intervals[start_idx])
- if segment_sampling_interval not in dict_sampling_interval_indices:
- dict_sampling_interval_indices[segment_sampling_interval] = [seg]
- else:
- dict_sampling_interval_indices[segment_sampling_interval].append(seg)
+ # Remove any indices that have already been assigned to another interval
+ seg_filtered = seg[~np.isin(seg, list(used_indices))]
+
+ # Only keep segment if it still meets minimum size after filtering
+ if len(seg_filtered) >= min_block_size:
+ if segment_sampling_interval not in dict_sampling_interval_indices:
+ dict_sampling_interval_indices[segment_sampling_interval] = [seg_filtered]
+ else:
+ dict_sampling_interval_indices[segment_sampling_interval].append(seg_filtered)
+
+ # Mark these indices as used
+ used_indices.update(seg_filtered)
+
+ # Concatenate indices for each sampling interval
  dict_sampling_interval_indices = {
- k: np.concatenate(list_indices) for k, list_indices in dict_sampling_interval_indices.items()
+ k: np.concatenate(list_indices)
+ for k, list_indices in dict_sampling_interval_indices.items()
+ if list_indices # Only include if there are valid segments
  }
 
  # Define dictionary of datasets
- dict_ds = {k: ds.isel(time=indices) for k, indices in dict_sampling_interval_indices.items()}
+ dict_ds = {int(k): ds.isel(time=indices) for k, indices in dict_sampling_interval_indices.items()}
  return dict_ds
 
 
+ ####---------------------------------------------------------------------------------
+ #### Timesteps duplicates
+
+
  def has_same_value_over_time(da):
  """
  Check if a DataArray has the same value over all timesteps, considering NaNs as equal.
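Treating NaNs as equal is the subtle part of that check. A small, self-contained sketch of the comparison idea (independent of the package's actual implementation, names invented):

```python
import numpy as np
import xarray as xr

def values_constant_over_time(da: xr.DataArray) -> bool:
    # Compare every timestep against the first one, counting NaN == NaN as a match
    first = da.isel(time=0)
    equal = (da == first) | (da.isnull() & first.isnull())
    return bool(equal.all())

da = xr.DataArray([[1.0, np.nan], [1.0, np.nan]], dims=("time", "x"))
print(values_constant_over_time(da))  # True
```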
@@ -317,6 +319,190 @@ def remove_duplicated_timesteps(ds, ensure_variables_equality=True, logger=None,
  return ds
 
 
+ ####---------------------------------------------------------------------------------
+ #### Timesteps regularization
+
+
+ def get_problematic_timestep_indices(timesteps, sample_interval):
+ """Identify timesteps with missing previous or following timesteps."""
+ previous_time = timesteps - pd.Timedelta(seconds=sample_interval)
+ next_time = timesteps + pd.Timedelta(seconds=sample_interval)
+ idx_previous_missing = np.where(~np.isin(previous_time, timesteps))[0][1:]
+ idx_next_missing = np.where(~np.isin(next_time, timesteps))[0][:-1]
+ idx_isolated_missing = np.intersect1d(idx_previous_missing, idx_next_missing)
+ idx_previous_missing = idx_previous_missing[np.isin(idx_previous_missing, idx_isolated_missing, invert=True)]
+ idx_next_missing = idx_next_missing[np.isin(idx_next_missing, idx_isolated_missing, invert=True)]
+ return idx_previous_missing, idx_next_missing, idx_isolated_missing
+
+
+ def regularize_timesteps(ds, sample_interval, robust=False, add_quality_flag=True, logger=None, verbose=True):
+ """Ensure timesteps match with the sample_interval.
+
+ This function:
+ - drop dataset indices with duplicated timesteps,
+ - but does not add missing timesteps to the dataset.
+ """
+ # Check sorted by time and sort if necessary
+ ds = ensure_sorted_by_time(ds)
+
+ # Convert time to pandas.DatetimeIndex for easier manipulation
+ times = pd.to_datetime(ds["time"].to_numpy())
+
+ # Determine the start and end times
+ start_time = times[0].floor(f"{sample_interval}s")
+ end_time = times[-1].ceil(f"{sample_interval}s")
+
+ # Create the expected time grid
+ expected_times = pd.date_range(start=start_time, end=end_time, freq=f"{sample_interval}s")
+
+ # Convert to numpy arrays
+ times = times.to_numpy(dtype="M8[s]")
+ expected_times = expected_times.to_numpy(dtype="M8[s]")
+
+ # Map original times to the nearest expected times
+ # Calculate the difference between original times and expected times
+ time_deltas = np.abs(times - expected_times[:, None]).astype(int)
+
+ # Find the index of the closest expected time for each original time
+ nearest_indices = np.argmin(time_deltas, axis=0)
+ adjusted_times = expected_times[nearest_indices]
+
+ # Check for duplicates in adjusted times
+ unique_times, counts = np.unique(adjusted_times, return_counts=True)
+ duplicates = unique_times[counts > 1]
+
+ # Initialize time quality flag
+ # - 0 when ok or just rounded to closest 00
+ # - 1 if previous timestep is missing
+ # - 2 if next timestep is missing
+ # - 3 if previous and next timestep is missing
+ # - 4 if solved duplicated timesteps
+ # - 5 if needed to drop duplicated timesteps and select the last
+ flag_previous_missing = 1
+ flag_next_missing = 2
+ flag_isolated_timestep = 3
+ flag_solved_duplicated_timestep = 4
+ flag_dropped_duplicated_timestep = 5
+ qc_flag = np.zeros(adjusted_times.shape)
+
+ # Initialize list with the duplicated timesteps index to drop
+ # - We drop the first occurrence because is likely the shortest interval
+ idx_to_drop = []
+
+ # Attempt to resolve for duplicates
+ if duplicates.size > 0:
+ # Handle duplicates
+ for dup_time in duplicates:
+ # Indices of duplicates
+ dup_indices = np.where(adjusted_times == dup_time)[0]
+ n_duplicates = len(dup_indices)
+ # Define previous and following timestep
+ prev_time = dup_time - pd.Timedelta(seconds=sample_interval)
+ next_time = dup_time + pd.Timedelta(seconds=sample_interval)
+ # Try to find missing slots before and after
+ # - If more than 3 duplicates, impossible to solve !
+ count_solved = 0
+ # If the previous timestep is available, set that one
+ if n_duplicates == 2:
+ if prev_time not in adjusted_times:
+ adjusted_times[dup_indices[0]] = prev_time
+ qc_flag[dup_indices[0]] = flag_solved_duplicated_timestep
+ count_solved += 1
+ elif next_time not in adjusted_times:
+ adjusted_times[dup_indices[-1]] = next_time
+ qc_flag[dup_indices[-1]] = flag_solved_duplicated_timestep
+ count_solved += 1
+ else:
+ pass
+ elif n_duplicates == 3:
+ if prev_time not in adjusted_times:
+ adjusted_times[dup_indices[0]] = prev_time
+ qc_flag[dup_indices[0]] = flag_solved_duplicated_timestep
+ count_solved += 1
+ if next_time not in adjusted_times:
+ adjusted_times[dup_indices[-1]] = next_time
+ qc_flag[dup_indices[-1]] = flag_solved_duplicated_timestep
+ count_solved += 1
+ if count_solved != n_duplicates - 1:
+ idx_to_drop = np.append(idx_to_drop, dup_indices[0:-1])
+ qc_flag[dup_indices[-1]] = flag_dropped_duplicated_timestep
+ msg = (
+ f"Cannot resolve {n_duplicates} duplicated timesteps "
+ f"(after trailing seconds correction) around {dup_time}."
+ )
+ log_warning(logger=logger, msg=msg, verbose=verbose)
+ if robust:
+ raise ValueError(msg)
+
+ # Update the time coordinate (Convert to ns for xarray compatibility)
+ ds = ds.assign_coords({"time": adjusted_times.astype("datetime64[ns]")})
+
+ # Update quality flag values for next and previous timestep is missing
+ if add_quality_flag:
+ idx_previous_missing, idx_next_missing, idx_isolated_missing = get_problematic_timestep_indices(
+ adjusted_times,
+ sample_interval,
+ )
+ qc_flag[idx_previous_missing] = np.maximum(qc_flag[idx_previous_missing], flag_previous_missing)
+ qc_flag[idx_next_missing] = np.maximum(qc_flag[idx_next_missing], flag_next_missing)
+ qc_flag[idx_isolated_missing] = np.maximum(qc_flag[idx_isolated_missing], flag_isolated_timestep)
+
+ # If the first timestep is at 00:00 and currently flagged as previous missing (1), reset to 0
+ # first_time = pd.to_datetime(adjusted_times[0]).time()
+ # first_expected_time = pd.Timestamp("00:00:00").time()
+ # if first_time == first_expected_time and qc_flag[0] == flag_previous_missing:
+ # qc_flag[0] = 0
+
+ # # If the last timestep is flagged and currently flagged as next missing (2), reset it to 0
+ # last_time = pd.to_datetime(adjusted_times[-1]).time()
+ # last_time_expected = (pd.Timestamp("00:00:00") - pd.Timedelta(30, unit="seconds")).time()
+ # # Check if adding one interval would go beyond the end_time
+ # if last_time == last_time_expected and qc_flag[-1] == flag_next_missing:
+ # qc_flag[-1] = 0
+
+ # Assign time quality flag coordinate
+ ds["time_qc"] = xr.DataArray(qc_flag, dims="time")
+ ds = ds.set_coords("time_qc")
+
+ # Add CF attributes for time_qc
+ ds["time_qc"].attrs = {
+ "long_name": "time quality flag",
+ "standard_name": "status_flag",
+ "units": "1",
+ "valid_range": [0, 5],
+ "flag_values": [0, 1, 2, 3, 4, 5],
+ "flag_meanings": (
+ "good_data "
+ "previous_timestep_missing "
+ "next_timestep_missing "
+ "isolated_timestep "
+ "solved_duplicated_timestep "
+ "dropped_duplicated_timestep"
+ ),
+ "comment": (
+ "Quality flag for time coordinate. "
+ "Flag 0: data is good or just rounded to nearest sampling interval. "
+ "Flag 1: previous timestep is missing in the time series. "
+ "Flag 2: next timestep is missing in the time series. "
+ "Flag 3: both previous and next timesteps are missing (isolated timestep). "
+ "Flag 4: timestep was moved from duplicate to fill missing timestep. "
+ "Flag 5: duplicate timestep was dropped, keeping the last occurrence."
+ ),
+ }
+
+ # Drop duplicated timesteps
+ # - Using ds = ds.drop_isel({"time": idx_to_drop.astype(int)}) raise:
+ # --> pandas.errors.InvalidIndexError: Reindexing only valid with uniquely valued Index objects
+ # --> https://github.com/pydata/xarray/issues/6605
+ if len(idx_to_drop) > 0:
+ idx_to_drop = idx_to_drop.astype(int)
+ idx_valid_timesteps = np.arange(0, ds["time"].size)
+ idx_valid_timesteps = np.delete(idx_valid_timesteps, idx_to_drop)
+ ds = ds.isel(time=idx_valid_timesteps)
+ # Return dataset
+ return ds
+
+
  def check_timesteps_regularity(ds, sample_interval, verbose=False, logger=None):
  """Check for the regularity of timesteps."""
  # Check sorted by time and sort if necessary
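At its core, the newly added regularize_timesteps snaps each observed timestep onto a regular grid derived from the sample interval before handling duplicates and assigning the time_qc flag. A self-contained illustration of that nearest-grid assignment with toy timestamps:

```python
import numpy as np
import pandas as pd

sample_interval = 30  # seconds
times = pd.to_datetime(["2024-06-01 00:00:29", "2024-06-01 00:01:01", "2024-06-01 00:01:58"])

# Expected regular grid spanning the observations
start = times[0].floor(f"{sample_interval}s")
end = times[-1].ceil(f"{sample_interval}s")
expected = pd.date_range(start, end, freq=f"{sample_interval}s").to_numpy(dtype="M8[s]")

# Snap each observed time to its nearest grid point
obs = times.to_numpy(dtype="M8[s]")
deltas = np.abs(obs - expected[:, None]).astype(int)  # shape (n_grid, n_obs)
adjusted = expected[np.argmin(deltas, axis=0)]
print(adjusted)
# ['2024-06-01T00:00:30' '2024-06-01T00:01:00' '2024-06-01T00:02:00']
```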
@@ -339,12 +525,14 @@ def check_timesteps_regularity(ds, sample_interval, verbose=False, logger=None):
  fractions = np.round(counts / len(deltadt) * 100, 2)
 
  # Compute stats about expected deltadt
- sample_interval_counts = counts[unique_deltadt == sample_interval].item()
- sample_interval_fraction = fractions[unique_deltadt == sample_interval].item()
+ mask = unique_deltadt == sample_interval
+ sample_interval_counts = counts[mask].item() if mask.any() else 0
+ sample_interval_fraction = fractions[mask].item() if mask.any() else 0.0
 
  # Compute stats about most frequent deltadt
- most_frequent_deltadt_counts = counts[unique_deltadt == most_frequent_deltadt].item()
- most_frequent_deltadt_fraction = fractions[unique_deltadt == most_frequent_deltadt].item()
+ mask = unique_deltadt == most_frequent_deltadt
+ most_frequent_deltadt_counts = counts[mask].item() if mask.any() else 0
+ most_frequent_deltadt_fraction = fractions[mask].item() if mask.any() else 0.0
 
  # Compute stats about unexpected deltadt
  unexpected_intervals = unique_deltadt[unique_deltadt != sample_interval]
@@ -352,13 +540,14 @@ def check_timesteps_regularity(ds, sample_interval, verbose=False, logger=None):
  unexpected_intervals_fractions = fractions[unique_deltadt != sample_interval]
  frequent_unexpected_intervals = unexpected_intervals[unexpected_intervals_fractions > 5]
 
- # Report warning if the samplin_interval deltadt occurs less often than 60 % of times
+ # Report warning if the sampling_interval deltadt occurs less often than 60 % of times
  # -> TODO: maybe only report in stations where the disdro does not log only data when rainy
  if sample_interval_fraction < 60:
  msg = (
  f"The expected (sampling) interval between observations occurs only "
  f"{sample_interval_counts}/{n} times ({sample_interval_fraction} %)."
  )
+ log_warning(logger=logger, msg=msg, verbose=verbose)
 
  # Report warning if a deltadt occurs more often then the sampling interval
  if most_frequent_deltadt != sample_interval:
@@ -372,14 +561,7 @@ def check_timesteps_regularity(ds, sample_interval, verbose=False, logger=None):
 
  # Report with a warning all unexpected deltadt with frequency larger than 5 %
  if len(frequent_unexpected_intervals) > 0:
- msg_parts = ["The following unexpected intervals occur frequently:"]
- for interval in frequent_unexpected_intervals:
- c = unexpected_intervals_counts[unexpected_intervals == interval].item()
- f = unexpected_intervals_fractions[unexpected_intervals == interval].item()
- msg_parts.append(f" {interval} ({f}%) ({c}/{n}) | ")
- msg = " ".join(msg_parts)
-
- msg = "The following time intervals between observations occurs often: "
+ msg = "The following time intervals between observations occur frequently: "
  for interval in frequent_unexpected_intervals:
  c = unexpected_intervals_counts[unexpected_intervals == interval].item()
  f = unexpected_intervals_fractions[unexpected_intervals == interval].item()
@@ -388,7 +570,11 @@ def check_timesteps_regularity(ds, sample_interval, verbose=False, logger=None):
  return ds
 
 
- def finalize_l0c_dataset(ds, sample_interval, verbose=True, logger=None):
+ ####----------------------------------------------------------------------------------------------.
+ #### Wrapper
+
+
+ def _finalize_l0c_dataset(ds, sample_interval, sensor_name, verbose=True, logger=None):
  """Finalize a L0C dataset with unique sampling interval.
 
  It adds the sampling_interval coordinate and it regularizes the timesteps for trailing seconds.
@@ -407,26 +593,45 @@ def finalize_l0c_dataset(ds, sample_interval, verbose=True, logger=None):
  )
 
  # Performs checks about timesteps regularity
+ # - Do not discard anything
+ # - Just log warnings in the log file
  ds = check_timesteps_regularity(ds=ds, sample_interval=sample_interval, verbose=verbose, logger=logger)
+
+ # Shift timesteps to ensure time correspond to start of measurement interval
+ # TODO as function of sensor name
+
+ # Set netCDF dimension order
+ # --> Required for correct encoding !
+ ds = ds.transpose("time", "diameter_bin_center", ...)
+
+ # Set encodings
+ ds = set_l0b_encodings(ds=ds, sensor_name=sensor_name)
+
+ # Update global attributes
+ ds = set_disdrodb_attrs(ds, product="L0C")
  return ds
 
 
- def create_daily_file(day, filepaths, measurement_intervals, ensure_variables_equality=True, logger=None, verbose=True):
+ def create_l0c_datasets(
+ event_info,
+ measurement_intervals,
+ sensor_name,
+ ensure_variables_equality=True,
+ logger=None,
+ verbose=True,
+ ):
  """
- Create a daily file by merging and processing data from multiple filepaths.
+ Create a single dataset by merging and processing data from multiple filepaths.
 
  Parameters
  ----------
- day : str or numpy.datetime64
- The day for which the daily file is to be created.
- Should be in a format that can be converted to numpy.datetime64.
- filepaths : list of str
- List of filepaths to the data files to be processed.
+ event_info : dict
+ Dictionary with start_time, end_time and filepaths keys.
 
  Returns
  -------
- xarray.Dataset
- The processed dataset containing data for the specified day.
+ dict
+ A dictionary with an xarray.Dataset for each measurement interval.
 
  Raises
  ------
@@ -435,50 +640,39 @@ def create_daily_file(day, filepaths, measurement_intervals, ensure_variables_eq
 
  Notes
  -----
- - The function adds a tolerance for searching timesteps
- before and after 00:00 to account for imprecise logging times.
- - It checks that duplicated timesteps have the same raw drop number values.
- - The function infers the sample interval and
- regularizes timesteps to handle trailing seconds.
- - The data is loaded into memory and connections to source files
- are closed before returning the dataset.
+ - Data is loaded into memory and connections to source files are closed before returning the dataset.
+ - Tolerance in input files is used around expected dataset start_time and end_time to account for
+ imprecise logging times and ensuring correct definition of qc_time at files boundaries (e.g. 00:00).
+ - Duplicated timesteps with different raw drop number values are dropped
+ - First occurrence of duplicated timesteps with equal raw drop number values is kept.
+ - Regularizes timesteps to handle trailing seconds.
  """
- import xarray as xr # Load in each process when function is called !
-
  # ---------------------------------------------------------------------------------------.
- # Define start day and end of day
- start_day = np.array(day).astype("M8[D]")
- end_day = start_day + np.array(1, dtype="m8[D]") - np.array(1, dtype="m8[s]") # avoid 00:00 of next day !
+ # Retrieve information
+ start_time = np.array(event_info["start_time"], dtype="M8[s]")
+ end_time = np.array(event_info["end_time"], dtype="M8[s]")
+ filepaths = event_info["filepaths"]
 
- # Add tolerance for searching timesteps before and after 00:00 to account for imprecise logging time
- # - Example: timestep 23:59:30 that should be 00.00 goes into the next day ...
- start_day_tol = start_day - np.array(TOLERANCE_SECONDS, dtype="m8[s]")
- end_day_tol = end_day + np.array(TOLERANCE_SECONDS, dtype="m8[s]")
+ # Define expected dataset time coverage
+ start_time_tol = start_time - np.array(TOLERANCE_SECONDS, dtype="m8[s]")
+ end_time_tol = end_time + np.array(TOLERANCE_SECONDS, dtype="m8[s]")
 
  # ---------------------------------------------------------------------------------------.
  # Open files with data within the provided day and concatenate them
- list_ds = [
- xr.open_dataset(filepath, decode_timedelta=False, chunks=-1, cache=False).sortby("time")
- for filepath in filepaths
- ]
- list_ds = [ds.sel({"time": slice(start_day_tol, end_day_tol)}) for ds in list_ds]
- if len(list_ds) > 1:
- # Concatenate dataset
- # - If some variable are missing in one file, it is filled with NaN. This should not occur anyway.
- # - The resulting dataset can have duplicated timesteps !
- ds = xr.concat(list_ds, dim="time", join="outer", compat="no_conflicts", combine_attrs="override").sortby(
- "time",
- )
- else:
- ds = list_ds[0]
-
- # Compute data
- ds = ds.compute()
+ ds = open_netcdf_files(
+ filepaths,
+ start_time=start_time_tol,
+ end_time=end_time_tol,
+ chunks={},
+ parallel=False,
+ compute=True,
+ )
 
- # Close connection to source files
- _ = [ds.close() for ds in list_ds]
- ds.close()
- del list_ds
+ # If not data for that time block, return empty dictionary
+ # - Can occur when raw files are already by block of months and e.g. here saving to daily blocks !
+ if ds.sizes["time"] == 0:
+ log_info(logger=logger, msg=f"No data between {start_time} and {end_time}.", verbose=verbose)
+ return {}
 
  # ---------------------------------------------------------------------------------------.
  # If sample interval is a dataset variable, drop timesteps with unexpected measurement intervals !
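The refactored function no longer opens and concatenates files by hand; it delegates to open_netcdf_files from disdrodb.api.io, passing the tolerance-padded window. A hedged usage sketch of that call, with the keyword arguments taken from the diff above and hypothetical file paths:

```python
import numpy as np
from disdrodb.api.io import open_netcdf_files

TOLERANCE_SECONDS = 60 * 3

# Illustrative event description; paths and times are invented
event_info = {
    "start_time": "2024-06-01 00:00:00",
    "end_time": "2024-06-01 23:59:59",
    "filepaths": ["/data/L0B/station_20240531.nc", "/data/L0B/station_20240601.nc"],
}

start_time = np.array(event_info["start_time"], dtype="M8[s]")
end_time = np.array(event_info["end_time"], dtype="M8[s]")

# Pad the requested window so trailing-second records near the block edges are included
ds = open_netcdf_files(
    event_info["filepaths"],
    start_time=start_time - np.array(TOLERANCE_SECONDS, dtype="m8[s]"),
    end_time=end_time + np.array(TOLERANCE_SECONDS, dtype="m8[s]"),
    chunks={},
    parallel=False,
    compute=True,
)
```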
@@ -489,9 +683,16 @@ def create_daily_file(day, filepaths, measurement_intervals, ensure_variables_eq
  verbose=verbose,
  logger=logger,
  )
+ n_timesteps = len(ds["time"])
+ if n_timesteps < 3:
+ raise ValueError(f"Only {n_timesteps} timesteps left after removing those with unexpected sample interval.")
 
  # ---------------------------------------------------------------------------------------.
- # Remove duplicated timesteps
+ # Remove duplicated timesteps (before correcting for trailing seconds)
+ # - It checks that duplicated timesteps have the same raw_drop_number values
+ # - If duplicated timesteps have different raw_drop_number values:
+ # --> warning is raised
+ # --> timesteps are dropped
  ds = remove_duplicated_timesteps(
  ds,
  ensure_variables_equality=ensure_variables_equality,
@@ -502,7 +703,7 @@ def create_daily_file(day, filepaths, measurement_intervals, ensure_variables_eq
  # Raise error if less than 3 timesteps left
  n_timesteps = len(ds["time"])
  if n_timesteps < 3:
- raise ValueError(f"{n_timesteps} timesteps left after removing duplicated timesteps.")
+ raise ValueError(f"{n_timesteps} timesteps left after removing duplicated.")
 
  # ---------------------------------------------------------------------------------------.
  # Split dataset by sampling intervals
@@ -513,107 +714,24 @@ def create_daily_file(day, filepaths, measurement_intervals, ensure_variables_eq
  min_block_size=5,
  )
 
- # Log a warning if two sampling intervals are present within a given day
+ # Log a warning if two sampling intervals are present within a given time block
  if len(dict_ds) > 1:
  occuring_sampling_intervals = list(dict_ds)
- msg = f"The dataset contains both sampling intervals {occuring_sampling_intervals}."
+ msg = f"The input files contains these sampling intervals: {occuring_sampling_intervals}."
  log_warning(logger=logger, msg=msg, verbose=verbose)
 
  # ---------------------------------------------------------------------------------------.
  # Finalize L0C datasets
- # - Add sample_interval coordinate
+ # - Add and ensure sample_interval coordinate has just 1 value (not varying with time)
  # - Regularize timesteps for trailing seconds
  dict_ds = {
- sample_interval: finalize_l0c_dataset(
+ sample_interval: _finalize_l0c_dataset(
  ds=ds,
  sample_interval=sample_interval,
+ sensor_name=sensor_name,
  verbose=verbose,
  logger=logger,
- ).sel({"time": slice(start_day, end_day)})
+ ).sel({"time": slice(start_time, end_time)})
  for sample_interval, ds in dict_ds.items()
  }
  return dict_ds
-
-
- # ---------------------------------------------------------------------------------------.
- #### DEPRECATED CODE
-
-
- # def copy_l0b_to_l0c_directory(filepath):
- # """Copy L0B file to L0C directory."""
- # import netCDF4
-
- # # Copy file
- # l0c_filepath = filepath.replace("L0B", "L0C")
- # _ = shutil.copy(filepath, l0c_filepath)
-
- # # Edit DISDRODB product attribute
- # with netCDF4.Dataset(l0c_filepath, mode="a") as nc_file:
- # # Modify the global attribute
- # nc_file.setncattr("disdrodb_product", "L0C")
-
- # def find_isel_common_time(da1, da2):
- # """
- # Find the indices of common time steps between two data arrays.
- #
- # Parameters
- # ----------
- # da1 : xarray.DataArray
- # The first data array with a time coordinate.
- # da2 : xarray.DataArray
- # The second data array with a time coordinate.
- #
- # Returns
- # -------
- # da1_isel : numpy.ndarray
- # Indices of the common time steps in the first data array.
- # da2_isel : numpy.ndarray
- # Indices of the common time steps in the second data array.
- #
- # Notes
- # -----
- # This function assumes that both input data arrays have a "time" coordinate.
- # The function finds the intersection of the time steps in both data arrays
- # and returns the indices of these common time steps for each data array.
- # """
- # intersecting_timesteps = np.intersect1d(da1["time"], da2["time"])
- # da1_isel = np.where(np.isin(da1["time"], intersecting_timesteps))[0]
- # da2_isel = np.where(np.isin(da2["time"], intersecting_timesteps))[0]
- # return da1_isel, da2_isel
-
-
- # def check_same_raw_drop_number_values(list_ds, filepaths):
- # """
- # Check if the 'raw_drop_number' values are the same across multiple datasets.
-
- # This function compares the 'raw_drop_number' values of multiple datasets to ensure they are identical
- # at common timesteps.

- # If any discrepancies are found, a ValueError is raised indicating which files
- # have differing values.

- # Parameters
- # ----------
- # list_ds : list of xarray.Dataset
- # A list of xarray Datasets to be compared.
- # filepaths : list of str
- # A list of file paths corresponding to the datasets in `list_ds`.

- # Raises
- # ------
- # ValueError
- # If 'raw_drop_number' values differ at any common timestep between any two datasets.
- # """
- # # Retrieve variable to compare
- # list_drop_number = [ds["raw_drop_number"].compute() for ds in list_ds]
- # # Compare values
- # combos = list(itertools.combinations(range(len(list_drop_number)), 2))
- # for i, j in combos:
- # da1 = list_drop_number[i]
- # da2 = list_drop_number[j]
- # da1_isel, da2_isel = find_isel_common_time(da1=da1, da2=da2)
- # if not np.all(da1.isel(time=da1_isel).data == da2.isel(time=da2_isel).data):
- # file1 = filepaths[i]
- # file2 = filepaths[i]
- # msg = f"Duplicated timesteps have different values between file {file1} and {file2}"
- # raise ValueError(msg)
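Finally, the renamed entry point create_l0c_datasets consumes an event_info dictionary (start_time, end_time, filepaths) and returns one dataset per identified sampling interval. A hedged usage sketch; paths, sensor name and interval values are illustrative only:

```python
# Hedged usage sketch of the refactored L0C entry point.
# File paths, sensor name and measurement intervals are invented for illustration.
from disdrodb.l0.l0c_processing import create_l0c_datasets

event_info = {
    "start_time": "2024-06-01 00:00:00",
    "end_time": "2024-06-01 23:59:59",
    "filepaths": ["/data/L0B/station_20240531.nc", "/data/L0B/station_20240601.nc"],
}

dict_ds = create_l0c_datasets(
    event_info=event_info,
    measurement_intervals=[30, 60],  # possible sampling intervals from station metadata
    sensor_name="PARSIVEL2",
    ensure_variables_equality=True,
    verbose=True,
)

# One finalized dataset per sampling interval found in the input files
for sample_interval, ds in dict_ds.items():
    print(sample_interval, ds.sizes["time"])
```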