disdrodb 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. disdrodb/__init__.py +68 -34
  2. disdrodb/_config.py +5 -4
  3. disdrodb/_version.py +16 -3
  4. disdrodb/accessor/__init__.py +20 -0
  5. disdrodb/accessor/methods.py +125 -0
  6. disdrodb/api/checks.py +177 -24
  7. disdrodb/api/configs.py +3 -3
  8. disdrodb/api/info.py +13 -13
  9. disdrodb/api/io.py +281 -22
  10. disdrodb/api/path.py +184 -195
  11. disdrodb/api/search.py +18 -9
  12. disdrodb/cli/disdrodb_create_summary.py +103 -0
  13. disdrodb/cli/disdrodb_create_summary_station.py +91 -0
  14. disdrodb/cli/disdrodb_run_l0.py +1 -1
  15. disdrodb/cli/disdrodb_run_l0_station.py +1 -1
  16. disdrodb/cli/disdrodb_run_l0a_station.py +1 -1
  17. disdrodb/cli/disdrodb_run_l0b.py +1 -1
  18. disdrodb/cli/disdrodb_run_l0b_station.py +3 -3
  19. disdrodb/cli/disdrodb_run_l0c.py +1 -1
  20. disdrodb/cli/disdrodb_run_l0c_station.py +3 -3
  21. disdrodb/cli/disdrodb_run_l1_station.py +2 -2
  22. disdrodb/cli/disdrodb_run_l2e_station.py +2 -2
  23. disdrodb/cli/disdrodb_run_l2m_station.py +2 -2
  24. disdrodb/configs.py +149 -4
  25. disdrodb/constants.py +61 -0
  26. disdrodb/data_transfer/download_data.py +127 -11
  27. disdrodb/etc/configs/attributes.yaml +339 -0
  28. disdrodb/etc/configs/encodings.yaml +473 -0
  29. disdrodb/etc/products/L1/global.yaml +13 -0
  30. disdrodb/etc/products/L2E/10MIN.yaml +12 -0
  31. disdrodb/etc/products/L2E/1MIN.yaml +1 -0
  32. disdrodb/etc/products/L2E/global.yaml +22 -0
  33. disdrodb/etc/products/L2M/10MIN.yaml +12 -0
  34. disdrodb/etc/products/L2M/GAMMA_ML.yaml +8 -0
  35. disdrodb/etc/products/L2M/NGAMMA_GS_LOG_ND_MAE.yaml +6 -0
  36. disdrodb/etc/products/L2M/NGAMMA_GS_ND_MAE.yaml +6 -0
  37. disdrodb/etc/products/L2M/NGAMMA_GS_Z_MAE.yaml +6 -0
  38. disdrodb/etc/products/L2M/global.yaml +26 -0
  39. disdrodb/issue/writer.py +2 -0
  40. disdrodb/l0/__init__.py +13 -0
  41. disdrodb/l0/configs/LPM/l0b_cf_attrs.yml +4 -4
  42. disdrodb/l0/configs/PARSIVEL/l0b_cf_attrs.yml +1 -1
  43. disdrodb/l0/configs/PARSIVEL/l0b_encodings.yml +3 -3
  44. disdrodb/l0/configs/PARSIVEL/raw_data_format.yml +1 -1
  45. disdrodb/l0/configs/PARSIVEL2/l0b_cf_attrs.yml +5 -5
  46. disdrodb/l0/configs/PARSIVEL2/l0b_encodings.yml +3 -3
  47. disdrodb/l0/configs/PARSIVEL2/raw_data_format.yml +1 -1
  48. disdrodb/l0/configs/PWS100/l0b_cf_attrs.yml +4 -4
  49. disdrodb/l0/configs/PWS100/raw_data_format.yml +1 -1
  50. disdrodb/l0/l0a_processing.py +37 -32
  51. disdrodb/l0/l0b_nc_processing.py +118 -8
  52. disdrodb/l0/l0b_processing.py +30 -65
  53. disdrodb/l0/l0c_processing.py +369 -259
  54. disdrodb/l0/readers/LPM/ARM/ARM_LPM.py +7 -0
  55. disdrodb/l0/readers/LPM/NETHERLANDS/DELFT_LPM_NC.py +66 -0
  56. disdrodb/l0/readers/LPM/SLOVENIA/{CRNI_VRH.py → UL.py} +3 -0
  57. disdrodb/l0/readers/LPM/SWITZERLAND/INNERERIZ_LPM.py +195 -0
  58. disdrodb/l0/readers/PARSIVEL/GPM/PIERS.py +0 -2
  59. disdrodb/l0/readers/PARSIVEL/JAPAN/JMA.py +4 -1
  60. disdrodb/l0/readers/PARSIVEL/NCAR/PECAN_MOBILE.py +1 -1
  61. disdrodb/l0/readers/PARSIVEL/NCAR/VORTEX2_2009.py +1 -1
  62. disdrodb/l0/readers/PARSIVEL2/ARM/ARM_PARSIVEL2.py +4 -0
  63. disdrodb/l0/readers/PARSIVEL2/BELGIUM/ILVO.py +168 -0
  64. disdrodb/l0/readers/PARSIVEL2/CANADA/UQAM_NC.py +69 -0
  65. disdrodb/l0/readers/PARSIVEL2/DENMARK/DTU.py +165 -0
  66. disdrodb/l0/readers/PARSIVEL2/FINLAND/FMI_PARSIVEL2.py +69 -0
  67. disdrodb/l0/readers/PARSIVEL2/FRANCE/ENPC_PARSIVEL2.py +255 -134
  68. disdrodb/l0/readers/PARSIVEL2/FRANCE/OSUG.py +525 -0
  69. disdrodb/l0/readers/PARSIVEL2/FRANCE/SIRTA_PARSIVEL2.py +1 -1
  70. disdrodb/l0/readers/PARSIVEL2/GPM/GCPEX.py +9 -7
  71. disdrodb/l0/readers/PARSIVEL2/KIT/BURKINA_FASO.py +1 -1
  72. disdrodb/l0/readers/PARSIVEL2/KIT/TEAMX.py +123 -0
  73. disdrodb/l0/readers/PARSIVEL2/{NETHERLANDS/DELFT.py → MPI/BCO_PARSIVEL2.py} +41 -71
  74. disdrodb/l0/readers/PARSIVEL2/MPI/BOWTIE.py +220 -0
  75. disdrodb/l0/readers/PARSIVEL2/NASA/APU.py +120 -0
  76. disdrodb/l0/readers/PARSIVEL2/NASA/LPVEX.py +109 -0
  77. disdrodb/l0/readers/PARSIVEL2/NCAR/FARM_PARSIVEL2.py +1 -0
  78. disdrodb/l0/readers/PARSIVEL2/NCAR/PECAN_FP3.py +1 -1
  79. disdrodb/l0/readers/PARSIVEL2/NCAR/PERILS_MIPS.py +126 -0
  80. disdrodb/l0/readers/PARSIVEL2/NCAR/PERILS_PIPS.py +165 -0
  81. disdrodb/l0/readers/PARSIVEL2/NCAR/VORTEX_SE_2016_P2.py +1 -1
  82. disdrodb/l0/readers/PARSIVEL2/NCAR/VORTEX_SE_2016_PIPS.py +20 -12
  83. disdrodb/l0/readers/PARSIVEL2/NETHERLANDS/DELFT_NC.py +5 -0
  84. disdrodb/l0/readers/PARSIVEL2/SPAIN/CENER.py +144 -0
  85. disdrodb/l0/readers/PARSIVEL2/SPAIN/CR1000DL.py +201 -0
  86. disdrodb/l0/readers/PARSIVEL2/SPAIN/LIAISE.py +137 -0
  87. disdrodb/l0/readers/PARSIVEL2/USA/C3WE.py +146 -0
  88. disdrodb/l0/readers/PWS100/FRANCE/ENPC_PWS100.py +105 -99
  89. disdrodb/l0/readers/PWS100/FRANCE/ENPC_PWS100_SIRTA.py +151 -0
  90. disdrodb/l1/__init__.py +5 -0
  91. disdrodb/l1/fall_velocity.py +46 -0
  92. disdrodb/l1/filters.py +34 -20
  93. disdrodb/l1/processing.py +46 -45
  94. disdrodb/l1/resampling.py +77 -66
  95. disdrodb/l1_env/routines.py +18 -3
  96. disdrodb/l2/__init__.py +7 -0
  97. disdrodb/l2/empirical_dsd.py +58 -10
  98. disdrodb/l2/processing.py +268 -117
  99. disdrodb/metadata/checks.py +132 -125
  100. disdrodb/metadata/standards.py +3 -1
  101. disdrodb/psd/fitting.py +631 -345
  102. disdrodb/psd/models.py +9 -6
  103. disdrodb/routines/__init__.py +54 -0
  104. disdrodb/{l0/routines.py → routines/l0.py} +316 -355
  105. disdrodb/{l1/routines.py → routines/l1.py} +76 -116
  106. disdrodb/routines/l2.py +1019 -0
  107. disdrodb/{routines.py → routines/wrappers.py} +98 -10
  108. disdrodb/scattering/__init__.py +16 -4
  109. disdrodb/scattering/axis_ratio.py +61 -37
  110. disdrodb/scattering/permittivity.py +504 -0
  111. disdrodb/scattering/routines.py +746 -184
  112. disdrodb/summary/__init__.py +17 -0
  113. disdrodb/summary/routines.py +4196 -0
  114. disdrodb/utils/archiving.py +434 -0
  115. disdrodb/utils/attrs.py +68 -125
  116. disdrodb/utils/cli.py +5 -5
  117. disdrodb/utils/compression.py +30 -1
  118. disdrodb/utils/dask.py +121 -9
  119. disdrodb/utils/dataframe.py +61 -7
  120. disdrodb/utils/decorators.py +31 -0
  121. disdrodb/utils/directories.py +35 -15
  122. disdrodb/utils/encoding.py +37 -19
  123. disdrodb/{l2 → utils}/event.py +15 -173
  124. disdrodb/utils/logger.py +14 -7
  125. disdrodb/utils/manipulations.py +81 -0
  126. disdrodb/utils/routines.py +166 -0
  127. disdrodb/utils/subsetting.py +214 -0
  128. disdrodb/utils/time.py +35 -177
  129. disdrodb/utils/writer.py +20 -7
  130. disdrodb/utils/xarray.py +5 -4
  131. disdrodb/viz/__init__.py +13 -0
  132. disdrodb/viz/plots.py +398 -0
  133. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/METADATA +4 -3
  134. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/RECORD +139 -98
  135. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/entry_points.txt +2 -0
  136. disdrodb/l1/encoding_attrs.py +0 -642
  137. disdrodb/l2/processing_options.py +0 -213
  138. disdrodb/l2/routines.py +0 -868
  139. /disdrodb/l0/readers/PARSIVEL/SLOVENIA/{UL_FGG.py → UL.py} +0 -0
  140. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/WHEEL +0 -0
  141. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/licenses/LICENSE +0 -0
  142. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/top_level.txt +0 -0
disdrodb/utils/archiving.py ADDED
@@ -0,0 +1,434 @@
+ # -----------------------------------------------------------------------------.
+ # Copyright (c) 2021-2023 DISDRODB developers
+ #
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
+ # -----------------------------------------------------------------------------.
+ """Utility functions for DISDRODB product archiving."""
+ import datetime
+
+ import numpy as np
+ import pandas as pd
+
+ from disdrodb.api.info import get_start_end_time_from_filepaths
+ from disdrodb.api.io import open_netcdf_files
+ from disdrodb.utils.event import group_timesteps_into_event
+ from disdrodb.utils.time import (
+     ensure_sorted_by_time,
+     ensure_timedelta_seconds,
+ )
+
+ ####---------------------------------------------------------------------------------
+ #### Time blocks
+
+
+ def check_freq(freq: str) -> str:
+     """Check the validity of the freq argument."""
+     valid_freq = ["none", "year", "season", "quarter", "month", "day", "hour"]
+     if not isinstance(freq, str):
+         raise TypeError("'freq' must be a string.")
+     if freq not in valid_freq:
+         raise ValueError(
+             f"Invalid 'freq' {freq!r}. Must be one of: {valid_freq}.",
+         )
+     return freq
+
+
+ def generate_time_blocks(
+     start_time: np.datetime64,
+     end_time: np.datetime64,
+     freq: str,
+     inclusive_end_time: bool = True,
+ ) -> np.ndarray:
+     """Generate time blocks between ``start_time`` and ``end_time`` for a given frequency.
+
+     Parameters
+     ----------
+     start_time : numpy.datetime64
+         Inclusive start of the overall time range.
+     end_time : numpy.datetime64
+         End of the overall time range. Inclusive by default (see the ``inclusive_end_time`` argument).
+     freq : str
+         Frequency specifier. Accepted values are:
+         - 'none' : return a single block [start_time, end_time]
+         - 'hour' : split into hourly blocks
+         - 'day' : split into daily blocks
+         - 'month' : split into calendar months
+         - 'quarter' : split into calendar quarters
+         - 'year' : split into calendar years
+         - 'season' : split into meteorological seasons (DJF, MAM, JJA, SON)
+     inclusive_end_time : bool
+         If False, a trailing block starting exactly at ``end_time`` is removed.
+         The default is True.
+
+     Returns
+     -------
+     numpy.ndarray
+         Array of shape (n, 2) with dtype datetime64[s], where each row is [block_start, block_end].
+
+     """
+     freq = check_freq(freq)
+     if freq == "none":
+         return np.array([[start_time, end_time]], dtype="datetime64[s]")
+
+     # Mapping from the custom freq values to pandas frequency codes
+     freq_map = {
+         "hour": "h",
+         "day": "d",
+         "month": "M",
+         "quarter": "Q",
+         "year": "Y",
+         "season": "Q-FEB",  # seasons DJF, MAM, JJA, SON
+     }
+
+     # Define periods
+     periods = pd.period_range(start=start_time, end=end_time, freq=freq_map[freq])
+
+     # Create time blocks
+     blocks = []
+     for period in periods:
+         start = period.start_time.to_datetime64().astype("datetime64[s]")
+         if freq == "quarter":
+             end = period.end_time.floor("s").to_datetime64().astype("datetime64[s]")
+         else:
+             end = period.end_time.to_datetime64().astype("datetime64[s]")
+         blocks.append([start, end])
+     blocks = np.array(blocks, dtype="datetime64[s]")
+
+     # Optionally drop a trailing block that starts exactly at end_time
+     if not inclusive_end_time and len(blocks) > 0 and blocks[-1, 0] == end_time:
+         blocks = blocks[:-1]
+     return blocks
+
+
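As a usage sketch (illustrative values, not part of the diff; assumes disdrodb >= 0.1.4 is installed so the new module is importable):

    import numpy as np
    from disdrodb.utils.archiving import generate_time_blocks

    # Split mid-January to mid-March 2023 into calendar months
    blocks = generate_time_blocks(
        start_time=np.datetime64("2023-01-15T00:00:00"),
        end_time=np.datetime64("2023-03-10T00:00:00"),
        freq="month",
    )
    print(blocks.shape)  # (3, 2): one [block_start, block_end] row per month touched
    print(blocks[0])     # ['2023-01-01T00:00:00' '2023-01-31T23:59:59']

Note that block bounds snap to calendar boundaries, so the first block starts before the requested start_time.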
+ ####----------------------------------------------------------------------------
+ #### Event/Time partitioning
+ def identify_events(
+     filepaths,
+     parallel=False,
+     min_drops=5,
+     neighbor_min_size=2,
+     neighbor_time_interval="5MIN",
+     event_max_time_gap="6H",
+     event_min_duration="5MIN",
+     event_min_size=3,
+ ):
+     """Return a list of rainy events.
+
+     Rainy timesteps are defined as those where N > min_drops.
+     Isolated rainy timesteps (based on the neighborhood criteria) are removed.
+     Then, consecutive rainy timesteps are grouped into the same event if the time gap between them does not
+     exceed `event_max_time_gap`. Finally, events that do not meet the minimum size or duration
+     requirements are filtered out.
+
+     Parameters
+     ----------
+     filepaths : list
+         List of L1C file paths.
+     parallel : bool
+         Whether to load the files in parallel.
+         Set parallel=True only in a multiprocessing environment.
+         The default is False.
+     min_drops : int
+         Minimum drop count N for a timestep to be considered rainy. The default is 5.
+     neighbor_time_interval : str
+         The time interval around a given timestep that defines its neighborhood.
+         Only timesteps that fall within this time interval before or after a timestep are considered neighbors.
+     neighbor_min_size : int, optional
+         The minimum number of neighboring timesteps required within `neighbor_time_interval` for a
+         timestep to be considered non-isolated. Isolated timesteps are removed.
+         - If `neighbor_min_size=0`, no timestep is considered isolated and no filtering occurs.
+         - If `neighbor_min_size=1`, the timestep must have at least one neighbor within `neighbor_time_interval`.
+         - If `neighbor_min_size=2`, the timestep must have at least two neighbors within `neighbor_time_interval`.
+         The default is 2.
+     event_max_time_gap : str
+         The maximum time gap between two timesteps for them to be considered part of the same event.
+         This parameter is used to group timesteps into events.
+     event_min_duration : str
+         The minimum duration an event must span. Events shorter than this duration are discarded.
+     event_min_size : int, optional
+         The minimum number of valid timesteps required for an event. The default is 3.
+
+     Returns
+     -------
+     list of dict
+         A list of events, where each event is represented as a dictionary with keys:
+         - "start_time": np.datetime64, start time of the event
+         - "end_time": np.datetime64, end time of the event
+         - "duration": np.timedelta64, duration of the event
+         - "n_timesteps": int, number of valid timesteps in the event
+     """
+     # Open datasets (in parallel if requested)
+     ds = open_netcdf_files(filepaths, variables=["time", "N"], parallel=parallel, compute=True)
+     # Sort dataset by time
+     ds = ensure_sorted_by_time(ds)
+     # Define candidate timesteps to group into events
+     idx_valid = ds["N"].to_numpy() > min_drops
+     timesteps = ds["time"].to_numpy()[idx_valid]
+     # Define event list
+     event_list = group_timesteps_into_event(
+         timesteps=timesteps,
+         neighbor_min_size=neighbor_min_size,
+         neighbor_time_interval=neighbor_time_interval,
+         event_max_time_gap=event_max_time_gap,
+         event_min_duration=event_min_duration,
+         event_min_size=event_min_size,
+     )
+     del ds
+     return event_list
+
+
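A hedged call sketch for the event detection (the file list below is hypothetical; identify_events loads real L1C files to inspect drop counts):

    from disdrodb.utils.archiving import identify_events

    filepaths = [...]  # hypothetical list of L1C netCDF file paths

    events = identify_events(
        filepaths,
        min_drops=5,                    # a timestep is rainy when N > 5
        neighbor_time_interval="5MIN",  # neighborhood used to drop isolated timesteps
        event_max_time_gap="6H",        # gaps longer than 6 hours split events
    )
    for event in events:
        print(event["start_time"], event["end_time"], event["n_timesteps"])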
+ def identify_time_partitions(start_times, end_times, freq: str) -> list[dict]:
+     """Identify the set of time blocks covered by files.
+
+     The result is a minimal, sorted, and unique set of time partitions.
+     `start_times` and `end_times` can be derived using `get_start_end_time_from_filepaths`.
+
+     Parameters
+     ----------
+     start_times : numpy.ndarray of datetime64[s]
+         Array of inclusive start times for each file.
+     end_times : numpy.ndarray of datetime64[s]
+         Array of inclusive end times for each file.
+     freq : {'none', 'hour', 'day', 'month', 'quarter', 'season', 'year'}
+         Frequency determining the granularity of candidate blocks.
+         See `generate_time_blocks` for more details.
+
+     Returns
+     -------
+     list of dict
+         A list of dictionaries, each containing:
+
+         - `start_time` (numpy.datetime64[s])
+           Inclusive start of a time block.
+         - `end_time` (numpy.datetime64[s])
+           Inclusive end of a time block.
+
+         Only those blocks that overlap at least one file's interval are returned.
+         The list is sorted by `start_time` and contains no duplicate blocks.
+     """
+     # Define the overall time coverage of the files
+     start_time, end_time = start_times.min(), end_times.max()
+
+     # Compute candidate time blocks
+     blocks = generate_time_blocks(start_time, end_time, freq=freq)
+
+     # Select only the time blocks overlapping at least one file
+     mask = (blocks[:, 0][:, None] <= end_times) & (blocks[:, 1][:, None] >= start_times)
+     blocks = blocks[mask.any(axis=1)]
+
+     # Ensure sorted unique time blocks
+     order = np.argsort(blocks[:, 0])
+     blocks = np.unique(blocks[order], axis=0)
+
+     # Convert to a list of dictionaries
+     list_time_blocks = [{"start_time": start_time, "end_time": end_time} for start_time, end_time in blocks]
+     return list_time_blocks
+
+
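A self-contained sketch of the overlap selection (illustrative times; assumes the module is importable):

    import numpy as np
    from disdrodb.utils.archiving import identify_time_partitions

    # Two files: one in January, one in March (nothing in February)
    start_times = np.array(["2023-01-10T00:00:00", "2023-03-05T12:00:00"], dtype="datetime64[s]")
    end_times = np.array(["2023-01-10T23:59:59", "2023-03-06T11:59:59"], dtype="datetime64[s]")

    partitions = identify_time_partitions(start_times, end_times, freq="month")
    for p in partitions:  # February is skipped: no file overlaps it
        print(p["start_time"], "->", p["end_time"])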
+ def define_temporal_partitions(filepaths, strategy, parallel, strategy_options):
+     """Define temporal file processing partitions.
+
+     Parameters
+     ----------
+     filepaths : list
+         List of file paths to be processed.
+
+     strategy : str
+         Which partitioning strategy to apply:
+
+         - ``'time_block'`` defines fixed time intervals (e.g. monthly) covering the input files.
+         - ``'event'`` detects clusters of precipitation ("events").
+
+     parallel : bool
+         If True, parallel data loading is used to identify events.
+
+     strategy_options : dict
+         Dictionary with strategy-specific parameters.
+
+         If ``strategy == 'time_block'``, supported options are:
+
+         - ``freq``: Time unit for blocks. One of {'year', 'season', 'month', 'day'}.
+
+         See `identify_time_partitions` for more information.
+
+         If ``strategy == 'event'``, supported options are:
+
+         - ``min_drops`` : int
+             Minimum number of drops to consider a timestep.
+         - ``neighbor_min_size`` : int
+             Minimum cluster size for merging neighboring events.
+         - ``neighbor_time_interval`` : str
+             Time window (e.g. "5MIN") to merge adjacent clusters.
+         - ``event_max_time_gap`` : str
+             Maximum allowed gap (e.g. "6H") within a single event.
+         - ``event_min_duration`` : str
+             Minimum total duration (e.g. "5MIN") of an event.
+         - ``event_min_size`` : int
+             Minimum number of records in an event.
+
+         See `identify_events` for more information.
+
+     Returns
+     -------
+     list
+         A list of dictionaries, each containing:
+
+         - ``start_time`` (numpy.datetime64[s])
+           Inclusive start of an event or time block.
+         - ``end_time`` (numpy.datetime64[s])
+           Inclusive end of an event or time block.
+
+     Notes
+     -----
+     - The ``'event'`` strategy requires loading data into memory to identify clusters.
+     - The ``'time_block'`` strategy can operate on metadata alone, without full data loading.
+     - The ``'event'`` strategy implicitly selects which files to process.
+     - The ``'time_block'`` strategy performs no selection of the files to process.
+     """
+     if strategy not in ["time_block", "event"]:
+         raise ValueError(f"Unknown strategy: {strategy!r}. Must be 'time_block' or 'event'.")
+     if strategy == "event":
+         return identify_events(filepaths, parallel=parallel, **strategy_options)
+
+     start_times, end_times = get_start_end_time_from_filepaths(filepaths)
+     return identify_time_partitions(start_times=start_times, end_times=end_times, **strategy_options)
+
+
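A hedged sketch of the two strategies (file paths hypothetical; the 'event' branch opens the files, so it only runs against a real archive):

    from disdrodb.utils.archiving import define_temporal_partitions

    filepaths = [...]  # hypothetical list of L1C file paths

    # Metadata-only partitioning into monthly blocks
    monthly_blocks = define_temporal_partitions(
        filepaths,
        strategy="time_block",
        parallel=False,
        strategy_options={"freq": "month"},
    )

    # Data-driven event partitioning
    events = define_temporal_partitions(
        filepaths,
        strategy="event",
        parallel=False,
        strategy_options={"min_drops": 5, "event_max_time_gap": "6H"},
    )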
+ ####----------------------------------------------------------------------------
+ #### Filepaths partitioning
+
+
+ def _map_files_to_blocks(files_start_time, files_end_time, filepaths, block_starts, block_ends):
+     """Map each time block to the list of filepaths overlapping it."""
+     # Use broadcasting to create a boolean matrix indicating which files cover which time block
+     # Broadcasting: (n_files, n_blocks)
+     mask = (files_start_time[:, None] <= block_ends[None, :]) & (files_end_time[:, None] >= block_starts[None, :])
+     # Create a list with a dictionary for each block
+     filepaths = np.array(filepaths)
+     results = []
+     for i, (start, end) in enumerate(zip(block_starts, block_ends)):
+         indices = np.where(mask[:, i])[0]
+         if indices.size > 0:
+             results.append(
+                 {
+                     "start_time": start.astype(datetime.datetime),
+                     "end_time": end.astype(datetime.datetime),
+                     "filepaths": filepaths[indices].tolist(),
+                 },
+             )
+     return results
+
+
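The overlap test uses the standard interval-intersection rule: a file [fs, fe] overlaps a block [bs, be] iff fs <= be and fe >= bs. A self-contained check of the broadcasting (plain NumPy, no disdrodb required):

    import numpy as np

    files_start = np.array(["2023-01-01T00:00:00", "2023-01-03T00:00:00"], dtype="M8[s]")
    files_end = np.array(["2023-01-01T23:59:59", "2023-01-03T23:59:59"], dtype="M8[s]")
    block_starts = np.array(["2023-01-01T00:00:00"], dtype="M8[s]")
    block_ends = np.array(["2023-01-02T23:59:59"], dtype="M8[s]")

    # (n_files, n_blocks) overlap matrix, as in _map_files_to_blocks
    mask = (files_start[:, None] <= block_ends[None, :]) & (files_end[:, None] >= block_starts[None, :])
    print(mask[:, 0])  # [ True False]: only the first file falls inside the block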
+ def get_files_partitions(list_partitions, filepaths, sample_interval, accumulation_interval, rolling):  # noqa: ARG001
+     """
+     Provide information about the files required for each event.
+
+     For each event in `list_partitions`, this function identifies the file paths from `filepaths` that
+     overlap with the event period, adjusted by the `accumulation_interval`. The event period is
+     extended backward or forward based on the `rolling` parameter.
+
+     Parameters
+     ----------
+     list_partitions : list of dict
+         List of events, where each event is a dictionary containing at least 'start_time' and 'end_time'
+         keys with `numpy.datetime64` values.
+     filepaths : list of str
+         List of file paths corresponding to data files.
+     sample_interval : numpy.timedelta64 or int
+         The sample interval of the input dataset.
+     accumulation_interval : numpy.timedelta64 or int
+         Time interval used to adjust the event period for accumulation. If an integer is provided, it is
+         assumed to be in seconds.
+     rolling : bool
+         If True, adjust the event period backward by `accumulation_interval` (rolling backward).
+         If False, adjust forward (aggregate forward).
+
+     Returns
+     -------
+     list of dict
+         A list where each element is a dictionary containing:
+         - 'start_time': Adjusted start time of the event (`datetime.datetime`).
+         - 'end_time': Adjusted end time of the event (`datetime.datetime`).
+         - 'filepaths': List of file paths overlapping with the adjusted event period.
+
+     """
+     if len(filepaths) == 0 or len(list_partitions) == 0:
+         return []
+
+     # Ensure sample_interval and accumulation_interval are numpy.timedelta64
+     accumulation_interval = ensure_timedelta_seconds(accumulation_interval)
+     sample_interval = ensure_timedelta_seconds(sample_interval)
+
+     # Define the offset on the event end_time
+     offset = accumulation_interval if sample_interval != accumulation_interval else ensure_timedelta_seconds(0)
+
+     # Retrieve file start_time and end_time
+     files_start_time, files_end_time = get_start_end_time_from_filepaths(filepaths)
+
+     # Retrieve partition blocks start and end time arrays
+     block_starts = np.array([p["start_time"] for p in list_partitions]).astype("M8[s]")
+     block_ends = np.array([p["end_time"] for p in list_partitions]).astype("M8[s]")
+
+     # Add the optional offset for resampling
+     # TODO: expanding the partition time should be done only at the L1 stage when resampling.
+     # In disdrodb, the reported time is the time at the start of the accumulation period.
+     # If sensors report time at the end of the measurement interval, we might be reporting time
+     # with an inaccuracy equal to the sensor measurement interval.
+     # We could correct for that at the L0C stage already.
+     block_ends = block_ends + offset
+
+     # Map filepaths to the corresponding time blocks
+     list_event_info = _map_files_to_blocks(files_start_time, files_end_time, filepaths, block_starts, block_ends)
+     return list_event_info
+
+
+ def get_files_per_time_block(filepaths, freq="day", tolerance_seconds=120):
+     """
+     Organize files into the time blocks they cover based on their start and end times.
+
+     Parameters
+     ----------
+     filepaths : list of str
+         List of file paths to be processed.
+     freq : str
+         Time unit of the blocks (see `generate_time_blocks`). The default is "day".
+     tolerance_seconds : int
+         Tolerance added on both sides of each file's time coverage to account for
+         imprecise time logging by the sensors. The default is 120 seconds.
+
+     Returns
+     -------
+     list of dict
+         A list where each element is a dictionary with the 'start_time' and 'end_time'
+         of a time block and the 'filepaths' covering it.
+
+     """
+     # Return an empty list if no filepaths are provided
+     if len(filepaths) == 0:
+         return []
+
+     # Retrieve file start_time and end_time
+     files_start_time, files_end_time = get_start_end_time_from_filepaths(filepaths)
+
+     # Add a tolerance to account for imprecise time logging by the sensors
+     # - Example: a 23:59:30 timestep might be logged as 00:00 and end up in the next day's file
+     files_start_time = files_start_time - np.array(tolerance_seconds, dtype="m8[s]")
+     files_end_time = files_end_time + np.array(tolerance_seconds, dtype="m8[s]")
+
+     # Identify candidate blocks
+     list_partitions = identify_time_partitions(
+         start_times=files_start_time,
+         end_times=files_end_time,
+         freq=freq,
+     )
+     block_starts = np.array([b["start_time"] for b in list_partitions]).astype("M8[s]")
+     block_ends = np.array([b["end_time"] for b in list_partitions]).astype("M8[s]")
+
+     # Map filepaths to the corresponding time blocks
+     list_event_info = _map_files_to_blocks(files_start_time, files_end_time, filepaths, block_starts, block_ends)
+     return list_event_info
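The tolerance widening is plain datetime arithmetic; a self-contained check of its effect (NumPy only):

    import numpy as np

    files_start_time = np.array(["2023-01-01T23:59:30"], dtype="M8[s]")
    tolerance_seconds = 120

    # Widen the coverage so a 23:59:30 timestep logged as 00:00:00
    # still lands in the previous day's block
    widened = files_start_time - np.array(tolerance_seconds, dtype="m8[s]")
    print(widened[0])  # 2023-01-01T23:57:30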
disdrodb/utils/attrs.py CHANGED
@@ -18,15 +18,26 @@
  # -----------------------------------------------------------------------------.
  """DISDRODB netCDF4 attributes utilities."""
  import datetime
+ import os

- from disdrodb import ARCHIVE_VERSION, CONVENTIONS, SOFTWARE_VERSION
+ from disdrodb.constants import ARCHIVE_VERSION, CONVENTIONS, COORDINATES, SOFTWARE_VERSION
+ from disdrodb.utils.yaml import read_yaml


  ####---------------------------------------------------------------------.
- #### Variable attributes
+ #### Variable and coordinates attributes
+
+
+ def get_attrs_dict():
+     """Get attributes dictionary for DISDRODB product variables and coordinates."""
+     import disdrodb
+
+     configs_path = os.path.join(disdrodb.__root_path__, "disdrodb", "etc", "configs")
+     attrs_dict = read_yaml(os.path.join(configs_path, "attributes.yaml"))
+     return attrs_dict


  def set_attrs(ds, attrs_dict):
-     """Set attributes to the variables of the xr.Dataset."""
+     """Set attributes to the variables and coordinates of the xr.Dataset."""
      for var in attrs_dict:
          if var in ds:
              ds[var].attrs.update(attrs_dict[var])
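set_attrs is a thin helper; a self-contained sketch of its effect on a toy dataset (xarray only, outside the diff):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset(
        {"N": ("time", np.array([3, 12]))},
        coords={"time": np.array(["2023-01-01T00:00:00", "2023-01-01T00:01:00"], dtype="M8[s]")},
    )
    attrs_dict = {"N": {"long_name": "Number of detected drops"}}

    for var in attrs_dict:  # same loop as set_attrs
        if var in ds:
            ds[var].attrs.update(attrs_dict[var])
    print(ds["N"].attrs["long_name"])  # Number of detected drops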
@@ -37,104 +48,13 @@ def set_attrs(ds, attrs_dict):
  #### Coordinates attributes


- def get_coords_attrs_dict():
-     """Return dictionary with DISDRODB coordinates attributes."""
-     attrs_dict = {}
-     # Define diameter attributes
-     attrs_dict["diameter_bin_center"] = {
-         "name": "diameter_bin_center",
-         "standard_name": "diameter_bin_center",
-         "long_name": "diameter_bin_center",
-         "units": "mm",
-         "description": "Bin center drop diameter value",
-     }
-     attrs_dict["diameter_bin_width"] = {
-         "name": "diameter_bin_width",
-         "standard_name": "diameter_bin_width",
-         "long_name": "diameter_bin_width",
-         "units": "mm",
-         "description": "Drop diameter bin width",
-     }
-     attrs_dict["diameter_bin_upper"] = {
-         "name": "diameter_bin_upper",
-         "standard_name": "diameter_bin_upper",
-         "long_name": "diameter_bin_upper",
-         "units": "mm",
-         "description": "Bin upper bound drop diameter value",
-     }
-     attrs_dict["velocity_bin_lower"] = {
-         "name": "velocity_bin_lower",
-         "standard_name": "velocity_bin_lower",
-         "long_name": "velocity_bin_lower",
-         "units": "mm",
-         "description": "Bin lower bound drop diameter value",
-     }
-     # Define velocity attributes
-     attrs_dict["velocity_bin_center"] = {
-         "name": "velocity_bin_center",
-         "standard_name": "velocity_bin_center",
-         "long_name": "velocity_bin_center",
-         "units": "m/s",
-         "description": "Bin center drop fall velocity value",
-     }
-     attrs_dict["velocity_bin_width"] = {
-         "name": "velocity_bin_width",
-         "standard_name": "velocity_bin_width",
-         "long_name": "velocity_bin_width",
-         "units": "m/s",
-         "description": "Drop fall velocity bin width",
-     }
-     attrs_dict["velocity_bin_upper"] = {
-         "name": "velocity_bin_upper",
-         "standard_name": "velocity_bin_upper",
-         "long_name": "velocity_bin_upper",
-         "units": "m/s",
-         "description": "Bin upper bound drop fall velocity value",
-     }
-     attrs_dict["velocity_bin_lower"] = {
-         "name": "velocity_bin_lower",
-         "standard_name": "velocity_bin_lower",
-         "long_name": "velocity_bin_lower",
-         "units": "m/s",
-         "description": "Bin lower bound drop fall velocity value",
-     }
-     # Define geolocation attributes
-     attrs_dict["latitude"] = {
-         "name": "latitude",
-         "standard_name": "latitude",
-         "long_name": "Latitude",
-         "units": "degrees_north",
-     }
-     attrs_dict["longitude"] = {
-         "name": "longitude",
-         "standard_name": "longitude",
-         "long_name": "Longitude",
-         "units": "degrees_east",
-     }
-     attrs_dict["altitude"] = {
-         "name": "altitude",
-         "standard_name": "altitude",
-         "long_name": "Altitude",
-         "units": "m",
-         "description": "Elevation above sea level",
-     }
-     # Define time attributes
-     attrs_dict["time"] = {
-         "name": "time",
-         "standard_name": "time",
-         "long_name": "time",
-         "description": "UTC Time",
-     }
-
-     return attrs_dict
-
-
  def set_coordinate_attributes(ds):
      """Set coordinates attributes."""
      # Get attributes dictionary
-     attrs_dict = get_coords_attrs_dict()
+     attrs_dict = get_attrs_dict()
+     coords_dict = {coord: attrs_dict[coord] for coord in COORDINATES if coord in attrs_dict}
      # Set attributes
-     ds = set_attrs(ds, attrs_dict)
+     ds = set_attrs(ds, coords_dict)
      return ds


@@ -142,14 +62,14 @@ def set_coordinate_attributes(ds):
  #### DISDRODB Global Attributes


- def set_disdrodb_attrs(ds, product: str):
+ def update_disdrodb_attrs(ds, product: str):
      """Add DISDRODB processing information to the netCDF global attributes.

      It assumes stations metadata are already added the dataset.

      Parameters
      ----------
-     ds : xarray.Dataset
+     ds : xarray dataset.
          Dataset
      product: str
          DISDRODB product.
@@ -159,30 +79,53 @@
      Returns
      -------
      xarray dataset
          Dataset.
      """
-     # Add dataset conventions
-     ds.attrs["Conventions"] = CONVENTIONS
-
-     # Add featureType
-     if "platform_type" in ds.attrs:
-         platform_type = ds.attrs["platform_type"]
-         if platform_type == "fixed":
-             ds.attrs["featureType"] = "timeSeries"
-         else:
-             ds.attrs["featureType"] = "trajectory"
+     attrs = ds.attrs.copy()
+
+     # ----------------------------------------------
+     # Drop metadata not relevant for DISDRODB products
+     keys_to_drop = [
+         "disdrodb_reader",
+         "disdrodb_data_url",
+         "raw_data_glob_pattern",
+         "raw_data_format",
+     ]
+     for key in keys_to_drop:
+         _ = attrs.pop(key, None)
+
+     # ----------------------------------------------
+     # Add time_coverage_start and time_coverage_end
+     if "time" in ds.dims:
+         attrs["time_coverage_start"] = str(ds["time"].data[0])
+         attrs["time_coverage_end"] = str(ds["time"].data[-1])

-     # Update DISDRODDB attributes
-     ds = update_disdrodb_attrs(ds=ds, product=product)
+     # ----------------------------------------------
+     # Set DISDRODB attributes
+     # - Add DISDRODB processing info
+     now = datetime.datetime.utcnow()
+     current_time = now.strftime("%Y-%m-%d %H:%M:%S")
+     attrs["disdrodb_processing_date"] = current_time
+     # - Add DISDRODB product and version
+     attrs["disdrodb_product_version"] = ARCHIVE_VERSION
+     attrs["disdrodb_software_version"] = SOFTWARE_VERSION
+     attrs["disdrodb_product"] = product
+
+     # ----------------------------------------------
+     # Finalize attributes dictionary
+     # - Sort attributes alphabetically
+     attrs = dict(sorted(attrs.items()))
+     # - Set attributes
+     ds.attrs = attrs
      return ds


- def update_disdrodb_attrs(ds, product: str):
+ def set_disdrodb_attrs(ds, product: str):
      """Add DISDRODB processing information to the netCDF global attributes.

      It assumes stations metadata are already added the dataset.

      Parameters
      ----------
-     ds : xarray dataset.
+     ds : xarray.Dataset
          Dataset
      product: str
          DISDRODB product.
@@ -192,17 +135,17 @@ def update_disdrodb_attrs(ds, product: str):
      Returns
      -------
      xarray dataset
          Dataset.
      """
-     # Add time_coverage_start and time_coverage_end
-     ds.attrs["time_coverage_start"] = str(ds["time"].data[0])
-     ds.attrs["time_coverage_end"] = str(ds["time"].data[-1])
+     # Add dataset conventions
+     ds.attrs["Conventions"] = CONVENTIONS

-     # DISDRODDB attributes
-     # - Add DISDRODB processing info
-     now = datetime.datetime.utcnow()
-     current_time = now.strftime("%Y-%m-%d %H:%M:%S")
-     ds.attrs["disdrodb_processing_date"] = current_time
-     # - Add DISDRODB product and version
-     ds.attrs["disdrodb_product_version"] = ARCHIVE_VERSION
-     ds.attrs["disdrodb_software_version"] = SOFTWARE_VERSION
-     ds.attrs["disdrodb_product"] = product
+     # Add featureType
+     if "platform_type" in ds.attrs:
+         platform_type = ds.attrs["platform_type"]
+         if platform_type == "fixed":
+             ds.attrs["featureType"] = "timeSeries"
+         else:
+             ds.attrs["featureType"] = "trajectory"
+
+     # Update DISDRODB attributes
+     ds = update_disdrodb_attrs(ds=ds, product=product)
      return ds
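To see what the reshuffled update_disdrodb_attrs now does to the global attributes, a self-contained sketch that mirrors its stamping and sorting logic on a plain dict (toy metadata, not the package API):

    import datetime

    attrs = {"station_name": "EXAMPLE", "disdrodb_reader": "SOME_READER"}  # toy metadata

    # Drop reader-specific metadata, as in update_disdrodb_attrs
    for key in ["disdrodb_reader", "disdrodb_data_url", "raw_data_glob_pattern", "raw_data_format"]:
        attrs.pop(key, None)

    # Stamp processing info and sort attributes alphabetically
    attrs["disdrodb_processing_date"] = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
    attrs["disdrodb_product"] = "L2E"
    attrs = dict(sorted(attrs.items()))
    print(list(attrs))  # ['disdrodb_processing_date', 'disdrodb_product', 'station_name']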