disdrodb 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. disdrodb/__init__.py +68 -34
  2. disdrodb/_config.py +5 -4
  3. disdrodb/_version.py +16 -3
  4. disdrodb/accessor/__init__.py +20 -0
  5. disdrodb/accessor/methods.py +125 -0
  6. disdrodb/api/checks.py +177 -24
  7. disdrodb/api/configs.py +3 -3
  8. disdrodb/api/info.py +13 -13
  9. disdrodb/api/io.py +281 -22
  10. disdrodb/api/path.py +184 -195
  11. disdrodb/api/search.py +18 -9
  12. disdrodb/cli/disdrodb_create_summary.py +103 -0
  13. disdrodb/cli/disdrodb_create_summary_station.py +91 -0
  14. disdrodb/cli/disdrodb_run_l0.py +1 -1
  15. disdrodb/cli/disdrodb_run_l0_station.py +1 -1
  16. disdrodb/cli/disdrodb_run_l0a_station.py +1 -1
  17. disdrodb/cli/disdrodb_run_l0b.py +1 -1
  18. disdrodb/cli/disdrodb_run_l0b_station.py +3 -3
  19. disdrodb/cli/disdrodb_run_l0c.py +1 -1
  20. disdrodb/cli/disdrodb_run_l0c_station.py +3 -3
  21. disdrodb/cli/disdrodb_run_l1_station.py +2 -2
  22. disdrodb/cli/disdrodb_run_l2e_station.py +2 -2
  23. disdrodb/cli/disdrodb_run_l2m_station.py +2 -2
  24. disdrodb/configs.py +149 -4
  25. disdrodb/constants.py +61 -0
  26. disdrodb/data_transfer/download_data.py +127 -11
  27. disdrodb/etc/configs/attributes.yaml +339 -0
  28. disdrodb/etc/configs/encodings.yaml +473 -0
  29. disdrodb/etc/products/L1/global.yaml +13 -0
  30. disdrodb/etc/products/L2E/10MIN.yaml +12 -0
  31. disdrodb/etc/products/L2E/1MIN.yaml +1 -0
  32. disdrodb/etc/products/L2E/global.yaml +22 -0
  33. disdrodb/etc/products/L2M/10MIN.yaml +12 -0
  34. disdrodb/etc/products/L2M/GAMMA_ML.yaml +8 -0
  35. disdrodb/etc/products/L2M/NGAMMA_GS_LOG_ND_MAE.yaml +6 -0
  36. disdrodb/etc/products/L2M/NGAMMA_GS_ND_MAE.yaml +6 -0
  37. disdrodb/etc/products/L2M/NGAMMA_GS_Z_MAE.yaml +6 -0
  38. disdrodb/etc/products/L2M/global.yaml +26 -0
  39. disdrodb/issue/writer.py +2 -0
  40. disdrodb/l0/__init__.py +13 -0
  41. disdrodb/l0/configs/LPM/l0b_cf_attrs.yml +4 -4
  42. disdrodb/l0/configs/PARSIVEL/l0b_cf_attrs.yml +1 -1
  43. disdrodb/l0/configs/PARSIVEL/l0b_encodings.yml +3 -3
  44. disdrodb/l0/configs/PARSIVEL/raw_data_format.yml +1 -1
  45. disdrodb/l0/configs/PARSIVEL2/l0b_cf_attrs.yml +5 -5
  46. disdrodb/l0/configs/PARSIVEL2/l0b_encodings.yml +3 -3
  47. disdrodb/l0/configs/PARSIVEL2/raw_data_format.yml +1 -1
  48. disdrodb/l0/configs/PWS100/l0b_cf_attrs.yml +4 -4
  49. disdrodb/l0/configs/PWS100/raw_data_format.yml +1 -1
  50. disdrodb/l0/l0a_processing.py +37 -32
  51. disdrodb/l0/l0b_nc_processing.py +118 -8
  52. disdrodb/l0/l0b_processing.py +30 -65
  53. disdrodb/l0/l0c_processing.py +369 -259
  54. disdrodb/l0/readers/LPM/ARM/ARM_LPM.py +7 -0
  55. disdrodb/l0/readers/LPM/NETHERLANDS/DELFT_LPM_NC.py +66 -0
  56. disdrodb/l0/readers/LPM/SLOVENIA/{CRNI_VRH.py → UL.py} +3 -0
  57. disdrodb/l0/readers/LPM/SWITZERLAND/INNERERIZ_LPM.py +195 -0
  58. disdrodb/l0/readers/PARSIVEL/GPM/PIERS.py +0 -2
  59. disdrodb/l0/readers/PARSIVEL/JAPAN/JMA.py +4 -1
  60. disdrodb/l0/readers/PARSIVEL/NCAR/PECAN_MOBILE.py +1 -1
  61. disdrodb/l0/readers/PARSIVEL/NCAR/VORTEX2_2009.py +1 -1
  62. disdrodb/l0/readers/PARSIVEL2/ARM/ARM_PARSIVEL2.py +4 -0
  63. disdrodb/l0/readers/PARSIVEL2/BELGIUM/ILVO.py +168 -0
  64. disdrodb/l0/readers/PARSIVEL2/CANADA/UQAM_NC.py +69 -0
  65. disdrodb/l0/readers/PARSIVEL2/DENMARK/DTU.py +165 -0
  66. disdrodb/l0/readers/PARSIVEL2/FINLAND/FMI_PARSIVEL2.py +69 -0
  67. disdrodb/l0/readers/PARSIVEL2/FRANCE/ENPC_PARSIVEL2.py +255 -134
  68. disdrodb/l0/readers/PARSIVEL2/FRANCE/OSUG.py +525 -0
  69. disdrodb/l0/readers/PARSIVEL2/FRANCE/SIRTA_PARSIVEL2.py +1 -1
  70. disdrodb/l0/readers/PARSIVEL2/GPM/GCPEX.py +9 -7
  71. disdrodb/l0/readers/PARSIVEL2/KIT/BURKINA_FASO.py +1 -1
  72. disdrodb/l0/readers/PARSIVEL2/KIT/TEAMX.py +123 -0
  73. disdrodb/l0/readers/PARSIVEL2/{NETHERLANDS/DELFT.py → MPI/BCO_PARSIVEL2.py} +41 -71
  74. disdrodb/l0/readers/PARSIVEL2/MPI/BOWTIE.py +220 -0
  75. disdrodb/l0/readers/PARSIVEL2/NASA/APU.py +120 -0
  76. disdrodb/l0/readers/PARSIVEL2/NASA/LPVEX.py +109 -0
  77. disdrodb/l0/readers/PARSIVEL2/NCAR/FARM_PARSIVEL2.py +1 -0
  78. disdrodb/l0/readers/PARSIVEL2/NCAR/PECAN_FP3.py +1 -1
  79. disdrodb/l0/readers/PARSIVEL2/NCAR/PERILS_MIPS.py +126 -0
  80. disdrodb/l0/readers/PARSIVEL2/NCAR/PERILS_PIPS.py +165 -0
  81. disdrodb/l0/readers/PARSIVEL2/NCAR/VORTEX_SE_2016_P2.py +1 -1
  82. disdrodb/l0/readers/PARSIVEL2/NCAR/VORTEX_SE_2016_PIPS.py +20 -12
  83. disdrodb/l0/readers/PARSIVEL2/NETHERLANDS/DELFT_NC.py +5 -0
  84. disdrodb/l0/readers/PARSIVEL2/SPAIN/CENER.py +144 -0
  85. disdrodb/l0/readers/PARSIVEL2/SPAIN/CR1000DL.py +201 -0
  86. disdrodb/l0/readers/PARSIVEL2/SPAIN/LIAISE.py +137 -0
  87. disdrodb/l0/readers/PARSIVEL2/USA/C3WE.py +146 -0
  88. disdrodb/l0/readers/PWS100/FRANCE/ENPC_PWS100.py +105 -99
  89. disdrodb/l0/readers/PWS100/FRANCE/ENPC_PWS100_SIRTA.py +151 -0
  90. disdrodb/l1/__init__.py +5 -0
  91. disdrodb/l1/fall_velocity.py +46 -0
  92. disdrodb/l1/filters.py +34 -20
  93. disdrodb/l1/processing.py +46 -45
  94. disdrodb/l1/resampling.py +77 -66
  95. disdrodb/l1_env/routines.py +18 -3
  96. disdrodb/l2/__init__.py +7 -0
  97. disdrodb/l2/empirical_dsd.py +58 -10
  98. disdrodb/l2/processing.py +268 -117
  99. disdrodb/metadata/checks.py +132 -125
  100. disdrodb/metadata/standards.py +3 -1
  101. disdrodb/psd/fitting.py +631 -345
  102. disdrodb/psd/models.py +9 -6
  103. disdrodb/routines/__init__.py +54 -0
  104. disdrodb/{l0/routines.py → routines/l0.py} +316 -355
  105. disdrodb/{l1/routines.py → routines/l1.py} +76 -116
  106. disdrodb/routines/l2.py +1019 -0
  107. disdrodb/{routines.py → routines/wrappers.py} +98 -10
  108. disdrodb/scattering/__init__.py +16 -4
  109. disdrodb/scattering/axis_ratio.py +61 -37
  110. disdrodb/scattering/permittivity.py +504 -0
  111. disdrodb/scattering/routines.py +746 -184
  112. disdrodb/summary/__init__.py +17 -0
  113. disdrodb/summary/routines.py +4196 -0
  114. disdrodb/utils/archiving.py +434 -0
  115. disdrodb/utils/attrs.py +68 -125
  116. disdrodb/utils/cli.py +5 -5
  117. disdrodb/utils/compression.py +30 -1
  118. disdrodb/utils/dask.py +121 -9
  119. disdrodb/utils/dataframe.py +61 -7
  120. disdrodb/utils/decorators.py +31 -0
  121. disdrodb/utils/directories.py +35 -15
  122. disdrodb/utils/encoding.py +37 -19
  123. disdrodb/{l2 → utils}/event.py +15 -173
  124. disdrodb/utils/logger.py +14 -7
  125. disdrodb/utils/manipulations.py +81 -0
  126. disdrodb/utils/routines.py +166 -0
  127. disdrodb/utils/subsetting.py +214 -0
  128. disdrodb/utils/time.py +35 -177
  129. disdrodb/utils/writer.py +20 -7
  130. disdrodb/utils/xarray.py +5 -4
  131. disdrodb/viz/__init__.py +13 -0
  132. disdrodb/viz/plots.py +398 -0
  133. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/METADATA +4 -3
  134. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/RECORD +139 -98
  135. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/entry_points.txt +2 -0
  136. disdrodb/l1/encoding_attrs.py +0 -642
  137. disdrodb/l2/processing_options.py +0 -213
  138. disdrodb/l2/routines.py +0 -868
  139. /disdrodb/l0/readers/PARSIVEL/SLOVENIA/{UL_FGG.py → UL.py} +0 -0
  140. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/WHEEL +0 -0
  141. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/licenses/LICENSE +0 -0
  142. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/top_level.txt +0 -0
disdrodb/utils/cli.py CHANGED
@@ -21,7 +21,7 @@
  import click


- def _execute_cmd(cmd, raise_error=False):
+ def execute_cmd(cmd, raise_error=False):
  """Execute command in the terminal, streaming output in python console."""
  from subprocess import PIPE, CalledProcessError, Popen

@@ -34,7 +34,7 @@ def _execute_cmd(cmd, raise_error=False):
  raise CalledProcessError(p.returncode, p.args)


- def _parse_empty_string_and_none(args):
+ def parse_empty_string_and_none(args):
  """Utility to parse argument passed from the command line.

  If ``args = ''``, returns None.
@@ -58,7 +58,7 @@ def parse_arg_to_list(args):
  If ``args = 'variable1 variable2'`` returns ``[variable1, variable2]``.
  """
  # If '' or 'None' --> Set to None
- args = _parse_empty_string_and_none(args)
+ args = parse_empty_string_and_none(args)
  # - If multiple arguments, split by space
  if isinstance(args, str):
  # - Split by space
@@ -75,7 +75,7 @@ def parse_archive_dir(archive_dir: str):
  If ``archive_dir = ''`` returns ``None``.
  """
  # If '', set to 'None'
- return _parse_empty_string_and_none(archive_dir)
+ return parse_empty_string_and_none(archive_dir)


  def click_station_arguments(function: object):
@@ -86,7 +86,7 @@ def click_station_arguments(function: object):
  function : object
  Function.
  """
- function = click.argument("station_name", metavar="<station>")(function)
+ function = click.argument("station_name", metavar="<STATION_NAME>")(function)
  function = click.argument("campaign_name", metavar="<CAMPAIGN_NAME>")(function)
  function = click.argument("data_source", metavar="<DATA_SOURCE>")(function)
  return function
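As a quick illustration (values inferred from the docstrings above, not verified against the released wheel), the renamed helpers can now be imported directly:

from disdrodb.utils.cli import parse_arg_to_list, parse_empty_string_and_none

parse_empty_string_and_none("")               # -> None
parse_empty_string_and_none("None")           # -> None
parse_arg_to_list("variable1 variable2")      # -> ["variable1", "variable2"]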
disdrodb/utils/compression.py CHANGED
@@ -22,6 +22,7 @@ import bz2
  import gzip
  import os
  import shutil
+ import subprocess
  import tempfile
  import zipfile
  from typing import Optional
@@ -53,6 +54,34 @@ def unzip_file(filepath: str, dest_path: str) -> None:
  zip_ref.extractall(dest_path)


+ def unzip_file_on_terminal(filepath: str, dest_path: str) -> str:
+ """Unzip a file into a directory using the terminal command.
+
+ Parameters
+ ----------
+ filepath : str
+ Path of the file to unzip.
+ dest_path : str
+ Path of the destination directory.
+ """
+ os.makedirs(dest_path, exist_ok=True)
+
+ if os.name == "nt":
+ # Windows: use PowerShell Expand-Archive
+ cmd = [
+ "powershell.exe",
+ "-NoProfile",
+ "-NonInteractive",
+ "-Command",
+ f"Expand-Archive -LiteralPath '{filepath}' -DestinationPath '{dest_path}' -Force",
+ ]
+ else:
+ # macOS/Linux: use unzip
+ cmd = ["unzip", "-q", filepath, "-d", dest_path]
+
+ subprocess.run(cmd, check=True)
+
+
  def _zip_dir(dir_path: str) -> str:
  """Zip a directory into a file located in the same directory.

@@ -157,7 +186,7 @@ def compress_station_files(
  raise ValueError(f"Station data directory {station_dir} does not exist.")

  # Get list of files inside the station directory (in all nested directories)
- filepaths = list_files(station_dir, glob_pattern="*", recursive=True)
+ filepaths = list_files(station_dir, recursive=True)
  for filepath in filepaths:
  _ = _compress_file(filepath, method, skip=skip)
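A minimal usage sketch of the new helper (the archive path is hypothetical): on Windows it shells out to PowerShell's Expand-Archive, on macOS/Linux to unzip -q, creating the destination directory first:

from disdrodb.utils.compression import unzip_file_on_terminal

# Extract /tmp/archive.zip into /tmp/data using the system unzip tool
unzip_file_on_terminal("/tmp/archive.zip", "/tmp/data")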
disdrodb/utils/dask.py CHANGED
@@ -16,31 +16,82 @@
  # You should have received a copy of the GNU General Public License
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
  # -----------------------------------------------------------------------------.
- """Utilities for Dask Distributed computations."""
+ """Utilities for Dask Distributed Computations."""
  import logging
  import os

+ import numpy as np

- def initialize_dask_cluster():
+
+ def check_parallel_validity(parallel):
+ """Check validity of parallel option given Dask settings."""
+ import dask
+
+ scheduler = dask.config.get("scheduler", None)
+ if scheduler is None:
+ return parallel
+ if scheduler in ["synchronous", "threads"]:
+ return False
+ if scheduler == "distributed":
+ from dask.distributed import default_client
+
+ client = default_client()
+ info = client.scheduler_info()
+
+ # If ThreadWorker, only 1 pid
+ pids = list(client.run(os.getpid).values())
+ if len(np.unique(pids)) == 1:
+ return False
+
+ # If ProcessWorker
+ # - Check single thread per worker to avoid locks
+ nthreads_per_process = np.array([v["nthreads"] for v in info["workers"].values()])
+ if not np.all(nthreads_per_process == 1):
+ print(
+ "To open netCDFs in parallel with dask distributed (processes=True), please set threads_per_worker=1 !",
+ )
+ return False
+
+ # Otherwise let the user choose
+ return parallel
+
+
+ def initialize_dask_cluster(minimum_memory=None):
  """Initialize Dask Cluster."""
  import dask
+ import psutil
+
+ # Silence dask warnings
+ # dask.config.set({"logging.distributed": "error"})
+ # Import dask.distributed after setting the config
  from dask.distributed import Client, LocalCluster
+ from dask.utils import parse_bytes

  # Set HDF5_USE_FILE_LOCKING to avoid going stuck with HDF
  os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
- # Retrieve the number of process to run
- available_workers = os.cpu_count() - 2 # if not set, all CPUs
+
+ # Retrieve the number of processes to run
+ available_workers = os.cpu_count() - 2 # if not set, all CPUs minus 2
  num_workers = dask.config.get("num_workers", available_workers)
- # Silence dask warnings
- dask.config.set({"logging.distributed": "error"})
- # dask.config.set({"distributed.admin.system-monitor.gil.enabled": False})
+
+ # If memory limit specified, ensure correct amount of workers
+ if minimum_memory is not None:
+ # Compute available memory (in bytes)
+ total_memory = psutil.virtual_memory().total
+ # Get minimum memory per worker (in bytes)
+ minimum_memory = parse_bytes(minimum_memory)
+ # Determine number of workers constrained by memory
+ maximum_workers_allowed = max(1, total_memory // minimum_memory)
+ # Respect both CPU and memory requirements
+ num_workers = min(maximum_workers_allowed, num_workers)
+
  # Create dask.distributed local cluster
  cluster = LocalCluster(
  n_workers=num_workers,
  threads_per_worker=1,
  processes=True,
- # memory_limit='8GB',
- # silence_logs=False,
+ memory_limit=0, # this avoid flexible dask memory management
+ silence_logs=logging.ERROR,
  )
  client = Client(cluster)
  return cluster, client
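To illustrate the new minimum_memory argument with hypothetical numbers: on a machine with 64 GB of RAM and 16 CPUs, minimum_memory="8GB" caps the cluster at 64 // 8 = 8 workers, below the CPU-based default of 16 - 2 = 14. A usage sketch:

from disdrodb.utils.dask import close_dask_cluster, initialize_dask_cluster

# Limit the number of workers so each one has roughly 8 GB of RAM available
cluster, client = initialize_dask_cluster(minimum_memory="8GB")
try:
    ...  # run DISDRODB processing here
finally:
    close_dask_cluster(cluster, client)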
@@ -60,3 +111,64 @@ def close_dask_cluster(cluster, client):
  finally:
  # Restore the original log level
  logger.setLevel(original_level)
+
+
+ def execute_tasks_safely(list_tasks, parallel: bool, logs_dir: str):
+ """
+ Execute Dask tasks and skip failed ones.
+
+ Parameters
+ ----------
+ list_tasks : list
+ List of dask delayed objects or results.
+ parallel : bool
+ Whether to execute in parallel with Dask or not.
+ logs_dir : str
+ Directory to store FAILED_TASKS.log.
+
+ Returns
+ -------
+ list_logs : list
+ List of task results. For failed tasks, adds the path
+ to FAILED_TASKS.log in place of the result.
+ """
+ from dask.distributed import get_client
+
+ # Ensure logs_dir exists
+ os.makedirs(logs_dir, exist_ok=True)
+
+ # Define file name where to log failed dask tasks
+ failed_log_path = os.path.join(logs_dir, "FAILED_DASK_TASKS.log")
+
+ if not parallel:
+ # Non-parallel mode: just return results directly
+ return list_tasks
+
+ # Ensure we have a Dask client
+ try:
+ client = get_client()
+ except ValueError:
+ raise ValueError("No Dask Distributed Client found.")
+
+ # Compute tasks (all concurrently)
+ # - Runs tasks == num_workers * threads_per_worker (which is 1 for DISDRODB)
+ # - If errors occurs in some, skip it
+ futures = client.compute(list_tasks)
+ results = client.gather(futures, errors="skip")
+
+ # Collect failed futures
+ failed_futures = [f for f in futures if f.status != "finished"] # "error"
+
+ # If no tasks failed, return results
+ if not failed_futures:
+ return results
+
+ # Otherwise define log file listing failed tasks
+ with open(failed_log_path, "w") as f:
+ for fut in failed_futures:
+ err = fut.exception()
+ f.write(f"ERROR - DASK TASK FAILURE - Task {fut.key} failed: {err}\n")
+
+ # Append to list of log filepaths (results) the dask failing log
+ results.append(failed_log_path)
+ return results
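A sketch of how execute_tasks_safely might be used with delayed tasks (process_file, filepaths and the logs directory are hypothetical placeholders): failed tasks are skipped and their errors are written to <logs_dir>/FAILED_DASK_TASKS.log, whose path is appended to the returned list of results:

import dask

from disdrodb.utils.dask import execute_tasks_safely

list_tasks = [dask.delayed(process_file)(filepath) for filepath in filepaths]
results = execute_tasks_safely(list_tasks, parallel=True, logs_dir="/tmp/DISDRODB/logs")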
disdrodb/utils/dataframe.py CHANGED
@@ -20,6 +20,8 @@
  import numpy as np
  import pandas as pd

+ from disdrodb.utils.warnings import suppress_warnings
+

  def log_arange(start, stop, log_step=0.1, base=10):
  """
@@ -47,7 +49,39 @@ def log_arange(start, stop, log_step=0.1, base=10):
  log_start = np.log(start) / np.log(base)
  log_stop = np.log(stop) / np.log(base)

- log_values = np.arange(log_start, log_stop, log_step)
+ log_values = np.arange(log_start, log_stop + log_step / 2, log_step)
+ return base**log_values
+
+
+ def log_linspace(start, stop, n_bins, base=10):
+ """
+ Return numbers spaced evenly on a log scale between start and stop.
+
+ Parameters
+ ----------
+ start : float
+ The starting value of the sequence (must be > 0).
+ stop : float
+ The end value of the sequence (must be > 0).
+ n_bins : int
+ The number of points to generate (including start and stop).
+ base : float
+ The logarithmic base (default is 10).
+
+ Returns
+ -------
+ np.ndarray
+ Array of values spaced evenly in log space.
+ """
+ if start <= 0 or stop <= 0:
+ raise ValueError("Both start and stop must be > 0 for log spacing.")
+ if n_bins < 2:
+ raise ValueError("n_bins must be >= 2 to include start and stop values.")
+
+ log_start = np.log(start) / np.log(base)
+ log_stop = np.log(stop) / np.log(base)
+
+ log_values = np.linspace(log_start, log_stop, n_bins)
  return base**log_values


@@ -100,6 +134,9 @@ def compute_1d_histogram(df, column, variables=None, bins=10, labels=None, prefi
  if len(df) == 0:
  raise ValueError("No valid data points after removing NaN values")

+ # Keep only data within bin range
+ df = df[(df[column] >= bins[0]) & (df[column] < bins[-1])]
+
  # Create binned columns with explicit handling of out-of-bounds values
  df[f"{column}_binned"] = pd.cut(df[column], bins=bins, include_lowest=True)

@@ -134,7 +171,7 @@
  (f"{prefix}std", "std"),
  (f"{prefix}min", "min"),
  (f"{prefix}max", "max"),
- (f"{prefix}mad", lambda s: np.median(np.abs(s - np.median(s)))),
+ (f"{prefix}mad", lambda s: (s - s.median()).abs().median()),
  ]
  if i == 0:
  list_stats.append(("count", "count"))
@@ -142,7 +179,8 @@
  list_stats = [("count", "count")]

  # Compute statistics
- df_stats = df_grouped[var].agg(list_stats)
+ with suppress_warnings():
+ df_stats = df_grouped[var].agg(list_stats)

  # Compute other variable statistics
  if variables_specified:
@@ -253,8 +291,18 @@ def compute_2d_histogram(
  raise ValueError("No valid data points after removing NaN values")

  # Create binned columns with explicit handling of out-of-bounds values
- df[f"{x}_binned"] = pd.cut(df[x], bins=x_bins, include_lowest=True)
- df[f"{y}_binned"] = pd.cut(df[y], bins=y_bins, include_lowest=True)
+ df[f"{x}_binned"] = pd.cut(
+ df[x],
+ bins=pd.IntervalIndex.from_breaks(x_bins, closed="right"),
+ include_lowest=True,
+ ordered=True,
+ )
+ df[f"{y}_binned"] = pd.cut(
+ df[y],
+ bins=pd.IntervalIndex.from_breaks(y_bins, closed="right"),
+ include_lowest=True,
+ ordered=True,
+ )

  # Create complete IntervalIndex for both dimensions
  x_intervals = df[f"{x}_binned"].cat.categories
@@ -318,8 +366,8 @@
  df_stats = df_stats.reindex(full_index)

  # Determine coordinates
- x_centers = x_intervals.mid
- y_centers = y_intervals.mid
+ x_centers = np.array(x_intervals.mid)
+ y_centers = np.array(y_intervals.mid)

  # Use provided labels if available
  x_coords = x_labels if x_labels is not None else x_centers
@@ -337,6 +385,12 @@
  # Convert to dataset
  ds = df_stats.to_xarray()

+ # Convert Categorical coordinates to float if possible
+ if np.issubdtype(x_coords.dtype, np.number):
+ ds[f"{x}"] = ds[f"{x}"].astype(float)
+ if np.issubdtype(y_coords.dtype, np.number):
+ ds[f"{y}"] = ds[f"{y}"].astype(float)
+

  # Transpose arrays
  ds = ds.transpose(y, x)
  return ds
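Two hand-checked examples of the log-spacing helpers (assuming they are imported from disdrodb.utils.dataframe, the module this hunk belongs to): log_arange now extends the stop bound by half a step so an endpoint lying exactly on the grid is kept, and log_linspace returns n_bins points including both ends:

from disdrodb.utils.dataframe import log_arange, log_linspace

log_arange(1, 100, log_step=1)     # ~ [1, 10, 100]   (100 is now included)
log_linspace(0.1, 1000, n_bins=5)  # ~ [0.1, 1, 10, 100, 1000]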
disdrodb/utils/decorators.py CHANGED
@@ -19,10 +19,34 @@
  """DISDRODB decorators."""
  import functools
  import importlib
+ import uuid

  import dask


+ def create_dask_task_name(function_name: str, name=None) -> str | None:
+ """
+ Create a custom dask task name.
+
+ Parameters
+ ----------
+ function_name : str
+ Name of the function being delayed.
+ name : str, optional
+ Custom name for the task (e.g., filepath or ID).
+ If None, returns None so that Dask generates is own default name.
+
+ Returns
+ -------
+ str | None
+ Custom dask task name string if `name` is given,
+ otherwise None (use Dask's default naming).
+ """
+ if name is None:
+ return None
+ return f"{function_name}.{name}-{uuid.uuid4()}"
+
+
  def delayed_if_parallel(function):
  """Decorator to make the function delayed if its ``parallel`` argument is ``True``."""

@@ -34,6 +58,13 @@ def delayed_if_parallel(function):
  if parallel:
  # Enforce verbose to be False
  kwargs["verbose"] = False
+ # Define custom dask task name
+ if "logs_filename" in kwargs:
+ kwargs["dask_key_name"] = create_dask_task_name(
+ function_name=function.__name__,
+ name=kwargs["logs_filename"],
+ )
+
  # Define the delayed task
  result = dask.delayed(function)(*args, **kwargs)
  else:
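A small sketch of the task-naming behaviour added above (the function and log-file names are made up): with a name, the key combines the function name, the identifier and a uuid4 suffix, otherwise None lets Dask fall back to its default key:

from disdrodb.utils.decorators import create_dask_task_name

create_dask_task_name("generate_l0a", name="station_10.log")
# -> e.g. 'generate_l0a.station_10.log-<uuid4>'
create_dask_task_name("generate_l0a")
# -> None (Dask picks its default task key)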
disdrodb/utils/directories.py CHANGED
@@ -98,18 +98,29 @@ def _recursive_glob(dir_path, glob_pattern):
  return [str(path) for path in dir_path.rglob(glob_pattern)]


- def _list_paths(dir_path, glob_pattern, recursive=False):
+ def _is_hidden(path):
+ """Return True if any component of path is hidden."""
+ return any(part.startswith(".") for part in path.split(os.sep))
+
+
+ def _list_paths(dir_path, glob_pattern, recursive=False, skip_hidden=True):
  """Return a list of filepaths and directory paths based on a single glob pattern."""
  # If glob pattern has separators, disable recursive option
  if "/" in glob_pattern and "**" not in glob_pattern:
  recursive = False
  # Search paths
  if not recursive:
- return glob.glob(os.path.join(dir_path, glob_pattern))
- return _recursive_glob(dir_path, glob_pattern)
+ matches = glob.glob(os.path.join(dir_path, glob_pattern))
+ else:
+ matches = _recursive_glob(dir_path, glob_pattern)

+ # Filter out anything with a hidden component
+ if skip_hidden:
+ matches = [p for p in matches if not _is_hidden(os.path.relpath(p, dir_path))]
+ return matches

- def list_paths(dir_path, glob_pattern, recursive=False):
+
+ def list_paths(dir_path, glob_pattern, recursive=False, skip_hidden=True):
  """Return a list of filepaths and directory paths.

  This function accept also a list of glob patterns !
@@ -119,35 +130,41 @@ def list_paths(dir_path, glob_pattern, recursive=False):
  # Search path for specified glob patterns
  paths = flatten_list(
  [
- _list_paths(dir_path=dir_path, glob_pattern=glob_pattern, recursive=recursive)
+ _list_paths(dir_path=dir_path, glob_pattern=glob_pattern, recursive=recursive, skip_hidden=skip_hidden)
  for glob_pattern in glob_patterns
  ],
  )
  return paths


- def list_files(dir_path, glob_pattern, recursive=False):
+ def list_files(dir_path, glob_pattern="*", recursive=False, skip_hidden=True, return_paths=True):
  """Return a list of filepaths (exclude directory paths)."""
- paths = list_paths(dir_path, glob_pattern, recursive=recursive)
+ paths = list_paths(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden)
  filepaths = [f for f in paths if os.path.isfile(f)]
+ # If return_paths is False, return only files names
+ if not return_paths:
+ filepaths = [os.path.basename(f) for f in filepaths]
  return filepaths


- def list_directories(dir_path, glob_pattern, recursive=False):
+ def list_directories(dir_path, glob_pattern="*", recursive=False, skip_hidden=True, return_paths=True):
  """Return a list of directory paths (exclude file paths)."""
- paths = list_paths(dir_path, glob_pattern, recursive=recursive)
+ paths = list_paths(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden)
  dir_paths = [f for f in paths if os.path.isdir(f)]
+ # If return_paths is False, return only directory names
+ if not return_paths:
+ dir_paths = [os.path.basename(f) for f in dir_paths]
  return dir_paths


- def count_files(dir_path, glob_pattern, recursive=False):
+ def count_files(dir_path, glob_pattern="*", recursive=False, skip_hidden=True):
  """Return the number of files (exclude directories)."""
- return len(list_files(dir_path, glob_pattern, recursive=recursive))
+ return len(list_files(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden))


- def count_directories(dir_path, glob_pattern, recursive=False):
+ def count_directories(dir_path, glob_pattern="*", recursive=False, skip_hidden=True):
  """Return the number of files (exclude directories)."""
- return len(list_directories(dir_path, glob_pattern, recursive=recursive))
+ return len(list_directories(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden))


  def check_directory_exists(dir_path):
@@ -177,7 +194,7 @@ def create_required_directory(dir_path, dir_name, exist_ok=True):
  create_directory(path=new_dir_path, exist_ok=exist_ok)


- def is_empty_directory(path):
+ def is_empty_directory(path, skip_hidden=True):
  """Check if a directory path is empty.

  Return ``False`` if path is a file or non-empty directory.
@@ -187,8 +204,11 @@ def is_empty_directory(path):
  raise OSError(f"{path} does not exist.")
  if not os.path.isdir(path):
  return False
-
  paths = os.listdir(path)
+
+ # If skip_hidden is True, filter out hidden files/directories
+ if skip_hidden:
+ paths = [f for f in paths if not f.startswith(".")]
  return len(paths) == 0
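A usage sketch of the extended listing helpers (the directory path is hypothetical): glob_pattern now defaults to "*", hidden files and directories are skipped by default, and return_paths=False returns base names instead of full paths:

from disdrodb.utils.directories import list_files

filepaths = list_files("/data/DISDRODB/station_dir", recursive=True)
filenames = list_files("/data/DISDRODB/station_dir", recursive=True, return_paths=False)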
disdrodb/utils/encoding.py CHANGED
@@ -17,42 +17,59 @@
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
  # -----------------------------------------------------------------------------.
  """DISDRODB netCDF4 encoding utilities."""
+ import os
+
  import xarray as xr

+ from disdrodb.utils.yaml import read_yaml
+
  EPOCH = "seconds since 1970-01-01 00:00:00"


- def set_encodings(ds: xr.Dataset, encoding_dict: dict) -> xr.Dataset:
+ def get_encodings_dict():
+ """Get encoding dictionary for DISDRODB product variables and coordinates."""
+ import disdrodb
+
+ configs_path = os.path.join(disdrodb.__root_path__, "disdrodb", "etc", "configs")
+ encodings_dict = read_yaml(os.path.join(configs_path, "encodings.yaml"))
+ return encodings_dict
+
+
+ def set_encodings(ds: xr.Dataset, encodings_dict: dict) -> xr.Dataset:
  """Apply the encodings to the xarray Dataset.

  Parameters
  ----------
  ds : xarray.Dataset
  Input xarray dataset.
- encoding_dict : dict
- Dictionary with encoding specifications.
+ encodings_dict : dict
+ Dictionary with encodings specifications.

  Returns
  -------
  xarray.Dataset
  Output xarray dataset.
  """
+ # TODO: CHANGE CHUNKSIZES SPECIFICATION USING {<DIM>: <CHUNKSIZE>} INSTEAD OF LIST
+ # --> Then unwrap to list of chunksizes here
+
  # Subset encoding dictionary
- # - Here below encoding_dict contains only keys (variables) within the dataset
- encoding_dict = {var: encoding_dict[var] for var in ds.data_vars if var in encoding_dict}
+ # - Here below encodings_dict contains only keys (variables) within the dataset
+ encodings_dict = {var: encodings_dict[var] for var in ds.data_vars if var in encodings_dict}

  # Ensure chunksize smaller than the array shape
- encoding_dict = sanitize_encodings_dict(encoding_dict, ds)
+ encodings_dict = sanitize_encodings_dict(encodings_dict, ds)

  # Rechunk variables for fast writing !
  # - This pop the chunksize argument from the encoding dict !
- ds = rechunk_dataset(ds, encoding_dict)
+ ds = rechunk_dataset(ds, encodings_dict)

  # Set time encoding
- ds["time"].encoding.update(get_time_encoding())
+ if "time" in ds:
+ ds["time"].encoding.update(get_time_encoding())

  # Set the variable encodings
- for var, encoding in encoding_dict.items():
+ for var, encoding in encodings_dict.items():
  ds[var].encoding.update(encoding)

  # Ensure no deprecated "missing_value" attribute
@@ -63,12 +80,12 @@ def set_encodings(ds: xr.Dataset, encoding_dict: dict) -> xr.Dataset:
  return ds


- def sanitize_encodings_dict(encoding_dict: dict, ds: xr.Dataset) -> dict:
+ def sanitize_encodings_dict(encodings_dict: dict, ds: xr.Dataset) -> dict:
  """Ensure chunk size to be smaller than the array shape.

  Parameters
  ----------
- encoding_dict : dict
+ encodings_dict : dict
  Dictionary containing the variable encodings.
  ds : xarray.Dataset
  Input dataset.
@@ -79,23 +96,23 @@ def sanitize_encodings_dict(encoding_dict: dict, ds: xr.Dataset) -> dict:
  Encoding dictionary.
  """
  for var in ds.data_vars:
- if var in encoding_dict:
+ if var in encodings_dict:
  shape = ds[var].shape
- chunks = encoding_dict[var].get("chunksizes", None)
+ chunks = encodings_dict[var].get("chunksizes", None)
  if chunks is not None:
  chunks = [shape[i] if chunks[i] > shape[i] else chunks[i] for i in range(len(chunks))]
- encoding_dict[var]["chunksizes"] = chunks
- return encoding_dict
+ encodings_dict[var]["chunksizes"] = chunks
+ return encodings_dict


- def rechunk_dataset(ds: xr.Dataset, encoding_dict: dict) -> xr.Dataset:
+ def rechunk_dataset(ds: xr.Dataset, encodings_dict: dict) -> xr.Dataset:
  """Coerce the dataset arrays to have the chunk size specified in the encoding dictionary.

  Parameters
  ----------
  ds : xarray.Dataset
  Input xarray dataset
- encoding_dict : dict
+ encodings_dict : dict
  Dictionary containing the encoding to write the xarray dataset as a netCDF.

  Returns
@@ -104,12 +121,13 @@ def rechunk_dataset(ds: xr.Dataset, encoding_dict: dict) -> xr.Dataset:
  Output xarray dataset
  """
  for var in ds.data_vars:
- if var in encoding_dict:
- chunks = encoding_dict[var].pop("chunksizes", None)
+ if var in encodings_dict:
+ chunks = encodings_dict[var].get("chunksizes", None) # .pop("chunksizes", None)
  if chunks is not None:
  dims = list(ds[var].dims)
  chunks_dict = dict(zip(dims, chunks))
  ds[var] = ds[var].chunk(chunks_dict)
+ ds[var].encoding["chunksizes"] = chunks
  return ds
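A hedged sketch of how the new YAML-based encodings could be applied before writing a product (ds stands for any DISDRODB xarray.Dataset): get_encodings_dict reads disdrodb/etc/configs/encodings.yaml and set_encodings applies the matching per-variable encodings (e.g. chunksizes) to the dataset:

from disdrodb.utils.encoding import get_encodings_dict, set_encodings

encodings_dict = get_encodings_dict()
ds = set_encodings(ds, encodings_dict=encodings_dict)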