PyPI - disdrodb - Versions diffs - 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl - Mend

disdrodb 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (142) hide show

disdrodb/__init__.py +68 -34
disdrodb/_config.py +5 -4
disdrodb/_version.py +16 -3
disdrodb/accessor/__init__.py +20 -0
disdrodb/accessor/methods.py +125 -0
disdrodb/api/checks.py +177 -24
disdrodb/api/configs.py +3 -3
disdrodb/api/info.py +13 -13
disdrodb/api/io.py +281 -22
disdrodb/api/path.py +184 -195
disdrodb/api/search.py +18 -9
disdrodb/cli/disdrodb_create_summary.py +103 -0
disdrodb/cli/disdrodb_create_summary_station.py +91 -0
disdrodb/cli/disdrodb_run_l0.py +1 -1
disdrodb/cli/disdrodb_run_l0_station.py +1 -1
disdrodb/cli/disdrodb_run_l0a_station.py +1 -1
disdrodb/cli/disdrodb_run_l0b.py +1 -1
disdrodb/cli/disdrodb_run_l0b_station.py +3 -3
disdrodb/cli/disdrodb_run_l0c.py +1 -1
disdrodb/cli/disdrodb_run_l0c_station.py +3 -3
disdrodb/cli/disdrodb_run_l1_station.py +2 -2
disdrodb/cli/disdrodb_run_l2e_station.py +2 -2
disdrodb/cli/disdrodb_run_l2m_station.py +2 -2
disdrodb/configs.py +149 -4
disdrodb/constants.py +61 -0
disdrodb/data_transfer/download_data.py +127 -11
disdrodb/etc/configs/attributes.yaml +339 -0
disdrodb/etc/configs/encodings.yaml +473 -0
disdrodb/etc/products/L1/global.yaml +13 -0
disdrodb/etc/products/L2E/10MIN.yaml +12 -0
disdrodb/etc/products/L2E/1MIN.yaml +1 -0
disdrodb/etc/products/L2E/global.yaml +22 -0
disdrodb/etc/products/L2M/10MIN.yaml +12 -0
disdrodb/etc/products/L2M/GAMMA_ML.yaml +8 -0
disdrodb/etc/products/L2M/NGAMMA_GS_LOG_ND_MAE.yaml +6 -0
disdrodb/etc/products/L2M/NGAMMA_GS_ND_MAE.yaml +6 -0
disdrodb/etc/products/L2M/NGAMMA_GS_Z_MAE.yaml +6 -0
disdrodb/etc/products/L2M/global.yaml +26 -0
disdrodb/issue/writer.py +2 -0
disdrodb/l0/__init__.py +13 -0
disdrodb/l0/configs/LPM/l0b_cf_attrs.yml +4 -4
disdrodb/l0/configs/PARSIVEL/l0b_cf_attrs.yml +1 -1
disdrodb/l0/configs/PARSIVEL/l0b_encodings.yml +3 -3
disdrodb/l0/configs/PARSIVEL/raw_data_format.yml +1 -1
disdrodb/l0/configs/PARSIVEL2/l0b_cf_attrs.yml +5 -5
disdrodb/l0/configs/PARSIVEL2/l0b_encodings.yml +3 -3
disdrodb/l0/configs/PARSIVEL2/raw_data_format.yml +1 -1
disdrodb/l0/configs/PWS100/l0b_cf_attrs.yml +4 -4
disdrodb/l0/configs/PWS100/raw_data_format.yml +1 -1
disdrodb/l0/l0a_processing.py +37 -32
disdrodb/l0/l0b_nc_processing.py +118 -8
disdrodb/l0/l0b_processing.py +30 -65
disdrodb/l0/l0c_processing.py +369 -259
disdrodb/l0/readers/LPM/ARM/ARM_LPM.py +7 -0
disdrodb/l0/readers/LPM/NETHERLANDS/DELFT_LPM_NC.py +66 -0
disdrodb/l0/readers/LPM/SLOVENIA/{CRNI_VRH.py → UL.py} +3 -0
disdrodb/l0/readers/LPM/SWITZERLAND/INNERERIZ_LPM.py +195 -0
disdrodb/l0/readers/PARSIVEL/GPM/PIERS.py +0 -2
disdrodb/l0/readers/PARSIVEL/JAPAN/JMA.py +4 -1
disdrodb/l0/readers/PARSIVEL/NCAR/PECAN_MOBILE.py +1 -1
disdrodb/l0/readers/PARSIVEL/NCAR/VORTEX2_2009.py +1 -1
disdrodb/l0/readers/PARSIVEL2/ARM/ARM_PARSIVEL2.py +4 -0
disdrodb/l0/readers/PARSIVEL2/BELGIUM/ILVO.py +168 -0
disdrodb/l0/readers/PARSIVEL2/CANADA/UQAM_NC.py +69 -0
disdrodb/l0/readers/PARSIVEL2/DENMARK/DTU.py +165 -0
disdrodb/l0/readers/PARSIVEL2/FINLAND/FMI_PARSIVEL2.py +69 -0
disdrodb/l0/readers/PARSIVEL2/FRANCE/ENPC_PARSIVEL2.py +255 -134
disdrodb/l0/readers/PARSIVEL2/FRANCE/OSUG.py +525 -0
disdrodb/l0/readers/PARSIVEL2/FRANCE/SIRTA_PARSIVEL2.py +1 -1
disdrodb/l0/readers/PARSIVEL2/GPM/GCPEX.py +9 -7
disdrodb/l0/readers/PARSIVEL2/KIT/BURKINA_FASO.py +1 -1
disdrodb/l0/readers/PARSIVEL2/KIT/TEAMX.py +123 -0
disdrodb/l0/readers/PARSIVEL2/{NETHERLANDS/DELFT.py → MPI/BCO_PARSIVEL2.py} +41 -71
disdrodb/l0/readers/PARSIVEL2/MPI/BOWTIE.py +220 -0
disdrodb/l0/readers/PARSIVEL2/NASA/APU.py +120 -0
disdrodb/l0/readers/PARSIVEL2/NASA/LPVEX.py +109 -0
disdrodb/l0/readers/PARSIVEL2/NCAR/FARM_PARSIVEL2.py +1 -0
disdrodb/l0/readers/PARSIVEL2/NCAR/PECAN_FP3.py +1 -1
disdrodb/l0/readers/PARSIVEL2/NCAR/PERILS_MIPS.py +126 -0
disdrodb/l0/readers/PARSIVEL2/NCAR/PERILS_PIPS.py +165 -0
disdrodb/l0/readers/PARSIVEL2/NCAR/VORTEX_SE_2016_P2.py +1 -1
disdrodb/l0/readers/PARSIVEL2/NCAR/VORTEX_SE_2016_PIPS.py +20 -12
disdrodb/l0/readers/PARSIVEL2/NETHERLANDS/DELFT_NC.py +5 -0
disdrodb/l0/readers/PARSIVEL2/SPAIN/CENER.py +144 -0
disdrodb/l0/readers/PARSIVEL2/SPAIN/CR1000DL.py +201 -0
disdrodb/l0/readers/PARSIVEL2/SPAIN/LIAISE.py +137 -0
disdrodb/l0/readers/PARSIVEL2/USA/C3WE.py +146 -0
disdrodb/l0/readers/PWS100/FRANCE/ENPC_PWS100.py +105 -99
disdrodb/l0/readers/PWS100/FRANCE/ENPC_PWS100_SIRTA.py +151 -0
disdrodb/l1/__init__.py +5 -0
disdrodb/l1/fall_velocity.py +46 -0
disdrodb/l1/filters.py +34 -20
disdrodb/l1/processing.py +46 -45
disdrodb/l1/resampling.py +77 -66
disdrodb/l1_env/routines.py +18 -3
disdrodb/l2/__init__.py +7 -0
disdrodb/l2/empirical_dsd.py +58 -10
disdrodb/l2/processing.py +268 -117
disdrodb/metadata/checks.py +132 -125
disdrodb/metadata/standards.py +3 -1
disdrodb/psd/fitting.py +631 -345
disdrodb/psd/models.py +9 -6
disdrodb/routines/__init__.py +54 -0
disdrodb/{l0/routines.py → routines/l0.py} +316 -355
disdrodb/{l1/routines.py → routines/l1.py} +76 -116
disdrodb/routines/l2.py +1019 -0
disdrodb/{routines.py → routines/wrappers.py} +98 -10
disdrodb/scattering/__init__.py +16 -4
disdrodb/scattering/axis_ratio.py +61 -37
disdrodb/scattering/permittivity.py +504 -0
disdrodb/scattering/routines.py +746 -184
disdrodb/summary/__init__.py +17 -0
disdrodb/summary/routines.py +4196 -0
disdrodb/utils/archiving.py +434 -0
disdrodb/utils/attrs.py +68 -125
disdrodb/utils/cli.py +5 -5
disdrodb/utils/compression.py +30 -1
disdrodb/utils/dask.py +121 -9
disdrodb/utils/dataframe.py +61 -7
disdrodb/utils/decorators.py +31 -0
disdrodb/utils/directories.py +35 -15
disdrodb/utils/encoding.py +37 -19
disdrodb/{l2 → utils}/event.py +15 -173
disdrodb/utils/logger.py +14 -7
disdrodb/utils/manipulations.py +81 -0
disdrodb/utils/routines.py +166 -0
disdrodb/utils/subsetting.py +214 -0
disdrodb/utils/time.py +35 -177
disdrodb/utils/writer.py +20 -7
disdrodb/utils/xarray.py +5 -4
disdrodb/viz/__init__.py +13 -0
disdrodb/viz/plots.py +398 -0
{disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/METADATA +4 -3
{disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/RECORD +139 -98
{disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/entry_points.txt +2 -0
disdrodb/l1/encoding_attrs.py +0 -642
disdrodb/l2/processing_options.py +0 -213
disdrodb/l2/routines.py +0 -868
/disdrodb/l0/readers/PARSIVEL/SLOVENIA/{UL_FGG.py → UL.py} +0 -0
{disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/WHEEL +0 -0
{disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/licenses/LICENSE +0 -0
{disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/top_level.txt +0 -0

disdrodb/l0/configs/PARSIVEL2/l0b_cf_attrs.yml CHANGED Viewed

@@ -47,7 +47,7 @@ number_particles:
 sensor_temperature:
   description: Temperature in sensor housing
   long_name: Temperature of the sensor
-  units: "C"
+  units: "degC"
 sensor_serial_number:
   description: Sensor serial number
   long_name: Serial number of the sensor
@@ -105,15 +105,15 @@ error_code:
 sensor_temperature_pcb:
   description: Temperature in printed circuit board
   long_name: Sensor PCB temperature
-  units: "C"
+  units: "degC"
 sensor_temperature_receiver:
   description: Temperature in right sensor head
   long_name: Sensor receiver temperature
-  units: "C"
+  units: "degC"
 sensor_temperature_trasmitter:
   description: Temperature in left sensor head
   long_name: Sensor trasmitter temperature
-  units: "C"
+  units: "degC"
 rainfall_rate_16_bit_30:
   description: Rainfall rate
   long_name: Rainfall rate max 30 mm/h 16 bit
@@ -161,7 +161,7 @@ raw_drop_number:
 air_temperature:
   description: "Air temperature in degrees Celsius (C)"
   long_name: Air temperature
-  units: "C"
+  units: "degC"
 relative_humidity:
   description: "Relative humidity in percent (%)"
   long_name: Relative humidity

disdrodb/l0/configs/PARSIVEL2/l0b_encodings.yml CHANGED Viewed

@@ -102,7 +102,7 @@ sensor_temperature:
   chunksizes: 5000
   _FillValue: 127
 sensor_serial_number:
-  dtype: object
+  dtype: str
   zlib: false
   complevel: 3
   shuffle: true
@@ -110,7 +110,7 @@ sensor_serial_number:
   contiguous: false
   chunksizes: 5000
 firmware_iop:
-  dtype: object
+  dtype: str
   zlib: false
   complevel: 3
   shuffle: true
@@ -118,7 +118,7 @@ firmware_iop:
   contiguous: false
   chunksizes: 5000
 firmware_dsp:
-  dtype: object
+  dtype: str
   zlib: false
   complevel: 3
   shuffle: true

disdrodb/l0/configs/PARSIVEL2/raw_data_format.yml CHANGED Viewed

@@ -15,7 +15,7 @@ rainfall_accumulated_32bit:
   n_naturals: 4
   data_range:
     - 0
-    - 300.0
+    - 9999.0
   nan_flags: null
   field_number: "02"
 weather_code_synop_4680:

disdrodb/l0/configs/PWS100/l0b_cf_attrs.yml CHANGED Viewed

@@ -25,7 +25,7 @@ sensor_status:
 air_temperature:
   description: "Air temperature in degrees Celsius"
   long_name: Air temperature
-  units: "C"
+  units: "degC"
 relative_humidity:
   description: "Relative humidity in percent (%)"
   long_name: Relative humidity
@@ -33,15 +33,15 @@ relative_humidity:
 wetbulb_temperature:
   description: "Wet bulb temperature in degrees Celsius"
   long_name: Wet bulb temperature
-  units: "C"
+  units: "degC"
 air_temperature_max:
   description: "Maximum air temperature in degrees Celsius"
   long_name: Maximum air temperature
-  units: "C"
+  units: "degC"
 air_temperature_min:
   description: "Minimum air temperature in degrees Celsius"
   long_name: Minimum air temperature
-  units: "C"
+  units: "degC"
 rainfall_rate:
   description: Rainfall rate
   long_name: Rainfall rate

disdrodb/l0/configs/PWS100/raw_data_format.yml CHANGED Viewed

@@ -5,7 +5,7 @@ mor_visibility:
   n_naturals: 4
   data_range:
     - 0
-    - 9999.9
+    - 20000
   nan_flags: null
   field_number: "20"
 weather_code_synop_4680:

disdrodb/l0/l0a_processing.py CHANGED Viewed

@@ -18,13 +18,13 @@
 # -----------------------------------------------------------------------------.
 """Functions to process raw text files into DISDRODB L0A Apache Parquet."""
 import logging
 import os
 from typing import Union
 import numpy as np
 import pandas as pd
+import pyarrow.parquet as pq
 from disdrodb.l0.check_standards import check_l0a_column_names, check_l0a_standards
 from disdrodb.l0.l0b_processing import infer_split_str
@@ -130,11 +130,15 @@ def read_raw_text_file(
     try:
         df = pd.read_csv(filepath, names=column_names, dtype=dtype, **reader_kwargs)
     except pd.errors.EmptyDataError:
+        # if isinstance(filepath, zipfile.ZipExtFile):
+        #     filepath = filepath.name
         msg = f"The following file is empty: {filepath}"
         raise ValueError(msg)
     # Check the dataframe is not empty
     if len(df.index) == 0:
+        # if isinstance(filepath, zipfile.ZipExtFile):
+        #     filepath = filepath.name
         msg = f"The following file is empty: {filepath}"
         raise ValueError(msg)
@@ -265,13 +269,15 @@ def remove_issue_timesteps(df, issue_dict, logger=None, verbose=False):
     # Retrieve timesteps and time_periods
     timesteps = issue_dict.get("timesteps", None)
     time_periods = issue_dict.get("time_periods", None)
+    timesteps = [] if timesteps is None else timesteps
+    time_periods = [] if time_periods is None else time_periods
     # Drop rows of specified timesteps
-    if timesteps:
+    if len(timesteps) > 0:
         df = drop_timesteps(df=df, timesteps=timesteps)
     # Drop rows within specified time_period
-    if time_periods:
+    if len(time_periods) > 0:
         df = drop_time_periods(df, time_periods=time_periods)
     # Report number of dropped rows
@@ -413,6 +419,8 @@ def is_raw_array_string_not_corrupted(string):
     """Check if the raw array is corrupted."""
     if not isinstance(string, str):
         return False
+    if string in ["", "NAN", "NaN"]:
+        return True
     split_str = infer_split_str(string=string)
     list_values = string.split(split_str)
     values = pd.to_numeric(list_values, errors="coerce")
@@ -625,6 +633,9 @@ def sanitize_df(
     # - Sort by time
     df = df.sort_values("time")
+    # - Drop index
+    df = df.reset_index(drop=True)
     # ------------------------------------------------------.
     # - Check column names agrees to DISDRODB standards
     check_l0a_column_names(df, sensor_name=sensor_name)
@@ -755,24 +766,8 @@ def concatenate_dataframe(list_df: list, logger=None, verbose: bool = False) ->
     return df
-def _read_l0a(filepath: str, verbose: bool = False, logger=None, debugging_mode: bool = False) -> pd.DataFrame:
-    # Log
-    msg = f"Reading L0 Apache Parquet file at {filepath} started."
-    log_info(logger=logger, msg=msg, verbose=verbose)
-    # Open file
-    df = pd.read_parquet(filepath)
-    if debugging_mode:
-        df = df.iloc[0:100]
-    # Log
-    msg = f"Reading L0 Apache Parquet file at {filepath} ended."
-    log_info(logger=logger, msg=msg, verbose=verbose)
-    return df
 def read_l0a_dataframe(
     filepaths: Union[str, list],
-    verbose: bool = False,
-    logger=None,
     debugging_mode: bool = False,
 ) -> pd.DataFrame:
     """Read DISDRODB L0A Apache Parquet file(s).
@@ -781,13 +776,10 @@ def read_l0a_dataframe(
     ----------
     filepaths : str or list
         Either a list or a single filepath.
-    verbose : bool
-        Whether to print detailed processing information into terminal.
-        The default is ``False``.
     debugging_mode : bool
         If ``True``, it reduces the amount of data to process.
         If filepaths is a list, it reads only the first 3 files.
-        For each file it select only the first 100 rows.
+        It selects only 100 rows sampled from the first 3 files.
         The default is ``False``.
     Returns
@@ -796,8 +788,6 @@ def read_l0a_dataframe(
         L0A Dataframe.
     """
-    from disdrodb.l0.l0a_processing import concatenate_dataframe
     # ----------------------------------------
     # Check filepaths validity
     if not isinstance(filepaths, (list, str)):
@@ -814,16 +804,22 @@ def read_l0a_dataframe(
     # ---------------------------------------------------
     # Define the list of dataframe
-    list_df = [
-        _read_l0a(filepath, verbose=verbose, logger=logger, debugging_mode=debugging_mode) for filepath in filepaths
-    ]
+    df = pq.ParquetDataset(filepaths).read().to_pandas()
-    # Concatenate dataframe
-    df = concatenate_dataframe(list_df, logger=logger, verbose=verbose)
+    # Reduce rows
+    if debugging_mode:
+        n_rows = min(100, len(df))
+        df = df.sample(n=n_rows)
     # Ensure time is in nanoseconds
     df["time"] = df["time"].astype("M8[ns]")
+    # Ensure sorted by time
+    df = df.sort_values(by="time")
+    # Ensure no index
+    df = df.reset_index(drop=True)
     # ---------------------------------------------------
     # Return dataframe
     return df
@@ -833,14 +829,15 @@ def read_l0a_dataframe(
 #### L0A Utility
-def read_raw_text_files(
+def generate_l0a(
     filepaths: Union[list, str],
     reader,
     sensor_name,
+    issue_dict=None,
     verbose=True,
     logger=None,
 ) -> pd.DataFrame:
-    """Read and parse a list for raw files into a dataframe.
+    """Read and parse a list of raw files and generate a DISDRODB L0A dataframe.
     Parameters
     ----------
@@ -851,6 +848,13 @@ def read_raw_text_files(
         Format: reader(filepath, logger=None)
     sensor_name : str
         Name of the sensor.
+    issue_dict : dict, optional
+        Issue dictionary providing information on timesteps to remove.
+        The default is an empty dictionary ``{}``.
+        Valid issue_dict key are ``'timesteps'`` and ``'time_periods'``.
+        Valid issue_dict values are list of datetime64 values (with second accuracy).
+        To correctly format and check the validity of the ``issue_dict``, use
+        the ``disdrodb.l0.issue.check_issue_dict`` function.
     verbose : bool
         Whether to verbose the processing. The default is ``True``.
@@ -886,6 +890,7 @@ def read_raw_text_files(
             df = sanitize_df(
                 df=df,
                 sensor_name=sensor_name,
+                issue_dict=issue_dict,
                 logger=logger,
                 verbose=verbose,
             )

disdrodb/l0/l0b_nc_processing.py CHANGED Viewed

@@ -19,6 +19,7 @@
 """Functions to process DISDRODB raw netCDF files into DISDRODB L0B netCDF files."""
 import logging
+from typing import Union
 import numpy as np
@@ -33,8 +34,8 @@ from disdrodb.l0.standards import (
     get_valid_variable_names,
 )
 from disdrodb.utils.logger import (
+    log_error,
     # log_warning,
-    # log_debug,
     log_info,
 )
@@ -169,6 +170,8 @@ def standardize_raw_dataset(ds, dict_names, sensor_name):
     # If missing variables, infill with NaN array
     missing_vars = _get_missing_variables(ds, dict_names, sensor_name)
+    if "raw_drop_number" in missing_vars:
+        raise ValueError("The raw drop spectrum is not present in the netCDF file!")
     if len(missing_vars) > 0:
         ds = add_dataset_missing_variables(ds=ds, missing_vars=missing_vars, sensor_name=sensor_name)
@@ -343,7 +346,7 @@ def drop_timesteps(ds, timesteps: list):
     # Ensure there's at least one timestep left
     if ds_filtered.sizes.get("time", 0) == 0:
         raise ValueError(
-            "No timesteps left after removing problematic timesteps. " "Maybe you need to adjust the issue YAML file.",
+            "No timesteps left after removing problematic timesteps. Maybe you need to adjust the issue YAML file.",
         )
     return ds_filtered
@@ -419,16 +422,21 @@ def remove_issue_timesteps(
     ValueError
         If after removing specified timesteps/periods no data remains.
     """
+    # Retrieve number of initial rows
     n_initial = ds.sizes.get("time", 0)
-    timesteps = issue_dict.get("timesteps", []) or []
-    time_periods = issue_dict.get("time_periods", []) or []
+    # Retrieve timesteps and time_periods
+    timesteps = issue_dict.get("timesteps")
+    time_periods = issue_dict.get("time_periods")
+    timesteps = [] if timesteps is None else timesteps
+    time_periods = [] if time_periods is None else time_periods
     # Drop individual timesteps
-    if timesteps:
+    if len(timesteps) > 0:
         ds = drop_timesteps(ds, timesteps)
     # Drop intervals of time
-    if time_periods:
+    if len(time_periods) > 0:
         ds = drop_time_periods(ds, time_periods)
     # Report number dropped
@@ -454,8 +462,8 @@ def sanitize_ds(
     ----------
     ds : xarray.Dataset
         Raw xarray dataset
-    attrs: dict
-        Global metadata to attach as global attributes to the xr.Dataset.
+    metadata: dict
+        Station metadata to attach as global attributes to the xr.Dataset.
     sensor_name : str
         Name of the sensor.
     verbose : bool
@@ -525,3 +533,105 @@ def open_raw_netcdf_file(
     # Log information
     log_info(logger=logger, msg=f"netCDF file {filepath} has been loaded successively into xarray.", verbose=False)
     return ds
+def generate_l0b_from_nc(
+    filepaths: Union[list, str],
+    reader,
+    sensor_name,
+    metadata,
+    issue_dict=None,
+    verbose=True,
+    logger=None,
+):
+    """Read and parse a list of raw netCDF files and generate a DISDRODB L0B dataset.
+    Parameters
+    ----------
+    filepaths : Union[list,str]
+        File(s) path(s)
+    reader:
+        DISDRODB reader function.
+        Format: reader(filepath, logger=None)
+    sensor_name : str
+        Name of the sensor.
+    metadata: dict
+        Station metadata to attach as global attributes to the xr.Dataset.
+    issue_dict : dict, optional
+        Issue dictionary providing information on timesteps to remove.
+        The default is an empty dictionary ``{}``.
+        Valid issue_dict key are ``'timesteps'`` and ``'time_periods'``.
+        Valid issue_dict values are list of datetime64 values (with second accuracy).
+        To correctly format and check the validity of the ``issue_dict``, use
+        the ``disdrodb.l0.issue.check_issue_dict`` function.
+    verbose : bool
+        Whether to verbose the processing. The default is ``True``.
+    Returns
+    -------
+    xarray.Dataset
+        DISDRODB L0B Dataset.
+    Raises
+    ------
+    ValueError
+        Input parameters can not be used or the raw file can not be processed.
+    """
+    import xarray as xr
+    # Check input list
+    if isinstance(filepaths, str):
+        filepaths = [filepaths]
+    if len(filepaths) == 0:
+        raise ValueError("'filepaths' must contains at least 1 filepath.")
+    # ------------------------------------------------------.
+    # Loop over all raw files
+    n_files = len(filepaths)
+    processed_file_counter = 0
+    list_skipped_files_msg = []
+    list_ds = []
+    for filepath in filepaths:
+        # Try read the raw netCDF file
+        try:
+            ds = reader(filepath, logger=logger)
+            # Sanitize the dataframe
+            ds = sanitize_ds(
+                ds=ds,
+                sensor_name=sensor_name,
+                metadata=metadata,
+                issue_dict=issue_dict,
+                verbose=verbose,
+                logger=logger,
+            )
+            # Append dataframe to the list
+            list_ds.append(ds)
+            # Update the logger
+            processed_file_counter += 1
+            msg = f"Raw file '{filepath}' processed successfully ({processed_file_counter}/{n_files})."
+            log_info(logger=logger, msg=msg, verbose=verbose)
+        # Skip the file if the processing fails
+        except Exception as e:
+            # Update the logger
+            msg = f"{filepath} has been skipped. The error is: {e}."
+            log_error(logger=logger, msg=msg, verbose=verbose)
+            list_skipped_files_msg.append(msg)
+    # Update logger
+    msg = f"{len(list_skipped_files_msg)} of {n_files} have been skipped."
+    log_info(logger=logger, msg=msg, verbose=verbose)
+    # Check if there are files to concatenate
+    if len(list_ds) == 0:
+        raise ValueError("Any raw file could be read!")
+    ##----------------------------------------------------------------.
+    # Concatenate the datasets
+    list_ds = [ds.chunk({"time": -1}) for ds in list_ds]
+    ds = xr.concat(list_ds, dim="time", join="outer", compat="no_conflicts", combine_attrs="override").sortby("time")
+    ds = ds.compute()
+    # Return the dataframe
+    return ds

disdrodb/l0/l0b_processing.py CHANGED Viewed

@@ -19,7 +19,6 @@
 """Functions to process DISDRODB L0A files into DISDRODB L0B netCDF files."""
 import logging
-import os
 import numpy as np
 import pandas as pd
@@ -43,13 +42,8 @@ from disdrodb.utils.attrs import (
     set_coordinate_attributes,
     set_disdrodb_attrs,
 )
-from disdrodb.utils.directories import create_directory, remove_if_exists
 from disdrodb.utils.encoding import set_encodings
-from disdrodb.utils.logger import (
-    # log_warning,
-    # log_debug,
-    log_info,
-)
+from disdrodb.utils.logger import log_info
 from disdrodb.utils.time import ensure_sorted_by_time
 logger = logging.getLogger(__name__)
@@ -246,12 +240,20 @@ def retrieve_l0b_arrays(
             unavailable_keys.append(key)
             continue
-        # Ensure is a string
-        df_series = df[key].astype(str)
+        # Ensure is a string, get a numpy array for each row and then stack
+        # - Option 1: Clear but lot of copies
+        # df_series = df[key].astype(str)
+        # list_arr = df_series.apply(_format_string_array, n_values=n_values)
+        # arr = np.stack(list_arr, axis=0)
+        # - Option 2: still copies
+        # arr = np.vstack(_format_string_array(s, n_values=n_values) for s in df_series.astype(str))
-        # Get a numpy array for each row and then stack
-        list_arr = df_series.apply(_format_string_array, n_values=n_values)
-        arr = np.stack(list_arr, axis=0)
+        # - Option 3: more memory efficient
+        n_timesteps = len(df[key])
+        arr = np.empty((n_timesteps, n_values), dtype=float)  # preallocates
+        for i, s in enumerate(df[key].astype(str)):
+            arr[i, :] = _format_string_array(s, n_values=n_values)
         # Retrieve dimensions
         dims_order = dims_order_dict[key]
@@ -333,18 +335,6 @@ def _set_variable_attributes(ds: xr.Dataset, sensor_name: str) -> xr.Dataset:
     return ds
-def _set_dataset_attrs(ds, sensor_name):
-    """Set variable and coordinates attributes."""
-    # - Add netCDF variable attributes
-    # --> Attributes: long_name, units, descriptions, valid_min, valid_max
-    ds = _set_variable_attributes(ds=ds, sensor_name=sensor_name)
-    # - Add netCDF coordinate attributes
-    ds = set_coordinate_attributes(ds=ds)
-    #  - Set DISDRODB global attributes
-    ds = set_disdrodb_attrs(ds=ds, product="L0B")
-    return ds
 def add_dataset_crs_coords(ds):
     """Add the CRS coordinate to the xr.Dataset."""
     # TODO: define CF-compliant CRS !
@@ -386,13 +376,13 @@ def _define_dataset_variables(df, sensor_name, logger=None, verbose=False):
     return data_vars
-def create_l0b_from_l0a(
+def generate_l0b(
     df: pd.DataFrame,
     metadata: dict,
     logger=None,
     verbose: bool = False,
 ) -> xr.Dataset:
-    """Transform the L0A dataframe to the L0B xr.Dataset.
+    """Transform the DISDRODB L0A dataframe to the DISDRODB L0B xr.Dataset.
     Parameters
     ----------
@@ -475,16 +465,25 @@ def finalize_dataset(ds, sensor_name, metadata):
     ds = add_dataset_crs_coords(ds)
     # Set netCDF dimension order
+    # --> Required for correct encoding !
     ds = ds.transpose("time", "diameter_bin_center", ...)
-    # Add netCDF variable and coordinate attributes
-    ds = _set_dataset_attrs(ds, sensor_name)
     # Ensure variables with dtype object are converted to string
     ds = _convert_object_variables_to_string(ds)
+    # Add netCDF variable and coordinate attributes
+    # - Add variable attributes: long_name, units, descriptions, valid_min, valid_max
+    ds = _set_variable_attributes(ds=ds, sensor_name=sensor_name)
+    # - Add netCDF coordinate attributes
+    ds = set_coordinate_attributes(ds=ds)
+    #  - Set DISDRODB global attributes
+    ds = set_disdrodb_attrs(ds=ds, product="L0B")
     # Check L0B standards
     check_l0b_standards(ds)
+    # Set L0B encodings
+    ds = set_l0b_encodings(ds=ds, sensor_name=sensor_name)
     return ds
@@ -503,43 +502,9 @@ def set_l0b_encodings(ds: xr.Dataset, sensor_name: str):
     xarray.Dataset
         Output xarray dataset.
     """
-    encoding_dict = get_l0b_encodings_dict(sensor_name)
-    ds = set_encodings(ds=ds, encoding_dict=encoding_dict)
+    encodings_dict = get_l0b_encodings_dict(sensor_name)
+    ds = set_encodings(ds=ds, encodings_dict=encodings_dict)
     return ds
-def write_l0b(ds: xr.Dataset, filepath: str, force=False) -> None:
-    """Save the xarray dataset into a NetCDF file.
-    Parameters
-    ----------
-    ds  : xarray.Dataset
-        Input xarray dataset.
-    filepath : str
-        Output file path.
-    sensor_name : str
-        Name of the sensor.
-    force : bool, optional
-        Whether to overwrite existing data.
-        If ``True``, overwrite existing data into destination directories.
-        If ``False``, raise an error if there are already data into destination directories. This is the default.
-    """
-    # Create station directory if does not exist
-    create_directory(os.path.dirname(filepath))
-    # Check if the file already exists
-    # - If force=True --> Remove it
-    # - If force=False --> Raise error
-    remove_if_exists(filepath, force=force)
-    # Get sensor name from dataset
-    sensor_name = ds.attrs.get("sensor_name")
-    # Set encodings
-    ds = set_l0b_encodings(ds=ds, sensor_name=sensor_name)
-    # Write netcdf
-    ds.to_netcdf(filepath, engine="netcdf4")
 ####--------------------------------------------------------------------------.

disdrodb 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

disdrodb 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl