disdrodb 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- disdrodb/__init__.py +68 -34
- disdrodb/_config.py +5 -4
- disdrodb/_version.py +16 -3
- disdrodb/accessor/__init__.py +20 -0
- disdrodb/accessor/methods.py +125 -0
- disdrodb/api/checks.py +177 -24
- disdrodb/api/configs.py +3 -3
- disdrodb/api/info.py +13 -13
- disdrodb/api/io.py +281 -22
- disdrodb/api/path.py +184 -195
- disdrodb/api/search.py +18 -9
- disdrodb/cli/disdrodb_create_summary.py +103 -0
- disdrodb/cli/disdrodb_create_summary_station.py +91 -0
- disdrodb/cli/disdrodb_run_l0.py +1 -1
- disdrodb/cli/disdrodb_run_l0_station.py +1 -1
- disdrodb/cli/disdrodb_run_l0a_station.py +1 -1
- disdrodb/cli/disdrodb_run_l0b.py +1 -1
- disdrodb/cli/disdrodb_run_l0b_station.py +3 -3
- disdrodb/cli/disdrodb_run_l0c.py +1 -1
- disdrodb/cli/disdrodb_run_l0c_station.py +3 -3
- disdrodb/cli/disdrodb_run_l1_station.py +2 -2
- disdrodb/cli/disdrodb_run_l2e_station.py +2 -2
- disdrodb/cli/disdrodb_run_l2m_station.py +2 -2
- disdrodb/configs.py +149 -4
- disdrodb/constants.py +61 -0
- disdrodb/data_transfer/download_data.py +127 -11
- disdrodb/etc/configs/attributes.yaml +339 -0
- disdrodb/etc/configs/encodings.yaml +473 -0
- disdrodb/etc/products/L1/global.yaml +13 -0
- disdrodb/etc/products/L2E/10MIN.yaml +12 -0
- disdrodb/etc/products/L2E/1MIN.yaml +1 -0
- disdrodb/etc/products/L2E/global.yaml +22 -0
- disdrodb/etc/products/L2M/10MIN.yaml +12 -0
- disdrodb/etc/products/L2M/GAMMA_ML.yaml +8 -0
- disdrodb/etc/products/L2M/NGAMMA_GS_LOG_ND_MAE.yaml +6 -0
- disdrodb/etc/products/L2M/NGAMMA_GS_ND_MAE.yaml +6 -0
- disdrodb/etc/products/L2M/NGAMMA_GS_Z_MAE.yaml +6 -0
- disdrodb/etc/products/L2M/global.yaml +26 -0
- disdrodb/issue/writer.py +2 -0
- disdrodb/l0/__init__.py +13 -0
- disdrodb/l0/configs/LPM/l0b_cf_attrs.yml +4 -4
- disdrodb/l0/configs/PARSIVEL/l0b_cf_attrs.yml +1 -1
- disdrodb/l0/configs/PARSIVEL/l0b_encodings.yml +3 -3
- disdrodb/l0/configs/PARSIVEL/raw_data_format.yml +1 -1
- disdrodb/l0/configs/PARSIVEL2/l0b_cf_attrs.yml +5 -5
- disdrodb/l0/configs/PARSIVEL2/l0b_encodings.yml +3 -3
- disdrodb/l0/configs/PARSIVEL2/raw_data_format.yml +1 -1
- disdrodb/l0/configs/PWS100/l0b_cf_attrs.yml +4 -4
- disdrodb/l0/configs/PWS100/raw_data_format.yml +1 -1
- disdrodb/l0/l0a_processing.py +37 -32
- disdrodb/l0/l0b_nc_processing.py +118 -8
- disdrodb/l0/l0b_processing.py +30 -65
- disdrodb/l0/l0c_processing.py +369 -259
- disdrodb/l0/readers/LPM/ARM/ARM_LPM.py +7 -0
- disdrodb/l0/readers/LPM/NETHERLANDS/DELFT_LPM_NC.py +66 -0
- disdrodb/l0/readers/LPM/SLOVENIA/{CRNI_VRH.py → UL.py} +3 -0
- disdrodb/l0/readers/LPM/SWITZERLAND/INNERERIZ_LPM.py +195 -0
- disdrodb/l0/readers/PARSIVEL/GPM/PIERS.py +0 -2
- disdrodb/l0/readers/PARSIVEL/JAPAN/JMA.py +4 -1
- disdrodb/l0/readers/PARSIVEL/NCAR/PECAN_MOBILE.py +1 -1
- disdrodb/l0/readers/PARSIVEL/NCAR/VORTEX2_2009.py +1 -1
- disdrodb/l0/readers/PARSIVEL2/ARM/ARM_PARSIVEL2.py +4 -0
- disdrodb/l0/readers/PARSIVEL2/BELGIUM/ILVO.py +168 -0
- disdrodb/l0/readers/PARSIVEL2/CANADA/UQAM_NC.py +69 -0
- disdrodb/l0/readers/PARSIVEL2/DENMARK/DTU.py +165 -0
- disdrodb/l0/readers/PARSIVEL2/FINLAND/FMI_PARSIVEL2.py +69 -0
- disdrodb/l0/readers/PARSIVEL2/FRANCE/ENPC_PARSIVEL2.py +255 -134
- disdrodb/l0/readers/PARSIVEL2/FRANCE/OSUG.py +525 -0
- disdrodb/l0/readers/PARSIVEL2/FRANCE/SIRTA_PARSIVEL2.py +1 -1
- disdrodb/l0/readers/PARSIVEL2/GPM/GCPEX.py +9 -7
- disdrodb/l0/readers/PARSIVEL2/KIT/BURKINA_FASO.py +1 -1
- disdrodb/l0/readers/PARSIVEL2/KIT/TEAMX.py +123 -0
- disdrodb/l0/readers/PARSIVEL2/{NETHERLANDS/DELFT.py → MPI/BCO_PARSIVEL2.py} +41 -71
- disdrodb/l0/readers/PARSIVEL2/MPI/BOWTIE.py +220 -0
- disdrodb/l0/readers/PARSIVEL2/NASA/APU.py +120 -0
- disdrodb/l0/readers/PARSIVEL2/NASA/LPVEX.py +109 -0
- disdrodb/l0/readers/PARSIVEL2/NCAR/FARM_PARSIVEL2.py +1 -0
- disdrodb/l0/readers/PARSIVEL2/NCAR/PECAN_FP3.py +1 -1
- disdrodb/l0/readers/PARSIVEL2/NCAR/PERILS_MIPS.py +126 -0
- disdrodb/l0/readers/PARSIVEL2/NCAR/PERILS_PIPS.py +165 -0
- disdrodb/l0/readers/PARSIVEL2/NCAR/VORTEX_SE_2016_P2.py +1 -1
- disdrodb/l0/readers/PARSIVEL2/NCAR/VORTEX_SE_2016_PIPS.py +20 -12
- disdrodb/l0/readers/PARSIVEL2/NETHERLANDS/DELFT_NC.py +5 -0
- disdrodb/l0/readers/PARSIVEL2/SPAIN/CENER.py +144 -0
- disdrodb/l0/readers/PARSIVEL2/SPAIN/CR1000DL.py +201 -0
- disdrodb/l0/readers/PARSIVEL2/SPAIN/LIAISE.py +137 -0
- disdrodb/l0/readers/PARSIVEL2/USA/C3WE.py +146 -0
- disdrodb/l0/readers/PWS100/FRANCE/ENPC_PWS100.py +105 -99
- disdrodb/l0/readers/PWS100/FRANCE/ENPC_PWS100_SIRTA.py +151 -0
- disdrodb/l1/__init__.py +5 -0
- disdrodb/l1/fall_velocity.py +46 -0
- disdrodb/l1/filters.py +34 -20
- disdrodb/l1/processing.py +46 -45
- disdrodb/l1/resampling.py +77 -66
- disdrodb/l1_env/routines.py +18 -3
- disdrodb/l2/__init__.py +7 -0
- disdrodb/l2/empirical_dsd.py +58 -10
- disdrodb/l2/processing.py +268 -117
- disdrodb/metadata/checks.py +132 -125
- disdrodb/metadata/standards.py +3 -1
- disdrodb/psd/fitting.py +631 -345
- disdrodb/psd/models.py +9 -6
- disdrodb/routines/__init__.py +54 -0
- disdrodb/{l0/routines.py → routines/l0.py} +316 -355
- disdrodb/{l1/routines.py → routines/l1.py} +76 -116
- disdrodb/routines/l2.py +1019 -0
- disdrodb/{routines.py → routines/wrappers.py} +98 -10
- disdrodb/scattering/__init__.py +16 -4
- disdrodb/scattering/axis_ratio.py +61 -37
- disdrodb/scattering/permittivity.py +504 -0
- disdrodb/scattering/routines.py +746 -184
- disdrodb/summary/__init__.py +17 -0
- disdrodb/summary/routines.py +4196 -0
- disdrodb/utils/archiving.py +434 -0
- disdrodb/utils/attrs.py +68 -125
- disdrodb/utils/cli.py +5 -5
- disdrodb/utils/compression.py +30 -1
- disdrodb/utils/dask.py +121 -9
- disdrodb/utils/dataframe.py +61 -7
- disdrodb/utils/decorators.py +31 -0
- disdrodb/utils/directories.py +35 -15
- disdrodb/utils/encoding.py +37 -19
- disdrodb/{l2 → utils}/event.py +15 -173
- disdrodb/utils/logger.py +14 -7
- disdrodb/utils/manipulations.py +81 -0
- disdrodb/utils/routines.py +166 -0
- disdrodb/utils/subsetting.py +214 -0
- disdrodb/utils/time.py +35 -177
- disdrodb/utils/writer.py +20 -7
- disdrodb/utils/xarray.py +5 -4
- disdrodb/viz/__init__.py +13 -0
- disdrodb/viz/plots.py +398 -0
- {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/METADATA +4 -3
- {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/RECORD +139 -98
- {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/entry_points.txt +2 -0
- disdrodb/l1/encoding_attrs.py +0 -642
- disdrodb/l2/processing_options.py +0 -213
- disdrodb/l2/routines.py +0 -868
- /disdrodb/l0/readers/PARSIVEL/SLOVENIA/{UL_FGG.py → UL.py} +0 -0
- {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/WHEEL +0 -0
- {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/top_level.txt +0 -0
disdrodb/utils/cli.py
CHANGED

```diff
@@ -21,7 +21,7 @@
 import click
 
 
-def _execute_cmd(cmd, raise_error=False):
+def execute_cmd(cmd, raise_error=False):
     """Execute command in the terminal, streaming output in python console."""
     from subprocess import PIPE, CalledProcessError, Popen
 
@@ -34,7 +34,7 @@ def _execute_cmd(cmd, raise_error=False):
         raise CalledProcessError(p.returncode, p.args)
 
 
-def _parse_empty_string_and_none(args):
+def parse_empty_string_and_none(args):
     """Utility to parse argument passed from the command line.
 
     If ``args = ''``, returns None.
@@ -58,7 +58,7 @@ def parse_arg_to_list(args):
     If ``args = 'variable1 variable2'`` returns ``[variable1, variable2]``.
     """
     # If '' or 'None' --> Set to None
-    args = _parse_empty_string_and_none(args)
+    args = parse_empty_string_and_none(args)
     # - If multiple arguments, split by space
     if isinstance(args, str):
         # - Split by space
@@ -75,7 +75,7 @@ def parse_archive_dir(archive_dir: str):
     If ``archive_dir = ''`` returns ``None``.
     """
     # If '', set to 'None'
-    return _parse_empty_string_and_none(archive_dir)
+    return parse_empty_string_and_none(archive_dir)
 
 
 def click_station_arguments(function: object):
@@ -86,7 +86,7 @@ def click_station_arguments(function: object):
     function : object
         Function.
     """
-    function = click.argument("station_name", metavar="<
+    function = click.argument("station_name", metavar="<STATION_NAME>")(function)
     function = click.argument("campaign_name", metavar="<CAMPAIGN_NAME>")(function)
     function = click.argument("data_source", metavar="<DATA_SOURCE>")(function)
     return function
```
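Illustrative only: a quick sketch of the now-public helpers' behavior, following their docstrings in this diff (module path assumed to be disdrodb.utils.cli as shown above):

```python
from disdrodb.utils.cli import parse_arg_to_list, parse_archive_dir

# '' (and 'None') are normalized to None
assert parse_archive_dir("") is None

# A space-separated CLI string becomes a list of values
assert parse_arg_to_list("variable1 variable2") == ["variable1", "variable2"]
```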
disdrodb/utils/compression.py
CHANGED

```diff
@@ -22,6 +22,7 @@ import bz2
 import gzip
 import os
 import shutil
+import subprocess
 import tempfile
 import zipfile
 from typing import Optional
@@ -53,6 +54,34 @@ def unzip_file(filepath: str, dest_path: str) -> None:
         zip_ref.extractall(dest_path)
 
 
+def unzip_file_on_terminal(filepath: str, dest_path: str) -> str:
+    """Unzip a file into a directory using the terminal command.
+
+    Parameters
+    ----------
+    filepath : str
+        Path of the file to unzip.
+    dest_path : str
+        Path of the destination directory.
+    """
+    os.makedirs(dest_path, exist_ok=True)
+
+    if os.name == "nt":
+        # Windows: use PowerShell Expand-Archive
+        cmd = [
+            "powershell.exe",
+            "-NoProfile",
+            "-NonInteractive",
+            "-Command",
+            f"Expand-Archive -LiteralPath '{filepath}' -DestinationPath '{dest_path}' -Force",
+        ]
+    else:
+        # macOS/Linux: use unzip
+        cmd = ["unzip", "-q", filepath, "-d", dest_path]
+
+    subprocess.run(cmd, check=True)
+
+
 def _zip_dir(dir_path: str) -> str:
     """Zip a directory into a file located in the same directory.
 
@@ -157,7 +186,7 @@ def compress_station_files(
         raise ValueError(f"Station data directory {station_dir} does not exist.")
 
     # Get list of files inside the station directory (in all nested directories)
-    filepaths = list_files(station_dir,
+    filepaths = list_files(station_dir, recursive=True)
     for filepath in filepaths:
         _ = _compress_file(filepath, method, skip=skip)
 
```
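A minimal usage sketch of the new unzip_file_on_terminal helper; the archive and destination paths below are hypothetical. It shells out to PowerShell's Expand-Archive on Windows and to unzip elsewhere, so those tools must be available, and with check=True a non-zero exit raises subprocess.CalledProcessError:

```python
from disdrodb.utils.compression import unzip_file_on_terminal

# Hypothetical paths: extracts archive.zip into /tmp/extracted,
# creating the destination directory if needed.
unzip_file_on_terminal("/tmp/archive.zip", dest_path="/tmp/extracted")
```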
disdrodb/utils/dask.py
CHANGED

```diff
@@ -16,31 +16,82 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 # -----------------------------------------------------------------------------.
-"""Utilities for Dask Distributed computations."""
+"""Utilities for Dask Distributed Computations."""
 import logging
 import os
 
+import numpy as np
 
-def initialize_dask_cluster():
+
+def check_parallel_validity(parallel):
+    """Check validity of parallel option given Dask settings."""
+    import dask
+
+    scheduler = dask.config.get("scheduler", None)
+    if scheduler is None:
+        return parallel
+    if scheduler in ["synchronous", "threads"]:
+        return False
+    if scheduler == "distributed":
+        from dask.distributed import default_client
+
+        client = default_client()
+        info = client.scheduler_info()
+
+        # If ThreadWorker, only 1 pid
+        pids = list(client.run(os.getpid).values())
+        if len(np.unique(pids)) == 1:
+            return False
+
+        # If ProcessWorker
+        # - Check single thread per worker to avoid locks
+        nthreads_per_process = np.array([v["nthreads"] for v in info["workers"].values()])
+        if not np.all(nthreads_per_process == 1):
+            print(
+                "To open netCDFs in parallel with dask distributed (processes=True), please set threads_per_worker=1 !",
+            )
+            return False
+
+    # Otherwise let the user choose
+    return parallel
+
+
+def initialize_dask_cluster(minimum_memory=None):
     """Initialize Dask Cluster."""
     import dask
+    import psutil
+
+    # Silence dask warnings
+    # dask.config.set({"logging.distributed": "error"})
+    # Import dask.distributed after setting the config
     from dask.distributed import Client, LocalCluster
+    from dask.utils import parse_bytes
 
     # Set HDF5_USE_FILE_LOCKING to avoid going stuck with HDF
     os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
-
-
+
+    # Retrieve the number of processes to run
+    available_workers = os.cpu_count() - 2  # if not set, all CPUs minus 2
     num_workers = dask.config.get("num_workers", available_workers)
-
-
-
+
+    # If memory limit specified, ensure correct amount of workers
+    if minimum_memory is not None:
+        # Compute available memory (in bytes)
+        total_memory = psutil.virtual_memory().total
+        # Get minimum memory per worker (in bytes)
+        minimum_memory = parse_bytes(minimum_memory)
+        # Determine number of workers constrained by memory
+        maximum_workers_allowed = max(1, total_memory // minimum_memory)
+        # Respect both CPU and memory requirements
+        num_workers = min(maximum_workers_allowed, num_workers)
+
     # Create dask.distributed local cluster
     cluster = LocalCluster(
         n_workers=num_workers,
         threads_per_worker=1,
         processes=True,
-
-
+        memory_limit=0,  # this avoid flexible dask memory management
+        silence_logs=logging.ERROR,
     )
     client = Client(cluster)
     return cluster, client
@@ -60,3 +111,64 @@ def close_dask_cluster(cluster, client):
     finally:
         # Restore the original log level
         logger.setLevel(original_level)
+
+
+def execute_tasks_safely(list_tasks, parallel: bool, logs_dir: str):
+    """
+    Execute Dask tasks and skip failed ones.
+
+    Parameters
+    ----------
+    list_tasks : list
+        List of dask delayed objects or results.
+    parallel : bool
+        Whether to execute in parallel with Dask or not.
+    logs_dir : str
+        Directory to store FAILED_TASKS.log.
+
+    Returns
+    -------
+    list_logs : list
+        List of task results. For failed tasks, adds the path
+        to FAILED_TASKS.log in place of the result.
+    """
+    from dask.distributed import get_client
+
+    # Ensure logs_dir exists
+    os.makedirs(logs_dir, exist_ok=True)
+
+    # Define file name where to log failed dask tasks
+    failed_log_path = os.path.join(logs_dir, "FAILED_DASK_TASKS.log")
+
+    if not parallel:
+        # Non-parallel mode: just return results directly
+        return list_tasks
+
+    # Ensure we have a Dask client
+    try:
+        client = get_client()
+    except ValueError:
+        raise ValueError("No Dask Distributed Client found.")
+
+    # Compute tasks (all concurrently)
+    # - Runs tasks == num_workers * threads_per_worker (which is 1 for DISDRODB)
+    # - If errors occurs in some, skip it
+    futures = client.compute(list_tasks)
+    results = client.gather(futures, errors="skip")
+
+    # Collect failed futures
+    failed_futures = [f for f in futures if f.status != "finished"]  # "error"
+
+    # If no tasks failed, return results
+    if not failed_futures:
+        return results
+
+    # Otherwise define log file listing failed tasks
+    with open(failed_log_path, "w") as f:
+        for fut in failed_futures:
+            err = fut.exception()
+            f.write(f"ERROR - DASK TASK FAILURE - Task {fut.key} failed: {err}\n")
+
+    # Append to list of log filepaths (results) the dask failing log
+    results.append(failed_log_path)
+    return results
```
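A sketch of how the new cluster and task helpers could be combined; the process function and logs_dir path are hypothetical. execute_tasks_safely gathers with errors="skip", so one failing task does not abort the batch; instead the path to FAILED_DASK_TASKS.log is appended to the results:

```python
import dask

from disdrodb.utils.dask import (
    close_dask_cluster,
    execute_tasks_safely,
    initialize_dask_cluster,
)

# Workers are capped so that each one gets at least 4 GB of RAM
cluster, client = initialize_dask_cluster(minimum_memory="4GB")

@dask.delayed
def process(n):  # hypothetical task
    if n == 2:
        raise ValueError("boom")  # this task is skipped, not fatal
    return n * 10

tasks = [process(n) for n in range(4)]
results = execute_tasks_safely(tasks, parallel=True, logs_dir="logs")
close_dask_cluster(cluster, client)
```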
disdrodb/utils/dataframe.py
CHANGED

```diff
@@ -20,6 +20,8 @@
 import numpy as np
 import pandas as pd
 
+from disdrodb.utils.warnings import suppress_warnings
+
 
 def log_arange(start, stop, log_step=0.1, base=10):
     """
@@ -47,7 +49,39 @@ def log_arange(start, stop, log_step=0.1, base=10):
     log_start = np.log(start) / np.log(base)
     log_stop = np.log(stop) / np.log(base)
 
-    log_values = np.arange(log_start, log_stop, log_step)
+    log_values = np.arange(log_start, log_stop + log_step / 2, log_step)
+    return base**log_values
+
+
+def log_linspace(start, stop, n_bins, base=10):
+    """
+    Return numbers spaced evenly on a log scale between start and stop.
+
+    Parameters
+    ----------
+    start : float
+        The starting value of the sequence (must be > 0).
+    stop : float
+        The end value of the sequence (must be > 0).
+    n_bins : int
+        The number of points to generate (including start and stop).
+    base : float
+        The logarithmic base (default is 10).
+
+    Returns
+    -------
+    np.ndarray
+        Array of values spaced evenly in log space.
+    """
+    if start <= 0 or stop <= 0:
+        raise ValueError("Both start and stop must be > 0 for log spacing.")
+    if n_bins < 2:
+        raise ValueError("n_bins must be >= 2 to include start and stop values.")
+
+    log_start = np.log(start) / np.log(base)
+    log_stop = np.log(stop) / np.log(base)
+
+    log_values = np.linspace(log_start, log_stop, n_bins)
     return base**log_values
 
 
@@ -100,6 +134,9 @@ def compute_1d_histogram(df, column, variables=None, bins=10, labels=None, prefi
     if len(df) == 0:
         raise ValueError("No valid data points after removing NaN values")
 
+    # Keep only data within bin range
+    df = df[(df[column] >= bins[0]) & (df[column] < bins[-1])]
+
     # Create binned columns with explicit handling of out-of-bounds values
     df[f"{column}_binned"] = pd.cut(df[column], bins=bins, include_lowest=True)
 
@@ -134,7 +171,7 @@ def compute_1d_histogram(df, column, variables=None, bins=10, labels=None, prefi
             (f"{prefix}std", "std"),
             (f"{prefix}min", "min"),
             (f"{prefix}max", "max"),
-            (f"{prefix}mad", lambda s:
+            (f"{prefix}mad", lambda s: (s - s.median()).abs().median()),
         ]
         if i == 0:
             list_stats.append(("count", "count"))
@@ -142,7 +179,8 @@ def compute_1d_histogram(df, column, variables=None, bins=10, labels=None, prefi
         list_stats = [("count", "count")]
 
         # Compute statistics
-        df_stats = df_grouped[var].agg(list_stats)
+        with suppress_warnings():
+            df_stats = df_grouped[var].agg(list_stats)
 
         # Compute other variable statistics
         if variables_specified:
@@ -253,8 +291,18 @@ def compute_2d_histogram(
         raise ValueError("No valid data points after removing NaN values")
 
     # Create binned columns with explicit handling of out-of-bounds values
-    df[f"{x}_binned"] = pd.cut(
-
+    df[f"{x}_binned"] = pd.cut(
+        df[x],
+        bins=pd.IntervalIndex.from_breaks(x_bins, closed="right"),
+        include_lowest=True,
+        ordered=True,
+    )
+    df[f"{y}_binned"] = pd.cut(
+        df[y],
+        bins=pd.IntervalIndex.from_breaks(y_bins, closed="right"),
+        include_lowest=True,
+        ordered=True,
+    )
 
     # Create complete IntervalIndex for both dimensions
     x_intervals = df[f"{x}_binned"].cat.categories
@@ -318,8 +366,8 @@ def compute_2d_histogram(
     df_stats = df_stats.reindex(full_index)
 
     # Determine coordinates
-    x_centers = x_intervals.mid
-    y_centers = y_intervals.mid
+    x_centers = np.array(x_intervals.mid)
+    y_centers = np.array(y_intervals.mid)
 
     # Use provided labels if available
     x_coords = x_labels if x_labels is not None else x_centers
@@ -337,6 +385,12 @@ def compute_2d_histogram(
     # Convert to dataset
     ds = df_stats.to_xarray()
 
+    # Convert Categorical coordinates to float if possible
+    if np.issubdtype(x_coords.dtype, np.number):
+        ds[f"{x}"] = ds[f"{x}"].astype(float)
+    if np.issubdtype(y_coords.dtype, np.number):
+        ds[f"{y}"] = ds[f"{y}"].astype(float)
+
     # Transpose arrays
     ds = ds.transpose(y, x)
     return ds
```
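For illustration, the two log-spacing helpers side by side (values rounded). log_arange now nudges the stop upward by half a step so the endpoint survives floating-point shortfall, while the new log_linspace fixes the number of points instead of the step:

```python
from disdrodb.utils.dataframe import log_arange, log_linspace

# Fixed step of 0.5 in log10 space: 10**[-1, -0.5, 0, 0.5, 1]
log_arange(0.1, 10, log_step=0.5)   # ~ [0.1, 0.316, 1.0, 3.162, 10.0]

# Same endpoints, parameterized by the number of points instead
log_linspace(0.1, 10, n_bins=5)     # ~ [0.1, 0.316, 1.0, 3.162, 10.0]
```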
disdrodb/utils/decorators.py
CHANGED

```diff
@@ -19,10 +19,34 @@
 """DISDRODB decorators."""
 import functools
 import importlib
+import uuid
 
 import dask
 
 
+def create_dask_task_name(function_name: str, name=None) -> str | None:
+    """
+    Create a custom dask task name.
+
+    Parameters
+    ----------
+    function_name : str
+        Name of the function being delayed.
+    name : str, optional
+        Custom name for the task (e.g., filepath or ID).
+        If None, returns None so that Dask generates is own default name.
+
+    Returns
+    -------
+    str | None
+        Custom dask task name string if `name` is given,
+        otherwise None (use Dask's default naming).
+    """
+    if name is None:
+        return None
+    return f"{function_name}.{name}-{uuid.uuid4()}"
+
+
 def delayed_if_parallel(function):
     """Decorator to make the function delayed if its ``parallel`` argument is ``True``."""
 
@@ -34,6 +58,13 @@ def delayed_if_parallel(function):
         if parallel:
             # Enforce verbose to be False
             kwargs["verbose"] = False
+            # Define custom dask task name
+            if "logs_filename" in kwargs:
+                kwargs["dask_key_name"] = create_dask_task_name(
+                    function_name=function.__name__,
+                    name=kwargs["logs_filename"],
+                )
+
             # Define the delayed task
             result = dask.delayed(function)(*args, **kwargs)
         else:
```
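A sketch of the task-naming behavior; the function and log file names below are hypothetical. Inside delayed_if_parallel the key is attached by passing dask_key_name to the delayed call whenever a logs_filename kwarg is present, which makes entries in FAILED_DASK_TASKS.log traceable to a specific file:

```python
from disdrodb.utils.decorators import create_dask_task_name

# With a name: '<function>.<name>-<uuid4>', unique per task
key = create_dask_task_name("generate_l0a", name="station_10.log")
# e.g. 'generate_l0a.station_10.log-2f3a9c1e-...'

# Without a name: None, so Dask falls back to its default key
assert create_dask_task_name("generate_l0a") is None
```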
disdrodb/utils/directories.py
CHANGED

```diff
@@ -98,18 +98,29 @@ def _recursive_glob(dir_path, glob_pattern):
     return [str(path) for path in dir_path.rglob(glob_pattern)]
 
 
-def _list_paths(dir_path, glob_pattern, recursive=False):
+def _is_hidden(path):
+    """Return True if any component of path is hidden."""
+    return any(part.startswith(".") for part in path.split(os.sep))
+
+
+def _list_paths(dir_path, glob_pattern, recursive=False, skip_hidden=True):
     """Return a list of filepaths and directory paths based on a single glob pattern."""
     # If glob pattern has separators, disable recursive option
     if "/" in glob_pattern and "**" not in glob_pattern:
         recursive = False
     # Search paths
     if not recursive:
-        return glob.glob(os.path.join(dir_path, glob_pattern))
-    return _recursive_glob(dir_path, glob_pattern)
+        matches = glob.glob(os.path.join(dir_path, glob_pattern))
+    else:
+        matches = _recursive_glob(dir_path, glob_pattern)
 
+    # Filter out anything with a hidden component
+    if skip_hidden:
+        matches = [p for p in matches if not _is_hidden(os.path.relpath(p, dir_path))]
+    return matches
 
-def list_paths(dir_path, glob_pattern, recursive=False):
+
+def list_paths(dir_path, glob_pattern, recursive=False, skip_hidden=True):
     """Return a list of filepaths and directory paths.
 
     This function accept also a list of glob patterns !
@@ -119,35 +130,41 @@ def list_paths(dir_path, glob_pattern, recursive=False):
     # Search path for specified glob patterns
     paths = flatten_list(
         [
-            _list_paths(dir_path=dir_path, glob_pattern=glob_pattern, recursive=recursive)
+            _list_paths(dir_path=dir_path, glob_pattern=glob_pattern, recursive=recursive, skip_hidden=skip_hidden)
             for glob_pattern in glob_patterns
         ],
     )
     return paths
 
 
-def list_files(dir_path, glob_pattern, recursive=False):
+def list_files(dir_path, glob_pattern="*", recursive=False, skip_hidden=True, return_paths=True):
     """Return a list of filepaths (exclude directory paths)."""
-    paths = list_paths(dir_path, glob_pattern, recursive=recursive)
+    paths = list_paths(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden)
     filepaths = [f for f in paths if os.path.isfile(f)]
+    # If return_paths is False, return only files names
+    if not return_paths:
+        filepaths = [os.path.basename(f) for f in filepaths]
     return filepaths
 
 
-def list_directories(dir_path, glob_pattern, recursive=False):
+def list_directories(dir_path, glob_pattern="*", recursive=False, skip_hidden=True, return_paths=True):
     """Return a list of directory paths (exclude file paths)."""
-    paths = list_paths(dir_path, glob_pattern, recursive=recursive)
+    paths = list_paths(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden)
     dir_paths = [f for f in paths if os.path.isdir(f)]
+    # If return_paths is False, return only directory names
+    if not return_paths:
+        dir_paths = [os.path.basename(f) for f in dir_paths]
     return dir_paths
 
 
-def count_files(dir_path, glob_pattern, recursive=False):
+def count_files(dir_path, glob_pattern="*", recursive=False, skip_hidden=True):
     """Return the number of files (exclude directories)."""
-    return len(list_files(dir_path, glob_pattern, recursive=recursive))
+    return len(list_files(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden))
 
 
-def count_directories(dir_path, glob_pattern, recursive=False):
+def count_directories(dir_path, glob_pattern="*", recursive=False, skip_hidden=True):
     """Return the number of files (exclude directories)."""
-    return len(list_directories(dir_path, glob_pattern, recursive=recursive))
+    return len(list_directories(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden))
 
 
 def check_directory_exists(dir_path):
@@ -177,7 +194,7 @@ def create_required_directory(dir_path, dir_name, exist_ok=True):
     create_directory(path=new_dir_path, exist_ok=exist_ok)
 
 
-def is_empty_directory(path):
+def is_empty_directory(path, skip_hidden=True):
     """Check if a directory path is empty.
 
     Return ``False`` if path is a file or non-empty directory.
@@ -187,8 +204,11 @@ def is_empty_directory(path):
         raise OSError(f"{path} does not exist.")
     if not os.path.isdir(path):
         return False
-
     paths = os.listdir(path)
+
+    # If skip_hidden is True, filter out hidden files/directories
+    if skip_hidden:
+        paths = [f for f in paths if not f.startswith(".")]
     return len(paths) == 0
```
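A usage sketch under a hypothetical archive path. glob_pattern now defaults to "*", hidden entries (any path component starting with ".") are excluded unless skip_hidden=False, and return_paths=False returns basenames instead of full paths:

```python
from disdrodb.utils.directories import count_files, list_files

# Recursively list visible files below the (hypothetical) archive root
filepaths = list_files("/data/DISDRODB", recursive=True)

# Same search, but return file names only
filenames = list_files("/data/DISDRODB", recursive=True, return_paths=False)

# Count *.nc files, still skipping hidden directories such as .git
n_nc = count_files("/data/DISDRODB", glob_pattern="*.nc", recursive=True)
```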
disdrodb/utils/encoding.py
CHANGED

```diff
@@ -17,42 +17,59 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 # -----------------------------------------------------------------------------.
 """DISDRODB netCDF4 encoding utilities."""
+import os
+
 import xarray as xr
 
+from disdrodb.utils.yaml import read_yaml
+
 EPOCH = "seconds since 1970-01-01 00:00:00"
 
 
-def set_encodings(ds: xr.Dataset, encoding_dict: dict) -> xr.Dataset:
+def get_encodings_dict():
+    """Get encoding dictionary for DISDRODB product variables and coordinates."""
+    import disdrodb
+
+    configs_path = os.path.join(disdrodb.__root_path__, "disdrodb", "etc", "configs")
+    encodings_dict = read_yaml(os.path.join(configs_path, "encodings.yaml"))
+    return encodings_dict
+
+
+def set_encodings(ds: xr.Dataset, encodings_dict: dict) -> xr.Dataset:
     """Apply the encodings to the xarray Dataset.
 
     Parameters
     ----------
     ds : xarray.Dataset
         Input xarray dataset.
-    encoding_dict : dict
-        Dictionary with
+    encodings_dict : dict
+        Dictionary with encodings specifications.
 
     Returns
     -------
     xarray.Dataset
         Output xarray dataset.
     """
+    # TODO: CHANGE CHUNKSIZES SPECIFICATION USING {<DIM>: <CHUNKSIZE>} INSTEAD OF LIST
+    # --> Then unwrap to list of chunksizes here
+
     # Subset encoding dictionary
-    # - Here below encoding_dict contains only keys (variables) within the dataset
-    encoding_dict = {var: encoding_dict[var] for var in ds.data_vars if var in encoding_dict}
+    # - Here below encodings_dict contains only keys (variables) within the dataset
+    encodings_dict = {var: encodings_dict[var] for var in ds.data_vars if var in encodings_dict}
 
     # Ensure chunksize smaller than the array shape
-    encoding_dict = sanitize_encodings_dict(encoding_dict, ds)
+    encodings_dict = sanitize_encodings_dict(encodings_dict, ds)
 
     # Rechunk variables for fast writing !
     # - This pop the chunksize argument from the encoding dict !
-    ds = rechunk_dataset(ds, encoding_dict)
+    ds = rechunk_dataset(ds, encodings_dict)
 
     # Set time encoding
-    ds["time"].encoding.update(get_time_encoding())
+    if "time" in ds:
+        ds["time"].encoding.update(get_time_encoding())
 
     # Set the variable encodings
-    for var, encoding in encoding_dict.items():
+    for var, encoding in encodings_dict.items():
         ds[var].encoding.update(encoding)
 
     # Ensure no deprecated "missing_value" attribute
@@ -63,12 +80,12 @@ def set_encodings(ds: xr.Dataset, encoding_dict: dict) -> xr.Dataset:
     return ds
 
 
-def sanitize_encodings_dict(encoding_dict: dict, ds: xr.Dataset) -> dict:
+def sanitize_encodings_dict(encodings_dict: dict, ds: xr.Dataset) -> dict:
     """Ensure chunk size to be smaller than the array shape.
 
     Parameters
     ----------
-    encoding_dict : dict
+    encodings_dict : dict
         Dictionary containing the variable encodings.
     ds : xarray.Dataset
         Input dataset.
@@ -79,23 +96,23 @@ def sanitize_encodings_dict(encoding_dict: dict, ds: xr.Dataset) -> dict:
         Encoding dictionary.
     """
     for var in ds.data_vars:
-        if var in encoding_dict:
+        if var in encodings_dict:
             shape = ds[var].shape
-            chunks = encoding_dict[var].get("chunksizes", None)
+            chunks = encodings_dict[var].get("chunksizes", None)
             if chunks is not None:
                 chunks = [shape[i] if chunks[i] > shape[i] else chunks[i] for i in range(len(chunks))]
-                encoding_dict[var]["chunksizes"] = chunks
-    return encoding_dict
+                encodings_dict[var]["chunksizes"] = chunks
+    return encodings_dict
 
 
-def rechunk_dataset(ds: xr.Dataset, encoding_dict: dict) -> xr.Dataset:
+def rechunk_dataset(ds: xr.Dataset, encodings_dict: dict) -> xr.Dataset:
     """Coerce the dataset arrays to have the chunk size specified in the encoding dictionary.
 
     Parameters
     ----------
     ds : xarray.Dataset
         Input xarray dataset
-    encoding_dict : dict
+    encodings_dict : dict
         Dictionary containing the encoding to write the xarray dataset as a netCDF.
 
     Returns
@@ -104,12 +121,13 @@ def rechunk_dataset(ds: xr.Dataset, encoding_dict: dict) -> xr.Dataset:
         Output xarray dataset
     """
     for var in ds.data_vars:
-        if var in encoding_dict:
-            chunks = encoding_dict[var].pop("chunksizes", None)
+        if var in encodings_dict:
+            chunks = encodings_dict[var].get("chunksizes", None)  # .pop("chunksizes", None)
             if chunks is not None:
                 dims = list(ds[var].dims)
                 chunks_dict = dict(zip(dims, chunks))
                 ds[var] = ds[var].chunk(chunks_dict)
+                ds[var].encoding["chunksizes"] = chunks
     return ds
```
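A sketch of the new encoding flow, assuming the variable name used below has an entry in etc/configs/encodings.yaml; only variables present in both the dataset and the YAML receive an encoding:

```python
import numpy as np
import xarray as xr

from disdrodb.utils.encoding import get_encodings_dict, set_encodings

# Load the packaged encodings (read from disdrodb/etc/configs/encodings.yaml)
encodings_dict = get_encodings_dict()

# Hypothetical dataset; "raw_drop_number" is assumed to be a configured variable
ds = xr.Dataset(
    {"raw_drop_number": (("time", "diameter_bin_center"), np.zeros((10, 32)))},
)

# Subsets the encodings, sanitizes chunk sizes, rechunks and attaches encodings
ds = set_encodings(ds, encodings_dict)
```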