disdrodb 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. disdrodb/__init__.py +68 -34
  2. disdrodb/_config.py +5 -4
  3. disdrodb/_version.py +16 -3
  4. disdrodb/accessor/__init__.py +20 -0
  5. disdrodb/accessor/methods.py +125 -0
  6. disdrodb/api/checks.py +177 -24
  7. disdrodb/api/configs.py +3 -3
  8. disdrodb/api/info.py +13 -13
  9. disdrodb/api/io.py +281 -22
  10. disdrodb/api/path.py +184 -195
  11. disdrodb/api/search.py +18 -9
  12. disdrodb/cli/disdrodb_create_summary.py +103 -0
  13. disdrodb/cli/disdrodb_create_summary_station.py +91 -0
  14. disdrodb/cli/disdrodb_run_l0.py +1 -1
  15. disdrodb/cli/disdrodb_run_l0_station.py +1 -1
  16. disdrodb/cli/disdrodb_run_l0a_station.py +1 -1
  17. disdrodb/cli/disdrodb_run_l0b.py +1 -1
  18. disdrodb/cli/disdrodb_run_l0b_station.py +3 -3
  19. disdrodb/cli/disdrodb_run_l0c.py +1 -1
  20. disdrodb/cli/disdrodb_run_l0c_station.py +3 -3
  21. disdrodb/cli/disdrodb_run_l1_station.py +2 -2
  22. disdrodb/cli/disdrodb_run_l2e_station.py +2 -2
  23. disdrodb/cli/disdrodb_run_l2m_station.py +2 -2
  24. disdrodb/configs.py +149 -4
  25. disdrodb/constants.py +61 -0
  26. disdrodb/data_transfer/download_data.py +127 -11
  27. disdrodb/etc/configs/attributes.yaml +339 -0
  28. disdrodb/etc/configs/encodings.yaml +473 -0
  29. disdrodb/etc/products/L1/global.yaml +13 -0
  30. disdrodb/etc/products/L2E/10MIN.yaml +12 -0
  31. disdrodb/etc/products/L2E/1MIN.yaml +1 -0
  32. disdrodb/etc/products/L2E/global.yaml +22 -0
  33. disdrodb/etc/products/L2M/10MIN.yaml +12 -0
  34. disdrodb/etc/products/L2M/GAMMA_ML.yaml +8 -0
  35. disdrodb/etc/products/L2M/NGAMMA_GS_LOG_ND_MAE.yaml +6 -0
  36. disdrodb/etc/products/L2M/NGAMMA_GS_ND_MAE.yaml +6 -0
  37. disdrodb/etc/products/L2M/NGAMMA_GS_Z_MAE.yaml +6 -0
  38. disdrodb/etc/products/L2M/global.yaml +26 -0
  39. disdrodb/issue/writer.py +2 -0
  40. disdrodb/l0/__init__.py +13 -0
  41. disdrodb/l0/configs/LPM/l0b_cf_attrs.yml +4 -4
  42. disdrodb/l0/configs/PARSIVEL/l0b_cf_attrs.yml +1 -1
  43. disdrodb/l0/configs/PARSIVEL/l0b_encodings.yml +3 -3
  44. disdrodb/l0/configs/PARSIVEL/raw_data_format.yml +1 -1
  45. disdrodb/l0/configs/PARSIVEL2/l0b_cf_attrs.yml +5 -5
  46. disdrodb/l0/configs/PARSIVEL2/l0b_encodings.yml +3 -3
  47. disdrodb/l0/configs/PARSIVEL2/raw_data_format.yml +1 -1
  48. disdrodb/l0/configs/PWS100/l0b_cf_attrs.yml +4 -4
  49. disdrodb/l0/configs/PWS100/raw_data_format.yml +1 -1
  50. disdrodb/l0/l0a_processing.py +37 -32
  51. disdrodb/l0/l0b_nc_processing.py +118 -8
  52. disdrodb/l0/l0b_processing.py +30 -65
  53. disdrodb/l0/l0c_processing.py +369 -259
  54. disdrodb/l0/readers/LPM/ARM/ARM_LPM.py +7 -0
  55. disdrodb/l0/readers/LPM/NETHERLANDS/DELFT_LPM_NC.py +66 -0
  56. disdrodb/l0/readers/LPM/SLOVENIA/{CRNI_VRH.py → UL.py} +3 -0
  57. disdrodb/l0/readers/LPM/SWITZERLAND/INNERERIZ_LPM.py +195 -0
  58. disdrodb/l0/readers/PARSIVEL/GPM/PIERS.py +0 -2
  59. disdrodb/l0/readers/PARSIVEL/JAPAN/JMA.py +4 -1
  60. disdrodb/l0/readers/PARSIVEL/NCAR/PECAN_MOBILE.py +1 -1
  61. disdrodb/l0/readers/PARSIVEL/NCAR/VORTEX2_2009.py +1 -1
  62. disdrodb/l0/readers/PARSIVEL2/ARM/ARM_PARSIVEL2.py +4 -0
  63. disdrodb/l0/readers/PARSIVEL2/BELGIUM/ILVO.py +168 -0
  64. disdrodb/l0/readers/PARSIVEL2/CANADA/UQAM_NC.py +69 -0
  65. disdrodb/l0/readers/PARSIVEL2/DENMARK/DTU.py +165 -0
  66. disdrodb/l0/readers/PARSIVEL2/FINLAND/FMI_PARSIVEL2.py +69 -0
  67. disdrodb/l0/readers/PARSIVEL2/FRANCE/ENPC_PARSIVEL2.py +255 -134
  68. disdrodb/l0/readers/PARSIVEL2/FRANCE/OSUG.py +525 -0
  69. disdrodb/l0/readers/PARSIVEL2/FRANCE/SIRTA_PARSIVEL2.py +1 -1
  70. disdrodb/l0/readers/PARSIVEL2/GPM/GCPEX.py +9 -7
  71. disdrodb/l0/readers/PARSIVEL2/KIT/BURKINA_FASO.py +1 -1
  72. disdrodb/l0/readers/PARSIVEL2/KIT/TEAMX.py +123 -0
  73. disdrodb/l0/readers/PARSIVEL2/{NETHERLANDS/DELFT.py → MPI/BCO_PARSIVEL2.py} +41 -71
  74. disdrodb/l0/readers/PARSIVEL2/MPI/BOWTIE.py +220 -0
  75. disdrodb/l0/readers/PARSIVEL2/NASA/APU.py +120 -0
  76. disdrodb/l0/readers/PARSIVEL2/NASA/LPVEX.py +109 -0
  77. disdrodb/l0/readers/PARSIVEL2/NCAR/FARM_PARSIVEL2.py +1 -0
  78. disdrodb/l0/readers/PARSIVEL2/NCAR/PECAN_FP3.py +1 -1
  79. disdrodb/l0/readers/PARSIVEL2/NCAR/PERILS_MIPS.py +126 -0
  80. disdrodb/l0/readers/PARSIVEL2/NCAR/PERILS_PIPS.py +165 -0
  81. disdrodb/l0/readers/PARSIVEL2/NCAR/VORTEX_SE_2016_P2.py +1 -1
  82. disdrodb/l0/readers/PARSIVEL2/NCAR/VORTEX_SE_2016_PIPS.py +20 -12
  83. disdrodb/l0/readers/PARSIVEL2/NETHERLANDS/DELFT_NC.py +5 -0
  84. disdrodb/l0/readers/PARSIVEL2/SPAIN/CENER.py +144 -0
  85. disdrodb/l0/readers/PARSIVEL2/SPAIN/CR1000DL.py +201 -0
  86. disdrodb/l0/readers/PARSIVEL2/SPAIN/LIAISE.py +137 -0
  87. disdrodb/l0/readers/PARSIVEL2/USA/C3WE.py +146 -0
  88. disdrodb/l0/readers/PWS100/FRANCE/ENPC_PWS100.py +105 -99
  89. disdrodb/l0/readers/PWS100/FRANCE/ENPC_PWS100_SIRTA.py +151 -0
  90. disdrodb/l1/__init__.py +5 -0
  91. disdrodb/l1/fall_velocity.py +46 -0
  92. disdrodb/l1/filters.py +34 -20
  93. disdrodb/l1/processing.py +46 -45
  94. disdrodb/l1/resampling.py +77 -66
  95. disdrodb/l1_env/routines.py +18 -3
  96. disdrodb/l2/__init__.py +7 -0
  97. disdrodb/l2/empirical_dsd.py +58 -10
  98. disdrodb/l2/processing.py +268 -117
  99. disdrodb/metadata/checks.py +132 -125
  100. disdrodb/metadata/standards.py +3 -1
  101. disdrodb/psd/fitting.py +631 -345
  102. disdrodb/psd/models.py +9 -6
  103. disdrodb/routines/__init__.py +54 -0
  104. disdrodb/{l0/routines.py → routines/l0.py} +316 -355
  105. disdrodb/{l1/routines.py → routines/l1.py} +76 -116
  106. disdrodb/routines/l2.py +1019 -0
  107. disdrodb/{routines.py → routines/wrappers.py} +98 -10
  108. disdrodb/scattering/__init__.py +16 -4
  109. disdrodb/scattering/axis_ratio.py +61 -37
  110. disdrodb/scattering/permittivity.py +504 -0
  111. disdrodb/scattering/routines.py +746 -184
  112. disdrodb/summary/__init__.py +17 -0
  113. disdrodb/summary/routines.py +4196 -0
  114. disdrodb/utils/archiving.py +434 -0
  115. disdrodb/utils/attrs.py +68 -125
  116. disdrodb/utils/cli.py +5 -5
  117. disdrodb/utils/compression.py +30 -1
  118. disdrodb/utils/dask.py +121 -9
  119. disdrodb/utils/dataframe.py +61 -7
  120. disdrodb/utils/decorators.py +31 -0
  121. disdrodb/utils/directories.py +35 -15
  122. disdrodb/utils/encoding.py +37 -19
  123. disdrodb/{l2 → utils}/event.py +15 -173
  124. disdrodb/utils/logger.py +14 -7
  125. disdrodb/utils/manipulations.py +81 -0
  126. disdrodb/utils/routines.py +166 -0
  127. disdrodb/utils/subsetting.py +214 -0
  128. disdrodb/utils/time.py +35 -177
  129. disdrodb/utils/writer.py +20 -7
  130. disdrodb/utils/xarray.py +5 -4
  131. disdrodb/viz/__init__.py +13 -0
  132. disdrodb/viz/plots.py +398 -0
  133. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/METADATA +4 -3
  134. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/RECORD +139 -98
  135. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/entry_points.txt +2 -0
  136. disdrodb/l1/encoding_attrs.py +0 -642
  137. disdrodb/l2/processing_options.py +0 -213
  138. disdrodb/l2/routines.py +0 -868
  139. /disdrodb/l0/readers/PARSIVEL/SLOVENIA/{UL_FGG.py → UL.py} +0 -0
  140. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/WHEEL +0 -0
  141. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/licenses/LICENSE +0 -0
  142. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/top_level.txt +0 -0
disdrodb/utils/cli.py CHANGED
@@ -21,7 +21,7 @@
  import click


- def _execute_cmd(cmd, raise_error=False):
+ def execute_cmd(cmd, raise_error=False):
  """Execute command in the terminal, streaming output in python console."""
  from subprocess import PIPE, CalledProcessError, Popen

@@ -34,7 +34,7 @@ def _execute_cmd(cmd, raise_error=False):
  raise CalledProcessError(p.returncode, p.args)


- def _parse_empty_string_and_none(args):
+ def parse_empty_string_and_none(args):
  """Utility to parse argument passed from the command line.

  If ``args = ''``, returns None.
@@ -58,7 +58,7 @@ def parse_arg_to_list(args):
  If ``args = 'variable1 variable2'`` returns ``[variable1, variable2]``.
  """
  # If '' or 'None' --> Set to None
- args = _parse_empty_string_and_none(args)
+ args = parse_empty_string_and_none(args)
  # - If multiple arguments, split by space
  if isinstance(args, str):
  # - Split by space
@@ -75,7 +75,7 @@ def parse_archive_dir(archive_dir: str):
  If ``archive_dir = ''`` returns ``None``.
  """
  # If '', set to 'None'
- return _parse_empty_string_and_none(archive_dir)
+ return parse_empty_string_and_none(archive_dir)


  def click_station_arguments(function: object):
@@ -86,7 +86,7 @@ def click_station_arguments(function: object):
  function : object
  Function.
  """
- function = click.argument("station_name", metavar="<station>")(function)
+ function = click.argument("station_name", metavar="<STATION_NAME>")(function)
  function = click.argument("campaign_name", metavar="<CAMPAIGN_NAME>")(function)
  function = click.argument("data_source", metavar="<DATA_SOURCE>")(function)
  return function
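As a quick illustration (values inferred from the docstrings above, not verified against the released wheel), the renamed helpers can now be imported directly:

from disdrodb.utils.cli import parse_arg_to_list, parse_empty_string_and_none

parse_empty_string_and_none("")               # -> None
parse_empty_string_and_none("None")           # -> None
parse_arg_to_list("variable1 variable2")      # -> ["variable1", "variable2"]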
disdrodb/utils/compression.py CHANGED
@@ -22,6 +22,7 @@ import bz2
  import gzip
  import os
  import shutil
+ import subprocess
  import tempfile
  import zipfile
  from typing import Optional
@@ -53,6 +54,34 @@ def unzip_file(filepath: str, dest_path: str) -> None:
  zip_ref.extractall(dest_path)


+ def unzip_file_on_terminal(filepath: str, dest_path: str) -> str:
+ """Unzip a file into a directory using the terminal command.
+
+ Parameters
+ ----------
+ filepath : str
+ Path of the file to unzip.
+ dest_path : str
+ Path of the destination directory.
+ """
+ os.makedirs(dest_path, exist_ok=True)
+
+ if os.name == "nt":
+ # Windows: use PowerShell Expand-Archive
+ cmd = [
+ "powershell.exe",
+ "-NoProfile",
+ "-NonInteractive",
+ "-Command",
+ f"Expand-Archive -LiteralPath '{filepath}' -DestinationPath '{dest_path}' -Force",
+ ]
+ else:
+ # macOS/Linux: use unzip
+ cmd = ["unzip", "-q", filepath, "-d", dest_path]
+
+ subprocess.run(cmd, check=True)
+
+
  def _zip_dir(dir_path: str) -> str:
  """Zip a directory into a file located in the same directory.

@@ -157,7 +186,7 @@ def compress_station_files(
  raise ValueError(f"Station data directory {station_dir} does not exist.")

  # Get list of files inside the station directory (in all nested directories)
- filepaths = list_files(station_dir, glob_pattern="*", recursive=True)
+ filepaths = list_files(station_dir, recursive=True)
  for filepath in filepaths:
  _ = _compress_file(filepath, method, skip=skip)
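A minimal usage sketch of the new helper (the archive path is hypothetical): on Windows it shells out to PowerShell's Expand-Archive, on macOS/Linux to unzip -q, creating the destination directory first:

from disdrodb.utils.compression import unzip_file_on_terminal

# Extract /tmp/archive.zip into /tmp/data using the system unzip tool
unzip_file_on_terminal("/tmp/archive.zip", "/tmp/data")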
disdrodb/utils/dask.py CHANGED
@@ -16,31 +16,82 @@
  # You should have received a copy of the GNU General Public License
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
  # -----------------------------------------------------------------------------.
- """Utilities for Dask Distributed computations."""
+ """Utilities for Dask Distributed Computations."""
  import logging
  import os

+ import numpy as np

- def initialize_dask_cluster():
+
+ def check_parallel_validity(parallel):
+ """Check validity of parallel option given Dask settings."""
+ import dask
+
+ scheduler = dask.config.get("scheduler", None)
+ if scheduler is None:
+ return parallel
+ if scheduler in ["synchronous", "threads"]:
+ return False
+ if scheduler == "distributed":
+ from dask.distributed import default_client
+
+ client = default_client()
+ info = client.scheduler_info()
+
+ # If ThreadWorker, only 1 pid
+ pids = list(client.run(os.getpid).values())
+ if len(np.unique(pids)) == 1:
+ return False
+
+ # If ProcessWorker
+ # - Check single thread per worker to avoid locks
+ nthreads_per_process = np.array([v["nthreads"] for v in info["workers"].values()])
+ if not np.all(nthreads_per_process == 1):
+ print(
+ "To open netCDFs in parallel with dask distributed (processes=True), please set threads_per_worker=1 !",
+ )
+ return False
+
+ # Otherwise let the user choose
+ return parallel
+
+
+ def initialize_dask_cluster(minimum_memory=None):
  """Initialize Dask Cluster."""
  import dask
+ import psutil
+
+ # Silence dask warnings
+ # dask.config.set({"logging.distributed": "error"})
+ # Import dask.distributed after setting the config
  from dask.distributed import Client, LocalCluster
+ from dask.utils import parse_bytes

  # Set HDF5_USE_FILE_LOCKING to avoid going stuck with HDF
  os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
- # Retrieve the number of process to run
- available_workers = os.cpu_count() - 2 # if not set, all CPUs
+
+ # Retrieve the number of processes to run
+ available_workers = os.cpu_count() - 2 # if not set, all CPUs minus 2
  num_workers = dask.config.get("num_workers", available_workers)
- # Silence dask warnings
- dask.config.set({"logging.distributed": "error"})
- # dask.config.set({"distributed.admin.system-monitor.gil.enabled": False})
+
+ # If memory limit specified, ensure correct amount of workers
+ if minimum_memory is not None:
+ # Compute available memory (in bytes)
+ total_memory = psutil.virtual_memory().total
+ # Get minimum memory per worker (in bytes)
+ minimum_memory = parse_bytes(minimum_memory)
+ # Determine number of workers constrained by memory
+ maximum_workers_allowed = max(1, total_memory // minimum_memory)
+ # Respect both CPU and memory requirements
+ num_workers = min(maximum_workers_allowed, num_workers)
+
  # Create dask.distributed local cluster
  cluster = LocalCluster(
  n_workers=num_workers,
  threads_per_worker=1,
  processes=True,
- # memory_limit='8GB',
- # silence_logs=False,
+ memory_limit=0, # this avoid flexible dask memory management
+ silence_logs=logging.ERROR,
  )
  client = Client(cluster)
  return cluster, client
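To illustrate the new minimum_memory argument with hypothetical numbers: on a machine with 64 GB of RAM and 16 CPUs, minimum_memory="8GB" caps the cluster at 64 // 8 = 8 workers, below the CPU-based default of 16 - 2 = 14. A usage sketch:

from disdrodb.utils.dask import close_dask_cluster, initialize_dask_cluster

# Limit the number of workers so each one has roughly 8 GB of RAM available
cluster, client = initialize_dask_cluster(minimum_memory="8GB")
try:
    ...  # run DISDRODB processing here
finally:
    close_dask_cluster(cluster, client)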
@@ -60,3 +111,64 @@ def close_dask_cluster(cluster, client):
  finally:
  # Restore the original log level
  logger.setLevel(original_level)
+
+
+ def execute_tasks_safely(list_tasks, parallel: bool, logs_dir: str):
+ """
+ Execute Dask tasks and skip failed ones.
+
+ Parameters
+ ----------
+ list_tasks : list
+ List of dask delayed objects or results.
+ parallel : bool
+ Whether to execute in parallel with Dask or not.
+ logs_dir : str
+ Directory to store FAILED_TASKS.log.
+
+ Returns
+ -------
+ list_logs : list
+ List of task results. For failed tasks, adds the path
+ to FAILED_TASKS.log in place of the result.
+ """
+ from dask.distributed import get_client
+
+ # Ensure logs_dir exists
+ os.makedirs(logs_dir, exist_ok=True)
+
+ # Define file name where to log failed dask tasks
+ failed_log_path = os.path.join(logs_dir, "FAILED_DASK_TASKS.log")
+
+ if not parallel:
+ # Non-parallel mode: just return results directly
+ return list_tasks
+
+ # Ensure we have a Dask client
+ try:
+ client = get_client()
+ except ValueError:
+ raise ValueError("No Dask Distributed Client found.")
+
+ # Compute tasks (all concurrently)
+ # - Runs tasks == num_workers * threads_per_worker (which is 1 for DISDRODB)
+ # - If errors occurs in some, skip it
+ futures = client.compute(list_tasks)
+ results = client.gather(futures, errors="skip")
+
+ # Collect failed futures
+ failed_futures = [f for f in futures if f.status != "finished"] # "error"
+
+ # If no tasks failed, return results
+ if not failed_futures:
+ return results
+
+ # Otherwise define log file listing failed tasks
+ with open(failed_log_path, "w") as f:
+ for fut in failed_futures:
+ err = fut.exception()
+ f.write(f"ERROR - DASK TASK FAILURE - Task {fut.key} failed: {err}\n")
+
+ # Append to list of log filepaths (results) the dask failing log
+ results.append(failed_log_path)
+ return results
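A sketch of how execute_tasks_safely might be used with delayed tasks (process_file, filepaths and the logs directory are hypothetical placeholders): failed tasks are skipped and their errors are written to <logs_dir>/FAILED_DASK_TASKS.log, whose path is appended to the returned list of results:

import dask

from disdrodb.utils.dask import execute_tasks_safely

list_tasks = [dask.delayed(process_file)(filepath) for filepath in filepaths]
results = execute_tasks_safely(list_tasks, parallel=True, logs_dir="/tmp/DISDRODB/logs")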
disdrodb/utils/dataframe.py CHANGED
@@ -20,6 +20,8 @@
  import numpy as np
  import pandas as pd

+ from disdrodb.utils.warnings import suppress_warnings
+

  def log_arange(start, stop, log_step=0.1, base=10):
  """
@@ -47,7 +49,39 @@ def log_arange(start, stop, log_step=0.1, base=10):
  log_start = np.log(start) / np.log(base)
  log_stop = np.log(stop) / np.log(base)

- log_values = np.arange(log_start, log_stop, log_step)
+ log_values = np.arange(log_start, log_stop + log_step / 2, log_step)
+ return base**log_values
+
+
+ def log_linspace(start, stop, n_bins, base=10):
+ """
+ Return numbers spaced evenly on a log scale between start and stop.
+
+ Parameters
+ ----------
+ start : float
+ The starting value of the sequence (must be > 0).
+ stop : float
+ The end value of the sequence (must be > 0).
+ n_bins : int
+ The number of points to generate (including start and stop).
+ base : float
+ The logarithmic base (default is 10).
+
+ Returns
+ -------
+ np.ndarray
+ Array of values spaced evenly in log space.
+ """
+ if start <= 0 or stop <= 0:
+ raise ValueError("Both start and stop must be > 0 for log spacing.")
+ if n_bins < 2:
+ raise ValueError("n_bins must be >= 2 to include start and stop values.")
+
+ log_start = np.log(start) / np.log(base)
+ log_stop = np.log(stop) / np.log(base)
+
+ log_values = np.linspace(log_start, log_stop, n_bins)
  return base**log_values


@@ -100,6 +134,9 @@ def compute_1d_histogram(df, column, variables=None, bins=10, labels=None, prefi
  if len(df) == 0:
  raise ValueError("No valid data points after removing NaN values")

+ # Keep only data within bin range
+ df = df[(df[column] >= bins[0]) & (df[column] < bins[-1])]
+
  # Create binned columns with explicit handling of out-of-bounds values
  df[f"{column}_binned"] = pd.cut(df[column], bins=bins, include_lowest=True)

@@ -134,7 +171,7 @@
  (f"{prefix}std", "std"),
  (f"{prefix}min", "min"),
  (f"{prefix}max", "max"),
- (f"{prefix}mad", lambda s: np.median(np.abs(s - np.median(s)))),
+ (f"{prefix}mad", lambda s: (s - s.median()).abs().median()),
  ]
  if i == 0:
  list_stats.append(("count", "count"))
@@ -142,7 +179,8 @@
  list_stats = [("count", "count")]

  # Compute statistics
- df_stats = df_grouped[var].agg(list_stats)
+ with suppress_warnings():
+ df_stats = df_grouped[var].agg(list_stats)

  # Compute other variable statistics
  if variables_specified:
@@ -253,8 +291,18 @@ def compute_2d_histogram(
  raise ValueError("No valid data points after removing NaN values")

  # Create binned columns with explicit handling of out-of-bounds values
- df[f"{x}_binned"] = pd.cut(df[x], bins=x_bins, include_lowest=True)
- df[f"{y}_binned"] = pd.cut(df[y], bins=y_bins, include_lowest=True)
+ df[f"{x}_binned"] = pd.cut(
+ df[x],
+ bins=pd.IntervalIndex.from_breaks(x_bins, closed="right"),
+ include_lowest=True,
+ ordered=True,
+ )
+ df[f"{y}_binned"] = pd.cut(
+ df[y],
+ bins=pd.IntervalIndex.from_breaks(y_bins, closed="right"),
+ include_lowest=True,
+ ordered=True,
+ )

  # Create complete IntervalIndex for both dimensions
  x_intervals = df[f"{x}_binned"].cat.categories
@@ -318,8 +366,8 @@
  df_stats = df_stats.reindex(full_index)

  # Determine coordinates
- x_centers = x_intervals.mid
- y_centers = y_intervals.mid
+ x_centers = np.array(x_intervals.mid)
+ y_centers = np.array(y_intervals.mid)

  # Use provided labels if available
  x_coords = x_labels if x_labels is not None else x_centers
@@ -337,6 +385,12 @@
  # Convert to dataset
  ds = df_stats.to_xarray()

+ # Convert Categorical coordinates to float if possible
+ if np.issubdtype(x_coords.dtype, np.number):
+ ds[f"{x}"] = ds[f"{x}"].astype(float)
+ if np.issubdtype(y_coords.dtype, np.number):
+ ds[f"{y}"] = ds[f"{y}"].astype(float)
+

  # Transpose arrays
  ds = ds.transpose(y, x)
  return ds
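Two hand-checked examples of the log-spacing helpers (assuming they are imported from disdrodb.utils.dataframe, the module this hunk belongs to): log_arange now extends the stop bound by half a step so an endpoint lying exactly on the grid is kept, and log_linspace returns n_bins points including both ends:

from disdrodb.utils.dataframe import log_arange, log_linspace

log_arange(1, 100, log_step=1)     # ~ [1, 10, 100]   (100 is now included)
log_linspace(0.1, 1000, n_bins=5)  # ~ [0.1, 1, 10, 100, 1000]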
disdrodb/utils/decorators.py CHANGED
@@ -19,10 +19,34 @@
  """DISDRODB decorators."""
  import functools
  import importlib
+ import uuid

  import dask


+ def create_dask_task_name(function_name: str, name=None) -> str | None:
+ """
+ Create a custom dask task name.
+
+ Parameters
+ ----------
+ function_name : str
+ Name of the function being delayed.
+ name : str, optional
+ Custom name for the task (e.g., filepath or ID).
+ If None, returns None so that Dask generates is own default name.
+
+ Returns
+ -------
+ str | None
+ Custom dask task name string if `name` is given,
+ otherwise None (use Dask's default naming).
+ """
+ if name is None:
+ return None
+ return f"{function_name}.{name}-{uuid.uuid4()}"
+
+
  def delayed_if_parallel(function):
  """Decorator to make the function delayed if its ``parallel`` argument is ``True``."""

@@ -34,6 +58,13 @@ def delayed_if_parallel(function):
  if parallel:
  # Enforce verbose to be False
  kwargs["verbose"] = False
+ # Define custom dask task name
+ if "logs_filename" in kwargs:
+ kwargs["dask_key_name"] = create_dask_task_name(
+ function_name=function.__name__,
+ name=kwargs["logs_filename"],
+ )
+
  # Define the delayed task
  result = dask.delayed(function)(*args, **kwargs)
  else:
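A small sketch of the task-naming behaviour added above (the function and log-file names are made up): with a name, the key combines the function name, the identifier and a uuid4 suffix, otherwise None lets Dask fall back to its default key:

from disdrodb.utils.decorators import create_dask_task_name

create_dask_task_name("generate_l0a", name="station_10.log")
# -> e.g. 'generate_l0a.station_10.log-<uuid4>'
create_dask_task_name("generate_l0a")
# -> None (Dask picks its default task key)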
disdrodb/utils/directories.py CHANGED
@@ -98,18 +98,29 @@ def _recursive_glob(dir_path, glob_pattern):
  return [str(path) for path in dir_path.rglob(glob_pattern)]


- def _list_paths(dir_path, glob_pattern, recursive=False):
+ def _is_hidden(path):
+ """Return True if any component of path is hidden."""
+ return any(part.startswith(".") for part in path.split(os.sep))
+
+
+ def _list_paths(dir_path, glob_pattern, recursive=False, skip_hidden=True):
  """Return a list of filepaths and directory paths based on a single glob pattern."""
  # If glob pattern has separators, disable recursive option
  if "/" in glob_pattern and "**" not in glob_pattern:
  recursive = False
  # Search paths
  if not recursive:
- return glob.glob(os.path.join(dir_path, glob_pattern))
- return _recursive_glob(dir_path, glob_pattern)
+ matches = glob.glob(os.path.join(dir_path, glob_pattern))
+ else:
+ matches = _recursive_glob(dir_path, glob_pattern)

+ # Filter out anything with a hidden component
+ if skip_hidden:
+ matches = [p for p in matches if not _is_hidden(os.path.relpath(p, dir_path))]
+ return matches

- def list_paths(dir_path, glob_pattern, recursive=False):
+
+ def list_paths(dir_path, glob_pattern, recursive=False, skip_hidden=True):
  """Return a list of filepaths and directory paths.

  This function accept also a list of glob patterns !
@@ -119,35 +130,41 @@ def list_paths(dir_path, glob_pattern, recursive=False):
  # Search path for specified glob patterns
  paths = flatten_list(
  [
- _list_paths(dir_path=dir_path, glob_pattern=glob_pattern, recursive=recursive)
+ _list_paths(dir_path=dir_path, glob_pattern=glob_pattern, recursive=recursive, skip_hidden=skip_hidden)
  for glob_pattern in glob_patterns
  ],
  )
  return paths


- def list_files(dir_path, glob_pattern, recursive=False):
+ def list_files(dir_path, glob_pattern="*", recursive=False, skip_hidden=True, return_paths=True):
  """Return a list of filepaths (exclude directory paths)."""
- paths = list_paths(dir_path, glob_pattern, recursive=recursive)
+ paths = list_paths(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden)
  filepaths = [f for f in paths if os.path.isfile(f)]
+ # If return_paths is False, return only files names
+ if not return_paths:
+ filepaths = [os.path.basename(f) for f in filepaths]
  return filepaths


- def list_directories(dir_path, glob_pattern, recursive=False):
+ def list_directories(dir_path, glob_pattern="*", recursive=False, skip_hidden=True, return_paths=True):
  """Return a list of directory paths (exclude file paths)."""
- paths = list_paths(dir_path, glob_pattern, recursive=recursive)
+ paths = list_paths(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden)
  dir_paths = [f for f in paths if os.path.isdir(f)]
+ # If return_paths is False, return only directory names
+ if not return_paths:
+ dir_paths = [os.path.basename(f) for f in dir_paths]
  return dir_paths


- def count_files(dir_path, glob_pattern, recursive=False):
+ def count_files(dir_path, glob_pattern="*", recursive=False, skip_hidden=True):
  """Return the number of files (exclude directories)."""
- return len(list_files(dir_path, glob_pattern, recursive=recursive))
+ return len(list_files(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden))


- def count_directories(dir_path, glob_pattern, recursive=False):
+ def count_directories(dir_path, glob_pattern="*", recursive=False, skip_hidden=True):
  """Return the number of files (exclude directories)."""
- return len(list_directories(dir_path, glob_pattern, recursive=recursive))
+ return len(list_directories(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden))


  def check_directory_exists(dir_path):
@@ -177,7 +194,7 @@ def create_required_directory(dir_path, dir_name, exist_ok=True):
  create_directory(path=new_dir_path, exist_ok=exist_ok)


- def is_empty_directory(path):
+ def is_empty_directory(path, skip_hidden=True):
  """Check if a directory path is empty.

  Return ``False`` if path is a file or non-empty directory.
@@ -187,8 +204,11 @@ def is_empty_directory(path):
  raise OSError(f"{path} does not exist.")
  if not os.path.isdir(path):
  return False
-
  paths = os.listdir(path)
+
+ # If skip_hidden is True, filter out hidden files/directories
+ if skip_hidden:
+ paths = [f for f in paths if not f.startswith(".")]
  return len(paths) == 0
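A usage sketch of the extended listing helpers (the directory path is hypothetical): glob_pattern now defaults to "*", hidden files and directories are skipped by default, and return_paths=False returns base names instead of full paths:

from disdrodb.utils.directories import list_files

filepaths = list_files("/data/DISDRODB/station_dir", recursive=True)
filenames = list_files("/data/DISDRODB/station_dir", recursive=True, return_paths=False)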
disdrodb/utils/encoding.py CHANGED
@@ -17,42 +17,59 @@
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
  # -----------------------------------------------------------------------------.
  """DISDRODB netCDF4 encoding utilities."""
+ import os
+
  import xarray as xr

+ from disdrodb.utils.yaml import read_yaml
+
  EPOCH = "seconds since 1970-01-01 00:00:00"


- def set_encodings(ds: xr.Dataset, encoding_dict: dict) -> xr.Dataset:
+ def get_encodings_dict():
+ """Get encoding dictionary for DISDRODB product variables and coordinates."""
+ import disdrodb
+
+ configs_path = os.path.join(disdrodb.__root_path__, "disdrodb", "etc", "configs")
+ encodings_dict = read_yaml(os.path.join(configs_path, "encodings.yaml"))
+ return encodings_dict
+
+
+ def set_encodings(ds: xr.Dataset, encodings_dict: dict) -> xr.Dataset:
  """Apply the encodings to the xarray Dataset.

  Parameters
  ----------
  ds : xarray.Dataset
  Input xarray dataset.
- encoding_dict : dict
- Dictionary with encoding specifications.
+ encodings_dict : dict
+ Dictionary with encodings specifications.

  Returns
  -------
  xarray.Dataset
  Output xarray dataset.
  """
+ # TODO: CHANGE CHUNKSIZES SPECIFICATION USING {<DIM>: <CHUNKSIZE>} INSTEAD OF LIST
+ # --> Then unwrap to list of chunksizes here
+
  # Subset encoding dictionary
- # - Here below encoding_dict contains only keys (variables) within the dataset
- encoding_dict = {var: encoding_dict[var] for var in ds.data_vars if var in encoding_dict}
+ # - Here below encodings_dict contains only keys (variables) within the dataset
+ encodings_dict = {var: encodings_dict[var] for var in ds.data_vars if var in encodings_dict}

  # Ensure chunksize smaller than the array shape
- encoding_dict = sanitize_encodings_dict(encoding_dict, ds)
+ encodings_dict = sanitize_encodings_dict(encodings_dict, ds)

  # Rechunk variables for fast writing !
  # - This pop the chunksize argument from the encoding dict !
- ds = rechunk_dataset(ds, encoding_dict)
+ ds = rechunk_dataset(ds, encodings_dict)

  # Set time encoding
- ds["time"].encoding.update(get_time_encoding())
+ if "time" in ds:
+ ds["time"].encoding.update(get_time_encoding())

  # Set the variable encodings
- for var, encoding in encoding_dict.items():
+ for var, encoding in encodings_dict.items():
  ds[var].encoding.update(encoding)

  # Ensure no deprecated "missing_value" attribute
@@ -63,12 +80,12 @@ def set_encodings(ds: xr.Dataset, encoding_dict: dict) -> xr.Dataset:
  return ds


- def sanitize_encodings_dict(encoding_dict: dict, ds: xr.Dataset) -> dict:
+ def sanitize_encodings_dict(encodings_dict: dict, ds: xr.Dataset) -> dict:
  """Ensure chunk size to be smaller than the array shape.

  Parameters
  ----------
- encoding_dict : dict
+ encodings_dict : dict
  Dictionary containing the variable encodings.
  ds : xarray.Dataset
  Input dataset.
@@ -79,23 +96,23 @@ def sanitize_encodings_dict(encoding_dict: dict, ds: xr.Dataset) -> dict:
  Encoding dictionary.
  """
  for var in ds.data_vars:
- if var in encoding_dict:
+ if var in encodings_dict:
  shape = ds[var].shape
- chunks = encoding_dict[var].get("chunksizes", None)
+ chunks = encodings_dict[var].get("chunksizes", None)
  if chunks is not None:
  chunks = [shape[i] if chunks[i] > shape[i] else chunks[i] for i in range(len(chunks))]
- encoding_dict[var]["chunksizes"] = chunks
- return encoding_dict
+ encodings_dict[var]["chunksizes"] = chunks
+ return encodings_dict


- def rechunk_dataset(ds: xr.Dataset, encoding_dict: dict) -> xr.Dataset:
+ def rechunk_dataset(ds: xr.Dataset, encodings_dict: dict) -> xr.Dataset:
  """Coerce the dataset arrays to have the chunk size specified in the encoding dictionary.

  Parameters
  ----------
  ds : xarray.Dataset
  Input xarray dataset
- encoding_dict : dict
+ encodings_dict : dict
  Dictionary containing the encoding to write the xarray dataset as a netCDF.

  Returns
@@ -104,12 +121,13 @@ def rechunk_dataset(ds: xr.Dataset, encoding_dict: dict) -> xr.Dataset:
  Output xarray dataset
  """
  for var in ds.data_vars:
- if var in encoding_dict:
- chunks = encoding_dict[var].pop("chunksizes", None)
+ if var in encodings_dict:
+ chunks = encodings_dict[var].get("chunksizes", None) # .pop("chunksizes", None)
  if chunks is not None:
  dims = list(ds[var].dims)
  chunks_dict = dict(zip(dims, chunks))
  ds[var] = ds[var].chunk(chunks_dict)
+ ds[var].encoding["chunksizes"] = chunks
  return ds
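A hedged sketch of how the new YAML-based encodings could be applied before writing a product (ds stands for any DISDRODB xarray.Dataset): get_encodings_dict reads disdrodb/etc/configs/encodings.yaml and set_encodings applies the matching per-variable encodings (e.g. chunksizes) to the dataset:

from disdrodb.utils.encoding import get_encodings_dict, set_encodings

encodings_dict = get_encodings_dict()
ds = set_encodings(ds, encodings_dict=encodings_dict)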