disdrodb 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. disdrodb/__init__.py +64 -34
  2. disdrodb/_config.py +5 -4
  3. disdrodb/_version.py +16 -3
  4. disdrodb/accessor/__init__.py +20 -0
  5. disdrodb/accessor/methods.py +125 -0
  6. disdrodb/api/checks.py +139 -9
  7. disdrodb/api/configs.py +4 -2
  8. disdrodb/api/info.py +10 -10
  9. disdrodb/api/io.py +237 -18
  10. disdrodb/api/path.py +81 -75
  11. disdrodb/api/search.py +6 -6
  12. disdrodb/cli/disdrodb_create_summary_station.py +91 -0
  13. disdrodb/cli/disdrodb_run_l0.py +1 -1
  14. disdrodb/cli/disdrodb_run_l0_station.py +1 -1
  15. disdrodb/cli/disdrodb_run_l0b.py +1 -1
  16. disdrodb/cli/disdrodb_run_l0b_station.py +1 -1
  17. disdrodb/cli/disdrodb_run_l0c.py +1 -1
  18. disdrodb/cli/disdrodb_run_l0c_station.py +1 -1
  19. disdrodb/cli/disdrodb_run_l2e_station.py +1 -1
  20. disdrodb/configs.py +149 -4
  21. disdrodb/constants.py +61 -0
  22. disdrodb/data_transfer/download_data.py +5 -5
  23. disdrodb/etc/configs/attributes.yaml +339 -0
  24. disdrodb/etc/configs/encodings.yaml +473 -0
  25. disdrodb/etc/products/L1/global.yaml +13 -0
  26. disdrodb/etc/products/L2E/10MIN.yaml +12 -0
  27. disdrodb/etc/products/L2E/1MIN.yaml +1 -0
  28. disdrodb/etc/products/L2E/global.yaml +22 -0
  29. disdrodb/etc/products/L2M/10MIN.yaml +12 -0
  30. disdrodb/etc/products/L2M/GAMMA_ML.yaml +8 -0
  31. disdrodb/etc/products/L2M/NGAMMA_GS_LOG_ND_MAE.yaml +6 -0
  32. disdrodb/etc/products/L2M/NGAMMA_GS_ND_MAE.yaml +6 -0
  33. disdrodb/etc/products/L2M/NGAMMA_GS_Z_MAE.yaml +6 -0
  34. disdrodb/etc/products/L2M/global.yaml +26 -0
  35. disdrodb/l0/__init__.py +13 -0
  36. disdrodb/l0/configs/LPM/l0b_cf_attrs.yml +4 -4
  37. disdrodb/l0/configs/PARSIVEL/l0b_cf_attrs.yml +1 -1
  38. disdrodb/l0/configs/PARSIVEL/l0b_encodings.yml +3 -3
  39. disdrodb/l0/configs/PARSIVEL/raw_data_format.yml +1 -1
  40. disdrodb/l0/configs/PARSIVEL2/l0b_cf_attrs.yml +5 -5
  41. disdrodb/l0/configs/PARSIVEL2/l0b_encodings.yml +3 -3
  42. disdrodb/l0/configs/PARSIVEL2/raw_data_format.yml +1 -1
  43. disdrodb/l0/configs/PWS100/l0b_cf_attrs.yml +4 -4
  44. disdrodb/l0/configs/PWS100/raw_data_format.yml +1 -1
  45. disdrodb/l0/l0a_processing.py +30 -30
  46. disdrodb/l0/l0b_nc_processing.py +108 -2
  47. disdrodb/l0/l0b_processing.py +4 -4
  48. disdrodb/l0/l0c_processing.py +5 -13
  49. disdrodb/l0/readers/LPM/NETHERLANDS/DELFT_LPM_NC.py +66 -0
  50. disdrodb/l0/readers/LPM/SLOVENIA/{CRNI_VRH.py → UL.py} +3 -0
  51. disdrodb/l0/readers/LPM/SWITZERLAND/INNERERIZ_LPM.py +195 -0
  52. disdrodb/l0/readers/PARSIVEL/GPM/PIERS.py +0 -2
  53. disdrodb/l0/readers/PARSIVEL/JAPAN/JMA.py +4 -1
  54. disdrodb/l0/readers/PARSIVEL/NCAR/PECAN_MOBILE.py +1 -1
  55. disdrodb/l0/readers/PARSIVEL/NCAR/VORTEX2_2009.py +1 -1
  56. disdrodb/l0/readers/PARSIVEL2/BELGIUM/ILVO.py +168 -0
  57. disdrodb/l0/readers/PARSIVEL2/DENMARK/DTU.py +165 -0
  58. disdrodb/l0/readers/PARSIVEL2/FINLAND/FMI_PARSIVEL2.py +69 -0
  59. disdrodb/l0/readers/PARSIVEL2/FRANCE/ENPC_PARSIVEL2.py +255 -134
  60. disdrodb/l0/readers/PARSIVEL2/FRANCE/OSUG.py +525 -0
  61. disdrodb/l0/readers/PARSIVEL2/FRANCE/SIRTA_PARSIVEL2.py +1 -1
  62. disdrodb/l0/readers/PARSIVEL2/GPM/GCPEX.py +9 -7
  63. disdrodb/l0/readers/PARSIVEL2/KIT/BURKINA_FASO.py +1 -1
  64. disdrodb/l0/readers/PARSIVEL2/KIT/TEAMX.py +123 -0
  65. disdrodb/l0/readers/PARSIVEL2/NASA/APU.py +120 -0
  66. disdrodb/l0/readers/PARSIVEL2/NCAR/FARM_PARSIVEL2.py +1 -0
  67. disdrodb/l0/readers/PARSIVEL2/NCAR/PECAN_FP3.py +1 -1
  68. disdrodb/l0/readers/PARSIVEL2/NCAR/PERILS_MIPS.py +126 -0
  69. disdrodb/l0/readers/PARSIVEL2/NCAR/PERILS_PIPS.py +165 -0
  70. disdrodb/l0/readers/PARSIVEL2/NCAR/VORTEX_SE_2016_P2.py +1 -1
  71. disdrodb/l0/readers/PARSIVEL2/NCAR/VORTEX_SE_2016_PIPS.py +20 -12
  72. disdrodb/l0/readers/PARSIVEL2/NETHERLANDS/DELFT_NC.py +2 -0
  73. disdrodb/l0/readers/PARSIVEL2/SPAIN/CENER.py +144 -0
  74. disdrodb/l0/readers/PARSIVEL2/SPAIN/CR1000DL.py +201 -0
  75. disdrodb/l0/readers/PARSIVEL2/SPAIN/LIAISE.py +137 -0
  76. disdrodb/l0/readers/PARSIVEL2/{NETHERLANDS/DELFT.py → USA/C3WE.py} +65 -85
  77. disdrodb/l0/readers/PWS100/FRANCE/ENPC_PWS100.py +105 -99
  78. disdrodb/l0/readers/PWS100/FRANCE/ENPC_PWS100_SIRTA.py +151 -0
  79. disdrodb/l0/routines.py +105 -14
  80. disdrodb/l1/__init__.py +5 -0
  81. disdrodb/l1/filters.py +34 -20
  82. disdrodb/l1/processing.py +45 -44
  83. disdrodb/l1/resampling.py +77 -66
  84. disdrodb/l1/routines.py +35 -43
  85. disdrodb/l1_env/routines.py +18 -3
  86. disdrodb/l2/__init__.py +7 -0
  87. disdrodb/l2/empirical_dsd.py +58 -10
  88. disdrodb/l2/event.py +27 -120
  89. disdrodb/l2/processing.py +267 -116
  90. disdrodb/l2/routines.py +618 -254
  91. disdrodb/metadata/standards.py +3 -1
  92. disdrodb/psd/fitting.py +463 -144
  93. disdrodb/psd/models.py +8 -5
  94. disdrodb/routines.py +3 -3
  95. disdrodb/scattering/__init__.py +16 -4
  96. disdrodb/scattering/axis_ratio.py +56 -36
  97. disdrodb/scattering/permittivity.py +486 -0
  98. disdrodb/scattering/routines.py +701 -159
  99. disdrodb/summary/__init__.py +17 -0
  100. disdrodb/summary/routines.py +4120 -0
  101. disdrodb/utils/attrs.py +68 -125
  102. disdrodb/utils/compression.py +30 -1
  103. disdrodb/utils/dask.py +59 -8
  104. disdrodb/utils/dataframe.py +61 -7
  105. disdrodb/utils/directories.py +35 -15
  106. disdrodb/utils/encoding.py +33 -19
  107. disdrodb/utils/logger.py +13 -6
  108. disdrodb/utils/manipulations.py +71 -0
  109. disdrodb/utils/subsetting.py +214 -0
  110. disdrodb/utils/time.py +165 -19
  111. disdrodb/utils/writer.py +20 -7
  112. disdrodb/utils/xarray.py +2 -4
  113. disdrodb/viz/__init__.py +13 -0
  114. disdrodb/viz/plots.py +327 -0
  115. {disdrodb-0.1.2.dist-info → disdrodb-0.1.3.dist-info}/METADATA +3 -2
  116. {disdrodb-0.1.2.dist-info → disdrodb-0.1.3.dist-info}/RECORD +121 -88
  117. {disdrodb-0.1.2.dist-info → disdrodb-0.1.3.dist-info}/entry_points.txt +1 -0
  118. disdrodb/l1/encoding_attrs.py +0 -642
  119. disdrodb/l2/processing_options.py +0 -213
  120. /disdrodb/l0/readers/PARSIVEL/SLOVENIA/{UL_FGG.py → UL.py} +0 -0
  121. {disdrodb-0.1.2.dist-info → disdrodb-0.1.3.dist-info}/WHEEL +0 -0
  122. {disdrodb-0.1.2.dist-info → disdrodb-0.1.3.dist-info}/licenses/LICENSE +0 -0
  123. {disdrodb-0.1.2.dist-info → disdrodb-0.1.3.dist-info}/top_level.txt +0 -0
disdrodb/utils/attrs.py CHANGED
@@ -18,15 +18,26 @@
 # -----------------------------------------------------------------------------.
 """DISDRODB netCDF4 attributes utilities."""
 import datetime
+import os
 
-from disdrodb import ARCHIVE_VERSION, CONVENTIONS, SOFTWARE_VERSION
+from disdrodb.constants import ARCHIVE_VERSION, CONVENTIONS, COORDINATES, SOFTWARE_VERSION
+from disdrodb.utils.yaml import read_yaml
 
 ####---------------------------------------------------------------------.
-#### Variable attributes
+#### Variable and coordinates attributes
+
+
+def get_attrs_dict():
+    """Get attributes dictionary for DISDRODB product variables and coordinates."""
+    import disdrodb
+
+    configs_path = os.path.join(disdrodb.__root_path__, "disdrodb", "etc", "configs")
+    attrs_dict = read_yaml(os.path.join(configs_path, "attributes.yaml"))
+    return attrs_dict
 
 
 def set_attrs(ds, attrs_dict):
-    """Set attributes to the variables of the xr.Dataset."""
+    """Set attributes to the variables and coordinates of the xr.Dataset."""
     for var in attrs_dict:
         if var in ds:
             ds[var].attrs.update(attrs_dict[var])
@@ -37,104 +48,13 @@ def set_attrs(ds, attrs_dict):
 #### Coordinates attributes
 
 
-def get_coords_attrs_dict():
-    """Return dictionary with DISDRODB coordinates attributes."""
-    attrs_dict = {}
-    # Define diameter attributes
-    attrs_dict["diameter_bin_center"] = {
-        "name": "diameter_bin_center",
-        "standard_name": "diameter_bin_center",
-        "long_name": "diameter_bin_center",
-        "units": "mm",
-        "description": "Bin center drop diameter value",
-    }
-    attrs_dict["diameter_bin_width"] = {
-        "name": "diameter_bin_width",
-        "standard_name": "diameter_bin_width",
-        "long_name": "diameter_bin_width",
-        "units": "mm",
-        "description": "Drop diameter bin width",
-    }
-    attrs_dict["diameter_bin_upper"] = {
-        "name": "diameter_bin_upper",
-        "standard_name": "diameter_bin_upper",
-        "long_name": "diameter_bin_upper",
-        "units": "mm",
-        "description": "Bin upper bound drop diameter value",
-    }
-    attrs_dict["velocity_bin_lower"] = {
-        "name": "velocity_bin_lower",
-        "standard_name": "velocity_bin_lower",
-        "long_name": "velocity_bin_lower",
-        "units": "mm",
-        "description": "Bin lower bound drop diameter value",
-    }
-    # Define velocity attributes
-    attrs_dict["velocity_bin_center"] = {
-        "name": "velocity_bin_center",
-        "standard_name": "velocity_bin_center",
-        "long_name": "velocity_bin_center",
-        "units": "m/s",
-        "description": "Bin center drop fall velocity value",
-    }
-    attrs_dict["velocity_bin_width"] = {
-        "name": "velocity_bin_width",
-        "standard_name": "velocity_bin_width",
-        "long_name": "velocity_bin_width",
-        "units": "m/s",
-        "description": "Drop fall velocity bin width",
-    }
-    attrs_dict["velocity_bin_upper"] = {
-        "name": "velocity_bin_upper",
-        "standard_name": "velocity_bin_upper",
-        "long_name": "velocity_bin_upper",
-        "units": "m/s",
-        "description": "Bin upper bound drop fall velocity value",
-    }
-    attrs_dict["velocity_bin_lower"] = {
-        "name": "velocity_bin_lower",
-        "standard_name": "velocity_bin_lower",
-        "long_name": "velocity_bin_lower",
-        "units": "m/s",
-        "description": "Bin lower bound drop fall velocity value",
-    }
-    # Define geolocation attributes
-    attrs_dict["latitude"] = {
-        "name": "latitude",
-        "standard_name": "latitude",
-        "long_name": "Latitude",
-        "units": "degrees_north",
-    }
-    attrs_dict["longitude"] = {
-        "name": "longitude",
-        "standard_name": "longitude",
-        "long_name": "Longitude",
-        "units": "degrees_east",
-    }
-    attrs_dict["altitude"] = {
-        "name": "altitude",
-        "standard_name": "altitude",
-        "long_name": "Altitude",
-        "units": "m",
-        "description": "Elevation above sea level",
-    }
-    # Define time attributes
-    attrs_dict["time"] = {
-        "name": "time",
-        "standard_name": "time",
-        "long_name": "time",
-        "description": "UTC Time",
-    }
-
-    return attrs_dict
-
-
 def set_coordinate_attributes(ds):
     """Set coordinates attributes."""
     # Get attributes dictionary
-    attrs_dict = get_coords_attrs_dict()
+    attrs_dict = get_attrs_dict()
+    coords_dict = {coord: attrs_dict[coord] for coord in COORDINATES if coord in attrs_dict}
     # Set attributes
-    ds = set_attrs(ds, attrs_dict)
+    ds = set_attrs(ds, coords_dict)
     return ds
 
 
@@ -142,14 +62,14 @@ def set_coordinate_attributes(ds):
 #### DISDRODB Global Attributes
 
 
-def set_disdrodb_attrs(ds, product: str):
+def update_disdrodb_attrs(ds, product: str):
     """Add DISDRODB processing information to the netCDF global attributes.
 
     It assumes stations metadata are already added the dataset.
 
     Parameters
     ----------
-    ds : xarray.Dataset
+    ds : xarray dataset.
         Dataset
     product: str
         DISDRODB product.
@@ -159,30 +79,53 @@
     xarray dataset
         Dataset.
     """
-    # Add dataset conventions
-    ds.attrs["Conventions"] = CONVENTIONS
-
-    # Add featureType
-    if "platform_type" in ds.attrs:
-        platform_type = ds.attrs["platform_type"]
-        if platform_type == "fixed":
-            ds.attrs["featureType"] = "timeSeries"
-        else:
-            ds.attrs["featureType"] = "trajectory"
+    attrs = ds.attrs.copy()
+
+    # ----------------------------------------------
+    # Drop metadata not relevant for DISDRODB products
+    keys_to_drop = [
+        "disdrodb_reader",
+        "disdrodb_data_url",
+        "raw_data_glob_pattern",
+        "raw_data_format",
+    ]
+    for key in keys_to_drop:
+        _ = attrs.pop(key, None)
+
+    # ----------------------------------------------
+    # Add time_coverage_start and time_coverage_end
+    if "time" in ds.dims:
+        attrs["time_coverage_start"] = str(ds["time"].data[0])
+        attrs["time_coverage_end"] = str(ds["time"].data[-1])
 
-    # Update DISDRODDB attributes
-    ds = update_disdrodb_attrs(ds=ds, product=product)
+    # ----------------------------------------------
+    # Set DISDRODDB attributes
+    # - Add DISDRODB processing info
+    now = datetime.datetime.utcnow()
+    current_time = now.strftime("%Y-%m-%d %H:%M:%S")
+    attrs["disdrodb_processing_date"] = current_time
+    # - Add DISDRODB product and version
+    attrs["disdrodb_product_version"] = ARCHIVE_VERSION
+    attrs["disdrodb_software_version"] = SOFTWARE_VERSION
+    attrs["disdrodb_product"] = product
+
+    # ----------------------------------------------
+    # Finalize attributes dictionary
+    # - Sort attributes alphabetically
+    attrs = dict(sorted(attrs.items()))
+    # - Set attributes
+    ds.attrs = attrs
     return ds
 
 
-def update_disdrodb_attrs(ds, product: str):
+def set_disdrodb_attrs(ds, product: str):
     """Add DISDRODB processing information to the netCDF global attributes.
 
     It assumes stations metadata are already added the dataset.
 
     Parameters
     ----------
-    ds : xarray dataset.
+    ds : xarray.Dataset
         Dataset
     product: str
         DISDRODB product.
@@ -192,17 +135,17 @@ def update_disdrodb_attrs(ds, product: str):
     xarray dataset
         Dataset.
     """
-    # Add time_coverage_start and time_coverage_end
-    ds.attrs["time_coverage_start"] = str(ds["time"].data[0])
-    ds.attrs["time_coverage_end"] = str(ds["time"].data[-1])
+    # Add dataset conventions
+    ds.attrs["Conventions"] = CONVENTIONS
 
-    # DISDRODDB attributes
-    # - Add DISDRODB processing info
-    now = datetime.datetime.utcnow()
-    current_time = now.strftime("%Y-%m-%d %H:%M:%S")
-    ds.attrs["disdrodb_processing_date"] = current_time
-    # - Add DISDRODB product and version
-    ds.attrs["disdrodb_product_version"] = ARCHIVE_VERSION
-    ds.attrs["disdrodb_software_version"] = SOFTWARE_VERSION
-    ds.attrs["disdrodb_product"] = product
+    # Add featureType
+    if "platform_type" in ds.attrs:
+        platform_type = ds.attrs["platform_type"]
+        if platform_type == "fixed":
+            ds.attrs["featureType"] = "timeSeries"
+        else:
+            ds.attrs["featureType"] = "trajectory"
+
+    # Update DISDRODDB attributes
+    ds = update_disdrodb_attrs(ds=ds, product=product)
    return ds
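
Note: the hard-coded coordinate attribute dictionaries above are replaced by a lookup into the new etc/configs/attributes.yaml. A minimal usage sketch of the refactored helpers (the example dataset is hypothetical, and the expected "mm" units assume the YAML file carries over the old hard-coded values):

import xarray as xr

from disdrodb.utils.attrs import get_attrs_dict, set_attrs

# Hypothetical dataset holding one DISDRODB coordinate
ds = xr.Dataset(coords={"diameter_bin_center": [0.31, 0.44, 0.56]})

# Attributes are now read from disdrodb/etc/configs/attributes.yaml
attrs_dict = get_attrs_dict()

# set_attrs only updates variables/coordinates actually present in the dataset
ds = set_attrs(ds, attrs_dict)
print(ds["diameter_bin_center"].attrs)  # expected to include units "mm"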
disdrodb/utils/compression.py CHANGED
@@ -22,6 +22,7 @@ import bz2
 import gzip
 import os
 import shutil
+import subprocess
 import tempfile
 import zipfile
 from typing import Optional
@@ -53,6 +54,34 @@ def unzip_file(filepath: str, dest_path: str) -> None:
         zip_ref.extractall(dest_path)
 
 
+def unzip_file_on_terminal(filepath: str, dest_path: str) -> str:
+    """Unzip a file into a directory using the terminal command.
+
+    Parameters
+    ----------
+    filepath : str
+        Path of the file to unzip.
+    dest_path : str
+        Path of the destination directory.
+    """
+    os.makedirs(dest_path, exist_ok=True)
+
+    if os.name == "nt":
+        # Windows: use PowerShell Expand-Archive
+        cmd = [
+            "powershell.exe",
+            "-NoProfile",
+            "-NonInteractive",
+            "-Command",
+            f"Expand-Archive -LiteralPath '{filepath}' -DestinationPath '{dest_path}' -Force",
+        ]
+    else:
+        # macOS/Linux: use unzip
+        cmd = ["unzip", "-q", filepath, "-d", dest_path]
+
+    subprocess.run(cmd, check=True)
+
+
 def _zip_dir(dir_path: str) -> str:
     """Zip a directory into a file located in the same directory.
 
@@ -157,7 +186,7 @@ def compress_station_files(
         raise ValueError(f"Station data directory {station_dir} does not exist.")
 
     # Get list of files inside the station directory (in all nested directories)
-    filepaths = list_files(station_dir, glob_pattern="*", recursive=True)
+    filepaths = list_files(station_dir, recursive=True)
     for filepath in filepaths:
         _ = _compress_file(filepath, method, skip=skip)
 
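Usage sketch for the new unzip_file_on_terminal helper (paths are placeholders): on Windows it shells out to PowerShell's Expand-Archive; elsewhere it requires the unzip binary on the PATH, and subprocess.run(..., check=True) raises CalledProcessError if extraction fails:

from disdrodb.utils.compression import unzip_file_on_terminal

# Placeholder paths; the destination directory is created if missing
unzip_file_on_terminal("/tmp/station_archive.zip", "/tmp/station_archive")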
disdrodb/utils/dask.py CHANGED
@@ -16,31 +16,82 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 # -----------------------------------------------------------------------------.
-"""Utilities for Dask Distributed computations."""
+"""Utilities for Dask Distributed Computations."""
 import logging
 import os
 
+import numpy as np
 
-def initialize_dask_cluster():
+
+def check_parallel_validity(parallel):
+    """Check validity of parallel option given Dask settings."""
+    import dask
+
+    scheduler = dask.config.get("scheduler", None)
+    if scheduler is None:
+        return parallel
+    if scheduler in ["synchronous", "threads"]:
+        return False
+    if scheduler == "distributed":
+        from dask.distributed import default_client
+
+        client = default_client()
+        info = client.scheduler_info()
+
+        # If ThreadWorker, only 1 pid
+        pids = list(client.run(os.getpid).values())
+        if len(np.unique(pids)) == 1:
+            return False
+
+        # If ProcessWorker
+        # - Check single thread per worker to avoid locks
+        nthreads_per_process = np.array([v["nthreads"] for v in info["workers"].values()])
+        if not np.all(nthreads_per_process == 1):
+            print(
+                "To open netCDFs in parallel with dask distributed (processes=True), please set threads_per_worker=1 !",
+            )
+            return False
+
+    # Otherwise let the user choose
+    return parallel
+
+
+def initialize_dask_cluster(minimum_memory=None):
     """Initialize Dask Cluster."""
     import dask
+    import psutil
+
+    # Silence dask warnings
+    # dask.config.set({"logging.distributed": "error"})
+    # Import dask.distributed after setting the config
     from dask.distributed import Client, LocalCluster
+    from dask.utils import parse_bytes
 
     # Set HDF5_USE_FILE_LOCKING to avoid going stuck with HDF
     os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
-    # Retrieve the number of process to run
-    available_workers = os.cpu_count() - 2  # if not set, all CPUs
+
+    # Retrieve the number of processes to run
+    available_workers = os.cpu_count() - 2  # if not set, all CPUs minus 2
     num_workers = dask.config.get("num_workers", available_workers)
-    # Silence dask warnings
-    dask.config.set({"logging.distributed": "error"})
-    # dask.config.set({"distributed.admin.system-monitor.gil.enabled": False})
+
+    # If memory limit specified, ensure correct amount of workers
+    if minimum_memory is not None:
+        # Compute available memory (in bytes)
+        total_memory = psutil.virtual_memory().total
+        # Get minimum memory per worker (in bytes)
+        minimum_memory = parse_bytes(minimum_memory)
+        # Determine number of workers constrained by memory
+        maximum_workers_allowed = max(1, total_memory // minimum_memory)
+        # Respect both CPU and memory requirements
+        num_workers = min(maximum_workers_allowed, num_workers)
+
     # Create dask.distributed local cluster
     cluster = LocalCluster(
         n_workers=num_workers,
         threads_per_worker=1,
         processes=True,
         # memory_limit='8GB',
-        # silence_logs=False,
+        silence_logs=logging.ERROR,
     )
     client = Client(cluster)
     return cluster, client
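
A sketch of the new minimum_memory argument, which caps the worker count so each worker gets at least the requested RAM (the "4GB" value is illustrative; any string understood by dask.utils.parse_bytes works):

from disdrodb.utils.dask import initialize_dask_cluster

# On a 16 GB machine this caps the cluster at 4 workers, even when more
# CPUs are available: num_workers = min(16 GB // 4 GB, cpu-based default)
cluster, client = initialize_dask_cluster(minimum_memory="4GB")
try:
    pass  # submit work through the client here
finally:
    client.close()
    cluster.close()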
disdrodb/utils/dataframe.py CHANGED
@@ -20,6 +20,8 @@
 import numpy as np
 import pandas as pd
 
+from disdrodb.utils.warnings import suppress_warnings
+
 
 def log_arange(start, stop, log_step=0.1, base=10):
     """
@@ -47,7 +49,39 @@ def log_arange(start, stop, log_step=0.1, base=10):
     log_start = np.log(start) / np.log(base)
     log_stop = np.log(stop) / np.log(base)
 
-    log_values = np.arange(log_start, log_stop, log_step)
+    log_values = np.arange(log_start, log_stop + log_step / 2, log_step)
+    return base**log_values
+
+
+def log_linspace(start, stop, n_bins, base=10):
+    """
+    Return numbers spaced evenly on a log scale between start and stop.
+
+    Parameters
+    ----------
+    start : float
+        The starting value of the sequence (must be > 0).
+    stop : float
+        The end value of the sequence (must be > 0).
+    n_bins : int
+        The number of points to generate (including start and stop).
+    base : float
+        The logarithmic base (default is 10).
+
+    Returns
+    -------
+    np.ndarray
+        Array of values spaced evenly in log space.
+    """
+    if start <= 0 or stop <= 0:
+        raise ValueError("Both start and stop must be > 0 for log spacing.")
+    if n_bins < 2:
+        raise ValueError("n_bins must be >= 2 to include start and stop values.")
+
+    log_start = np.log(start) / np.log(base)
+    log_stop = np.log(stop) / np.log(base)
+
+    log_values = np.linspace(log_start, log_stop, n_bins)
     return base**log_values
 
 
@@ -100,6 +134,9 @@ def compute_1d_histogram(df, column, variables=None, bins=10, labels=None, prefi
     if len(df) == 0:
         raise ValueError("No valid data points after removing NaN values")
 
+    # Keep only data within bin range
+    df = df[(df[column] >= bins[0]) & (df[column] < bins[-1])]
+
     # Create binned columns with explicit handling of out-of-bounds values
     df[f"{column}_binned"] = pd.cut(df[column], bins=bins, include_lowest=True)
 
@@ -134,7 +171,7 @@
             (f"{prefix}std", "std"),
             (f"{prefix}min", "min"),
             (f"{prefix}max", "max"),
-            (f"{prefix}mad", lambda s: np.median(np.abs(s - np.median(s)))),
+            (f"{prefix}mad", lambda s: (s - s.median()).abs().median()),
         ]
         if i == 0:
             list_stats.append(("count", "count"))
@@ -142,7 +179,8 @@ def compute_1d_histogram(df, column, variables=None, bins=10, labels=None, prefi
             list_stats = [("count", "count")]
 
         # Compute statistics
-        df_stats = df_grouped[var].agg(list_stats)
+        with suppress_warnings():
+            df_stats = df_grouped[var].agg(list_stats)
 
         # Compute other variable statistics
         if variables_specified:
@@ -253,8 +291,18 @@ def compute_2d_histogram(
         raise ValueError("No valid data points after removing NaN values")
 
     # Create binned columns with explicit handling of out-of-bounds values
-    df[f"{x}_binned"] = pd.cut(df[x], bins=x_bins, include_lowest=True)
-    df[f"{y}_binned"] = pd.cut(df[y], bins=y_bins, include_lowest=True)
+    df[f"{x}_binned"] = pd.cut(
+        df[x],
+        bins=pd.IntervalIndex.from_breaks(x_bins, closed="right"),
+        include_lowest=True,
+        ordered=True,
+    )
+    df[f"{y}_binned"] = pd.cut(
+        df[y],
+        bins=pd.IntervalIndex.from_breaks(y_bins, closed="right"),
+        include_lowest=True,
+        ordered=True,
+    )
 
     # Create complete IntervalIndex for both dimensions
     x_intervals = df[f"{x}_binned"].cat.categories
@@ -318,8 +366,8 @@
     df_stats = df_stats.reindex(full_index)
 
     # Determine coordinates
-    x_centers = x_intervals.mid
-    y_centers = y_intervals.mid
+    x_centers = np.array(x_intervals.mid)
+    y_centers = np.array(y_intervals.mid)
 
     # Use provided labels if available
     x_coords = x_labels if x_labels is not None else x_centers
@@ -337,6 +385,12 @@
     # Convert to dataset
     ds = df_stats.to_xarray()
 
+    # Convert Categorical coordinates to float if possible
+    if np.issubdtype(x_coords.dtype, np.number):
+        ds[f"{x}"] = ds[f"{x}"].astype(float)
+    if np.issubdtype(y_coords.dtype, np.number):
+        ds[f"{y}"] = ds[f"{y}"].astype(float)
+
     # Transpose arrays
     ds = ds.transpose(y, x)
     return ds
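
The + log_step / 2 guard makes log_arange include the stop value whenever it falls on the log grid, consistent with the new log_linspace. A quick numerical check (assuming both helpers are imported from disdrodb.utils.dataframe as the diff suggests):

from disdrodb.utils.dataframe import log_arange, log_linspace

# Before this change, np.arange(-1, 1, 1) dropped the endpoint, yielding [0.1, 1.0]
print(log_arange(0.1, 10, log_step=1))   # [ 0.1  1.  10. ]

# Five points evenly spaced in log10, endpoints included
print(log_linspace(0.1, 10, n_bins=5))   # [ 0.1  0.316  1.  3.162  10. ]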
disdrodb/utils/directories.py CHANGED
@@ -98,18 +98,29 @@ def _recursive_glob(dir_path, glob_pattern):
     return [str(path) for path in dir_path.rglob(glob_pattern)]
 
 
-def _list_paths(dir_path, glob_pattern, recursive=False):
+def _is_hidden(path):
+    """Return True if any component of path is hidden."""
+    return any(part.startswith(".") for part in path.split(os.sep))
+
+
+def _list_paths(dir_path, glob_pattern, recursive=False, skip_hidden=True):
     """Return a list of filepaths and directory paths based on a single glob pattern."""
     # If glob pattern has separators, disable recursive option
     if "/" in glob_pattern and "**" not in glob_pattern:
         recursive = False
     # Search paths
     if not recursive:
-        return glob.glob(os.path.join(dir_path, glob_pattern))
-    return _recursive_glob(dir_path, glob_pattern)
+        matches = glob.glob(os.path.join(dir_path, glob_pattern))
+    else:
+        matches = _recursive_glob(dir_path, glob_pattern)
 
+    # Filter out anything with a hidden component
+    if skip_hidden:
+        matches = [p for p in matches if not _is_hidden(os.path.relpath(p, dir_path))]
+    return matches
 
-def list_paths(dir_path, glob_pattern, recursive=False):
+
+def list_paths(dir_path, glob_pattern, recursive=False, skip_hidden=True):
     """Return a list of filepaths and directory paths.
 
     This function accept also a list of glob patterns !
@@ -119,35 +130,41 @@ def list_paths(dir_path, glob_pattern, recursive=False):
     # Search path for specified glob patterns
     paths = flatten_list(
         [
-            _list_paths(dir_path=dir_path, glob_pattern=glob_pattern, recursive=recursive)
+            _list_paths(dir_path=dir_path, glob_pattern=glob_pattern, recursive=recursive, skip_hidden=skip_hidden)
             for glob_pattern in glob_patterns
         ],
     )
     return paths
 
 
-def list_files(dir_path, glob_pattern, recursive=False):
+def list_files(dir_path, glob_pattern="*", recursive=False, skip_hidden=True, return_paths=True):
     """Return a list of filepaths (exclude directory paths)."""
-    paths = list_paths(dir_path, glob_pattern, recursive=recursive)
+    paths = list_paths(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden)
     filepaths = [f for f in paths if os.path.isfile(f)]
+    # If return_paths is False, return only files names
+    if not return_paths:
+        filepaths = [os.path.basename(f) for f in filepaths]
     return filepaths
 
 
-def list_directories(dir_path, glob_pattern, recursive=False):
+def list_directories(dir_path, glob_pattern="*", recursive=False, skip_hidden=True, return_paths=True):
     """Return a list of directory paths (exclude file paths)."""
-    paths = list_paths(dir_path, glob_pattern, recursive=recursive)
+    paths = list_paths(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden)
     dir_paths = [f for f in paths if os.path.isdir(f)]
+    # If return_paths is False, return only directory names
+    if not return_paths:
+        dir_paths = [os.path.basename(f) for f in dir_paths]
     return dir_paths
 
 
-def count_files(dir_path, glob_pattern, recursive=False):
+def count_files(dir_path, glob_pattern="*", recursive=False, skip_hidden=True):
     """Return the number of files (exclude directories)."""
-    return len(list_files(dir_path, glob_pattern, recursive=recursive))
+    return len(list_files(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden))
 
 
-def count_directories(dir_path, glob_pattern, recursive=False):
+def count_directories(dir_path, glob_pattern="*", recursive=False, skip_hidden=True):
     """Return the number of files (exclude directories)."""
-    return len(list_directories(dir_path, glob_pattern, recursive=recursive))
+    return len(list_directories(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden))
 
 
 def check_directory_exists(dir_path):
@@ -177,7 +194,7 @@ def create_required_directory(dir_path, dir_name, exist_ok=True):
     create_directory(path=new_dir_path, exist_ok=exist_ok)
 
 
-def is_empty_directory(path):
+def is_empty_directory(path, skip_hidden=True):
     """Check if a directory path is empty.
 
     Return ``False`` if path is a file or non-empty directory.
@@ -187,8 +204,11 @@ def is_empty_directory(path):
         raise OSError(f"{path} does not exist.")
     if not os.path.isdir(path):
         return False
-
     paths = os.listdir(path)
+
+    # If skip_hidden is True, filter out hidden files/directories
+    if skip_hidden:
+        paths = [f for f in paths if not f.startswith(".")]
     return len(paths) == 0
 
 
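
Usage sketch of the reworked listing helpers (the directory path is a placeholder):

from disdrodb.utils.directories import count_files, list_files

# glob_pattern now defaults to "*", and hidden entries (any path
# component starting with ".") are skipped unless requested otherwise
filenames = list_files("/data/DISDRODB", recursive=True, return_paths=False)

# Hidden files can still be included explicitly
n_all = count_files("/data/DISDRODB", recursive=True, skip_hidden=False)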