disdrodb 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- disdrodb/__init__.py +64 -34
- disdrodb/_config.py +5 -4
- disdrodb/_version.py +16 -3
- disdrodb/accessor/__init__.py +20 -0
- disdrodb/accessor/methods.py +125 -0
- disdrodb/api/checks.py +139 -9
- disdrodb/api/configs.py +4 -2
- disdrodb/api/info.py +10 -10
- disdrodb/api/io.py +237 -18
- disdrodb/api/path.py +81 -75
- disdrodb/api/search.py +6 -6
- disdrodb/cli/disdrodb_create_summary_station.py +91 -0
- disdrodb/cli/disdrodb_run_l0.py +1 -1
- disdrodb/cli/disdrodb_run_l0_station.py +1 -1
- disdrodb/cli/disdrodb_run_l0b.py +1 -1
- disdrodb/cli/disdrodb_run_l0b_station.py +1 -1
- disdrodb/cli/disdrodb_run_l0c.py +1 -1
- disdrodb/cli/disdrodb_run_l0c_station.py +1 -1
- disdrodb/cli/disdrodb_run_l2e_station.py +1 -1
- disdrodb/configs.py +149 -4
- disdrodb/constants.py +61 -0
- disdrodb/data_transfer/download_data.py +5 -5
- disdrodb/etc/configs/attributes.yaml +339 -0
- disdrodb/etc/configs/encodings.yaml +473 -0
- disdrodb/etc/products/L1/global.yaml +13 -0
- disdrodb/etc/products/L2E/10MIN.yaml +12 -0
- disdrodb/etc/products/L2E/1MIN.yaml +1 -0
- disdrodb/etc/products/L2E/global.yaml +22 -0
- disdrodb/etc/products/L2M/10MIN.yaml +12 -0
- disdrodb/etc/products/L2M/GAMMA_ML.yaml +8 -0
- disdrodb/etc/products/L2M/NGAMMA_GS_LOG_ND_MAE.yaml +6 -0
- disdrodb/etc/products/L2M/NGAMMA_GS_ND_MAE.yaml +6 -0
- disdrodb/etc/products/L2M/NGAMMA_GS_Z_MAE.yaml +6 -0
- disdrodb/etc/products/L2M/global.yaml +26 -0
- disdrodb/l0/__init__.py +13 -0
- disdrodb/l0/configs/LPM/l0b_cf_attrs.yml +4 -4
- disdrodb/l0/configs/PARSIVEL/l0b_cf_attrs.yml +1 -1
- disdrodb/l0/configs/PARSIVEL/l0b_encodings.yml +3 -3
- disdrodb/l0/configs/PARSIVEL/raw_data_format.yml +1 -1
- disdrodb/l0/configs/PARSIVEL2/l0b_cf_attrs.yml +5 -5
- disdrodb/l0/configs/PARSIVEL2/l0b_encodings.yml +3 -3
- disdrodb/l0/configs/PARSIVEL2/raw_data_format.yml +1 -1
- disdrodb/l0/configs/PWS100/l0b_cf_attrs.yml +4 -4
- disdrodb/l0/configs/PWS100/raw_data_format.yml +1 -1
- disdrodb/l0/l0a_processing.py +30 -30
- disdrodb/l0/l0b_nc_processing.py +108 -2
- disdrodb/l0/l0b_processing.py +4 -4
- disdrodb/l0/l0c_processing.py +5 -13
- disdrodb/l0/readers/LPM/NETHERLANDS/DELFT_LPM_NC.py +66 -0
- disdrodb/l0/readers/LPM/SLOVENIA/{CRNI_VRH.py → UL.py} +3 -0
- disdrodb/l0/readers/LPM/SWITZERLAND/INNERERIZ_LPM.py +195 -0
- disdrodb/l0/readers/PARSIVEL/GPM/PIERS.py +0 -2
- disdrodb/l0/readers/PARSIVEL/JAPAN/JMA.py +4 -1
- disdrodb/l0/readers/PARSIVEL/NCAR/PECAN_MOBILE.py +1 -1
- disdrodb/l0/readers/PARSIVEL/NCAR/VORTEX2_2009.py +1 -1
- disdrodb/l0/readers/PARSIVEL2/BELGIUM/ILVO.py +168 -0
- disdrodb/l0/readers/PARSIVEL2/DENMARK/DTU.py +165 -0
- disdrodb/l0/readers/PARSIVEL2/FINLAND/FMI_PARSIVEL2.py +69 -0
- disdrodb/l0/readers/PARSIVEL2/FRANCE/ENPC_PARSIVEL2.py +255 -134
- disdrodb/l0/readers/PARSIVEL2/FRANCE/OSUG.py +525 -0
- disdrodb/l0/readers/PARSIVEL2/FRANCE/SIRTA_PARSIVEL2.py +1 -1
- disdrodb/l0/readers/PARSIVEL2/GPM/GCPEX.py +9 -7
- disdrodb/l0/readers/PARSIVEL2/KIT/BURKINA_FASO.py +1 -1
- disdrodb/l0/readers/PARSIVEL2/KIT/TEAMX.py +123 -0
- disdrodb/l0/readers/PARSIVEL2/NASA/APU.py +120 -0
- disdrodb/l0/readers/PARSIVEL2/NCAR/FARM_PARSIVEL2.py +1 -0
- disdrodb/l0/readers/PARSIVEL2/NCAR/PECAN_FP3.py +1 -1
- disdrodb/l0/readers/PARSIVEL2/NCAR/PERILS_MIPS.py +126 -0
- disdrodb/l0/readers/PARSIVEL2/NCAR/PERILS_PIPS.py +165 -0
- disdrodb/l0/readers/PARSIVEL2/NCAR/VORTEX_SE_2016_P2.py +1 -1
- disdrodb/l0/readers/PARSIVEL2/NCAR/VORTEX_SE_2016_PIPS.py +20 -12
- disdrodb/l0/readers/PARSIVEL2/NETHERLANDS/DELFT_NC.py +2 -0
- disdrodb/l0/readers/PARSIVEL2/SPAIN/CENER.py +144 -0
- disdrodb/l0/readers/PARSIVEL2/SPAIN/CR1000DL.py +201 -0
- disdrodb/l0/readers/PARSIVEL2/SPAIN/LIAISE.py +137 -0
- disdrodb/l0/readers/PARSIVEL2/{NETHERLANDS/DELFT.py → USA/C3WE.py} +65 -85
- disdrodb/l0/readers/PWS100/FRANCE/ENPC_PWS100.py +105 -99
- disdrodb/l0/readers/PWS100/FRANCE/ENPC_PWS100_SIRTA.py +151 -0
- disdrodb/l0/routines.py +105 -14
- disdrodb/l1/__init__.py +5 -0
- disdrodb/l1/filters.py +34 -20
- disdrodb/l1/processing.py +45 -44
- disdrodb/l1/resampling.py +77 -66
- disdrodb/l1/routines.py +35 -43
- disdrodb/l1_env/routines.py +18 -3
- disdrodb/l2/__init__.py +7 -0
- disdrodb/l2/empirical_dsd.py +58 -10
- disdrodb/l2/event.py +27 -120
- disdrodb/l2/processing.py +267 -116
- disdrodb/l2/routines.py +618 -254
- disdrodb/metadata/standards.py +3 -1
- disdrodb/psd/fitting.py +463 -144
- disdrodb/psd/models.py +8 -5
- disdrodb/routines.py +3 -3
- disdrodb/scattering/__init__.py +16 -4
- disdrodb/scattering/axis_ratio.py +56 -36
- disdrodb/scattering/permittivity.py +486 -0
- disdrodb/scattering/routines.py +701 -159
- disdrodb/summary/__init__.py +17 -0
- disdrodb/summary/routines.py +4120 -0
- disdrodb/utils/attrs.py +68 -125
- disdrodb/utils/compression.py +30 -1
- disdrodb/utils/dask.py +59 -8
- disdrodb/utils/dataframe.py +61 -7
- disdrodb/utils/directories.py +35 -15
- disdrodb/utils/encoding.py +33 -19
- disdrodb/utils/logger.py +13 -6
- disdrodb/utils/manipulations.py +71 -0
- disdrodb/utils/subsetting.py +214 -0
- disdrodb/utils/time.py +165 -19
- disdrodb/utils/writer.py +20 -7
- disdrodb/utils/xarray.py +2 -4
- disdrodb/viz/__init__.py +13 -0
- disdrodb/viz/plots.py +327 -0
- {disdrodb-0.1.2.dist-info → disdrodb-0.1.3.dist-info}/METADATA +3 -2
- {disdrodb-0.1.2.dist-info → disdrodb-0.1.3.dist-info}/RECORD +121 -88
- {disdrodb-0.1.2.dist-info → disdrodb-0.1.3.dist-info}/entry_points.txt +1 -0
- disdrodb/l1/encoding_attrs.py +0 -642
- disdrodb/l2/processing_options.py +0 -213
- /disdrodb/l0/readers/PARSIVEL/SLOVENIA/{UL_FGG.py → UL.py} +0 -0
- {disdrodb-0.1.2.dist-info → disdrodb-0.1.3.dist-info}/WHEEL +0 -0
- {disdrodb-0.1.2.dist-info → disdrodb-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {disdrodb-0.1.2.dist-info → disdrodb-0.1.3.dist-info}/top_level.txt +0 -0
disdrodb/utils/attrs.py
CHANGED
```diff
@@ -18,15 +18,26 @@
 # -----------------------------------------------------------------------------.
 """DISDRODB netCDF4 attributes utilities."""
 import datetime
+import os
 
-from disdrodb import ARCHIVE_VERSION, CONVENTIONS, SOFTWARE_VERSION
+from disdrodb.constants import ARCHIVE_VERSION, CONVENTIONS, COORDINATES, SOFTWARE_VERSION
+from disdrodb.utils.yaml import read_yaml
 
 ####---------------------------------------------------------------------.
-#### Variable attributes
+#### Variable and coordinates attributes
+
+
+def get_attrs_dict():
+    """Get attributes dictionary for DISDRODB product variables and coordinates."""
+    import disdrodb
+
+    configs_path = os.path.join(disdrodb.__root_path__, "disdrodb", "etc", "configs")
+    attrs_dict = read_yaml(os.path.join(configs_path, "attributes.yaml"))
+    return attrs_dict
 
 
 def set_attrs(ds, attrs_dict):
-    """Set attributes to the variables of the xr.Dataset."""
+    """Set attributes to the variables and coordinates of the xr.Dataset."""
     for var in attrs_dict:
         if var in ds:
             ds[var].attrs.update(attrs_dict[var])
@@ -37,104 +48,13 @@ def set_attrs(ds, attrs_dict):
 #### Coordinates attributes
 
 
-def get_coords_attrs_dict():
-    """Return dictionary with DISDRODB coordinates attributes."""
-    attrs_dict = {}
-    # Define diameter attributes
-    attrs_dict["diameter_bin_center"] = {
-        "name": "diameter_bin_center",
-        "standard_name": "diameter_bin_center",
-        "long_name": "diameter_bin_center",
-        "units": "mm",
-        "description": "Bin center drop diameter value",
-    }
-    attrs_dict["diameter_bin_width"] = {
-        "name": "diameter_bin_width",
-        "standard_name": "diameter_bin_width",
-        "long_name": "diameter_bin_width",
-        "units": "mm",
-        "description": "Drop diameter bin width",
-    }
-    attrs_dict["diameter_bin_upper"] = {
-        "name": "diameter_bin_upper",
-        "standard_name": "diameter_bin_upper",
-        "long_name": "diameter_bin_upper",
-        "units": "mm",
-        "description": "Bin upper bound drop diameter value",
-    }
-    attrs_dict["velocity_bin_lower"] = {
-        "name": "velocity_bin_lower",
-        "standard_name": "velocity_bin_lower",
-        "long_name": "velocity_bin_lower",
-        "units": "mm",
-        "description": "Bin lower bound drop diameter value",
-    }
-    # Define velocity attributes
-    attrs_dict["velocity_bin_center"] = {
-        "name": "velocity_bin_center",
-        "standard_name": "velocity_bin_center",
-        "long_name": "velocity_bin_center",
-        "units": "m/s",
-        "description": "Bin center drop fall velocity value",
-    }
-    attrs_dict["velocity_bin_width"] = {
-        "name": "velocity_bin_width",
-        "standard_name": "velocity_bin_width",
-        "long_name": "velocity_bin_width",
-        "units": "m/s",
-        "description": "Drop fall velocity bin width",
-    }
-    attrs_dict["velocity_bin_upper"] = {
-        "name": "velocity_bin_upper",
-        "standard_name": "velocity_bin_upper",
-        "long_name": "velocity_bin_upper",
-        "units": "m/s",
-        "description": "Bin upper bound drop fall velocity value",
-    }
-    attrs_dict["velocity_bin_lower"] = {
-        "name": "velocity_bin_lower",
-        "standard_name": "velocity_bin_lower",
-        "long_name": "velocity_bin_lower",
-        "units": "m/s",
-        "description": "Bin lower bound drop fall velocity value",
-    }
-    # Define geolocation attributes
-    attrs_dict["latitude"] = {
-        "name": "latitude",
-        "standard_name": "latitude",
-        "long_name": "Latitude",
-        "units": "degrees_north",
-    }
-    attrs_dict["longitude"] = {
-        "name": "longitude",
-        "standard_name": "longitude",
-        "long_name": "Longitude",
-        "units": "degrees_east",
-    }
-    attrs_dict["altitude"] = {
-        "name": "altitude",
-        "standard_name": "altitude",
-        "long_name": "Altitude",
-        "units": "m",
-        "description": "Elevation above sea level",
-    }
-    # Define time attributes
-    attrs_dict["time"] = {
-        "name": "time",
-        "standard_name": "time",
-        "long_name": "time",
-        "description": "UTC Time",
-    }
-
-    return attrs_dict
-
-
 def set_coordinate_attributes(ds):
     """Set coordinates attributes."""
     # Get attributes dictionary
-    attrs_dict = get_coords_attrs_dict()
+    attrs_dict = get_attrs_dict()
+    coords_dict = {coord: attrs_dict[coord] for coord in COORDINATES if coord in attrs_dict}
     # Set attributes
-    ds = set_attrs(ds, attrs_dict)
+    ds = set_attrs(ds, coords_dict)
     return ds
 
 
@@ -142,14 +62,14 @@ def set_coordinate_attributes(ds):
 #### DISDRODB Global Attributes
 
 
-def set_disdrodb_attrs(ds, product: str):
+def update_disdrodb_attrs(ds, product: str):
     """Add DISDRODB processing information to the netCDF global attributes.
 
     It assumes stations metadata are already added the dataset.
 
     Parameters
     ----------
-    ds : xarray.Dataset
+    ds : xarray dataset.
         Dataset
     product: str
         DISDRODB product.
@@ -159,30 +79,53 @@ def set_disdrodb_attrs(ds, product: str):
     xarray dataset
         Dataset.
     """
-
-
-
-    #
-
-
-
-
-
-
+    attrs = ds.attrs.copy()
+
+    # ----------------------------------------------
+    # Drop metadata not relevant for DISDRODB products
+    keys_to_drop = [
+        "disdrodb_reader",
+        "disdrodb_data_url",
+        "raw_data_glob_pattern",
+        "raw_data_format",
+    ]
+    for key in keys_to_drop:
+        _ = attrs.pop(key, None)
+
+    # ----------------------------------------------
+    # Add time_coverage_start and time_coverage_end
+    if "time" in ds.dims:
+        attrs["time_coverage_start"] = str(ds["time"].data[0])
+        attrs["time_coverage_end"] = str(ds["time"].data[-1])
 
-    #
-
+    # ----------------------------------------------
+    # Set DISDRODDB attributes
+    # - Add DISDRODB processing info
+    now = datetime.datetime.utcnow()
+    current_time = now.strftime("%Y-%m-%d %H:%M:%S")
+    attrs["disdrodb_processing_date"] = current_time
+    # - Add DISDRODB product and version
+    attrs["disdrodb_product_version"] = ARCHIVE_VERSION
+    attrs["disdrodb_software_version"] = SOFTWARE_VERSION
+    attrs["disdrodb_product"] = product
+
+    # ----------------------------------------------
+    # Finalize attributes dictionary
+    # - Sort attributes alphabetically
+    attrs = dict(sorted(attrs.items()))
+    # - Set attributes
+    ds.attrs = attrs
     return ds
 
 
-def update_disdrodb_attrs(ds, product: str):
+def set_disdrodb_attrs(ds, product: str):
    """Add DISDRODB processing information to the netCDF global attributes.
 
     It assumes stations metadata are already added the dataset.
 
     Parameters
     ----------
-    ds : xarray
+    ds : xarray.Dataset
         Dataset
     product: str
         DISDRODB product.
@@ -192,17 +135,17 @@ def update_disdrodb_attrs(ds, product: str):
     xarray dataset
         Dataset.
     """
-    # Add time_coverage_start and time_coverage_end
-    ds.attrs["time_coverage_start"] = str(ds["time"].data[0])
-    ds.attrs["time_coverage_end"] = str(ds["time"].data[-1])
+    # Add dataset conventions
+    ds.attrs["Conventions"] = CONVENTIONS
 
-    #
-
-
-
-
-
-
-
-
+    # Add featureType
+    if "platform_type" in ds.attrs:
+        platform_type = ds.attrs["platform_type"]
+        if platform_type == "fixed":
+            ds.attrs["featureType"] = "timeSeries"
+        else:
+            ds.attrs["featureType"] = "trajectory"
+
+    # Update DISDRODDB attributes
+    ds = update_disdrodb_attrs(ds=ds, product=product)
     return ds
```
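In practice, variable and coordinate attributes now live in `disdrodb/etc/configs/attributes.yaml` instead of a hard-coded dictionary, and `set_disdrodb_attrs` delegates the global-attribute bookkeeping to the new `update_disdrodb_attrs`. A minimal sketch of how the new helpers compose; the dataset is illustrative, and the expected `"mm"` units assume `attributes.yaml` carries over the old hard-coded defaults:

```python
import numpy as np
import xarray as xr

from disdrodb.utils.attrs import get_attrs_dict, set_attrs

# Illustrative dataset with a single DISDRODB coordinate
ds = xr.Dataset(coords={"diameter_bin_center": np.array([0.31, 0.44, 0.56])})

# Attributes for all product variables and coordinates are read from attributes.yaml
attrs_dict = get_attrs_dict()

# set_attrs only touches keys that are actually present in the dataset
ds = set_attrs(ds, attrs_dict)
print(ds["diameter_bin_center"].attrs)  # expected to include {"units": "mm", ...}
```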
disdrodb/utils/compression.py
CHANGED
```diff
@@ -22,6 +22,7 @@ import bz2
 import gzip
 import os
 import shutil
+import subprocess
 import tempfile
 import zipfile
 from typing import Optional
@@ -53,6 +54,34 @@ def unzip_file(filepath: str, dest_path: str) -> None:
         zip_ref.extractall(dest_path)
 
 
+def unzip_file_on_terminal(filepath: str, dest_path: str) -> str:
+    """Unzip a file into a directory using the terminal command.
+
+    Parameters
+    ----------
+    filepath : str
+        Path of the file to unzip.
+    dest_path : str
+        Path of the destination directory.
+    """
+    os.makedirs(dest_path, exist_ok=True)
+
+    if os.name == "nt":
+        # Windows: use PowerShell Expand-Archive
+        cmd = [
+            "powershell.exe",
+            "-NoProfile",
+            "-NonInteractive",
+            "-Command",
+            f"Expand-Archive -LiteralPath '{filepath}' -DestinationPath '{dest_path}' -Force",
+        ]
+    else:
+        # macOS/Linux: use unzip
+        cmd = ["unzip", "-q", filepath, "-d", dest_path]
+
+    subprocess.run(cmd, check=True)
+
+
 def _zip_dir(dir_path: str) -> str:
     """Zip a directory into a file located in the same directory.
 
@@ -157,7 +186,7 @@ def compress_station_files(
         raise ValueError(f"Station data directory {station_dir} does not exist.")
 
     # Get list of files inside the station directory (in all nested directories)
-    filepaths = list_files(station_dir, glob_pattern="*", recursive=True)
+    filepaths = list_files(station_dir, recursive=True)
     for filepath in filepaths:
         _ = _compress_file(filepath, method, skip=skip)
 
```
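The new `unzip_file_on_terminal` shells out to PowerShell's `Expand-Archive` on Windows and to `unzip` elsewhere, which is typically faster than Python's `zipfile` for large archives but requires the external tool to be on the PATH. A usage sketch with illustrative paths:

```python
from subprocess import CalledProcessError

from disdrodb.utils.compression import unzip_file_on_terminal

try:
    # Extract /tmp/station.zip into /tmp/station (created if missing)
    unzip_file_on_terminal("/tmp/station.zip", "/tmp/station")
except FileNotFoundError:
    # The unzip/PowerShell executable itself is not installed
    print("No suitable extraction tool available")
except CalledProcessError as err:
    # check=True raises when the external command exits non-zero
    print(f"Extraction failed with exit code {err.returncode}")
```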
disdrodb/utils/dask.py
CHANGED
```diff
@@ -16,31 +16,82 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 # -----------------------------------------------------------------------------.
-"""Utilities for Dask Distributed
+"""Utilities for Dask Distributed Computations."""
 import logging
 import os
 
+import numpy as np
 
-def initialize_dask_cluster():
+
+def check_parallel_validity(parallel):
+    """Check validity of parallel option given Dask settings."""
+    import dask
+
+    scheduler = dask.config.get("scheduler", None)
+    if scheduler is None:
+        return parallel
+    if scheduler in ["synchronous", "threads"]:
+        return False
+    if scheduler == "distributed":
+        from dask.distributed import default_client
+
+        client = default_client()
+        info = client.scheduler_info()
+
+        # If ThreadWorker, only 1 pid
+        pids = list(client.run(os.getpid).values())
+        if len(np.unique(pids)) == 1:
+            return False
+
+        # If ProcessWorker
+        # - Check single thread per worker to avoid locks
+        nthreads_per_process = np.array([v["nthreads"] for v in info["workers"].values()])
+        if not np.all(nthreads_per_process == 1):
+            print(
+                "To open netCDFs in parallel with dask distributed (processes=True), please set threads_per_worker=1 !",
+            )
+            return False
+
+    # Otherwise let the user choose
+    return parallel
+
+
+def initialize_dask_cluster(minimum_memory=None):
     """Initialize Dask Cluster."""
     import dask
+    import psutil
+
+    # Silence dask warnings
+    # dask.config.set({"logging.distributed": "error"})
+    # Import dask.distributed after setting the config
     from dask.distributed import Client, LocalCluster
+    from dask.utils import parse_bytes
 
     # Set HDF5_USE_FILE_LOCKING to avoid going stuck with HDF
     os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
-
-    available_workers = os.cpu_count() - 2
+
+    # Retrieve the number of processes to run
+    available_workers = os.cpu_count() - 2  # if not set, all CPUs minus 2
     num_workers = dask.config.get("num_workers", available_workers)
-
-
-
+
+    # If memory limit specified, ensure correct amount of workers
+    if minimum_memory is not None:
+        # Compute available memory (in bytes)
+        total_memory = psutil.virtual_memory().total
+        # Get minimum memory per worker (in bytes)
+        minimum_memory = parse_bytes(minimum_memory)
+        # Determine number of workers constrained by memory
+        maximum_workers_allowed = max(1, total_memory // minimum_memory)
+        # Respect both CPU and memory requirements
+        num_workers = min(maximum_workers_allowed, num_workers)
+
     # Create dask.distributed local cluster
     cluster = LocalCluster(
         n_workers=num_workers,
         threads_per_worker=1,
         processes=True,
         # memory_limit='8GB',
-
+        silence_logs=logging.ERROR,
     )
     client = Client(cluster)
     return cluster, client
```
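`initialize_dask_cluster` gains a `minimum_memory` argument that caps the number of process workers by available RAM, and the new `check_parallel_validity` is meant to downgrade `parallel=True` whenever the active Dask setup (synchronous or threaded schedulers, or multi-threaded distributed workers) could deadlock on parallel netCDF reads. A sketch, assuming `"4GB"` per worker suits the machine:

```python
from disdrodb.utils.dask import check_parallel_validity, initialize_dask_cluster

# Single-threaded process workers; the worker count is limited so that
# each worker gets at least ~4 GB of RAM
cluster, client = initialize_dask_cluster(minimum_memory="4GB")

# Keep parallel=True only if the current scheduler configuration allows it
parallel = check_parallel_validity(parallel=True)

client.close()
cluster.close()
```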
disdrodb/utils/dataframe.py
CHANGED
```diff
@@ -20,6 +20,8 @@
 import numpy as np
 import pandas as pd
 
+from disdrodb.utils.warnings import suppress_warnings
+
 
 def log_arange(start, stop, log_step=0.1, base=10):
     """
@@ -47,7 +49,39 @@ def log_arange(start, stop, log_step=0.1, base=10):
     log_start = np.log(start) / np.log(base)
     log_stop = np.log(stop) / np.log(base)
 
-    log_values = np.arange(log_start, log_stop, log_step)
+    log_values = np.arange(log_start, log_stop + log_step / 2, log_step)
+    return base**log_values
+
+
+def log_linspace(start, stop, n_bins, base=10):
+    """
+    Return numbers spaced evenly on a log scale between start and stop.
+
+    Parameters
+    ----------
+    start : float
+        The starting value of the sequence (must be > 0).
+    stop : float
+        The end value of the sequence (must be > 0).
+    n_bins : int
+        The number of points to generate (including start and stop).
+    base : float
+        The logarithmic base (default is 10).
+
+    Returns
+    -------
+    np.ndarray
+        Array of values spaced evenly in log space.
+    """
+    if start <= 0 or stop <= 0:
+        raise ValueError("Both start and stop must be > 0 for log spacing.")
+    if n_bins < 2:
+        raise ValueError("n_bins must be >= 2 to include start and stop values.")
+
+    log_start = np.log(start) / np.log(base)
+    log_stop = np.log(stop) / np.log(base)
+
+    log_values = np.linspace(log_start, log_stop, n_bins)
     return base**log_values
 
 
@@ -100,6 +134,9 @@ def compute_1d_histogram(df, column, variables=None, bins=10, labels=None, prefi
     if len(df) == 0:
         raise ValueError("No valid data points after removing NaN values")
 
+    # Keep only data within bin range
+    df = df[(df[column] >= bins[0]) & (df[column] < bins[-1])]
+
     # Create binned columns with explicit handling of out-of-bounds values
     df[f"{column}_binned"] = pd.cut(df[column], bins=bins, include_lowest=True)
 
@@ -134,7 +171,7 @@ def compute_1d_histogram(df, column, variables=None, bins=10, labels=None, prefi
             (f"{prefix}std", "std"),
             (f"{prefix}min", "min"),
             (f"{prefix}max", "max"),
-            (f"{prefix}mad", lambda s:
+            (f"{prefix}mad", lambda s: (s - s.median()).abs().median()),
         ]
         if i == 0:
             list_stats.append(("count", "count"))
@@ -142,7 +179,8 @@ def compute_1d_histogram(df, column, variables=None, bins=10, labels=None, prefi
         list_stats = [("count", "count")]
 
     # Compute statistics
-    df_stats = df_grouped[var].agg(list_stats)
+    with suppress_warnings():
+        df_stats = df_grouped[var].agg(list_stats)
 
     # Compute other variable statistics
     if variables_specified:
@@ -253,8 +291,18 @@ def compute_2d_histogram(
         raise ValueError("No valid data points after removing NaN values")
 
     # Create binned columns with explicit handling of out-of-bounds values
-    df[f"{x}_binned"] = pd.cut(
-
+    df[f"{x}_binned"] = pd.cut(
+        df[x],
+        bins=pd.IntervalIndex.from_breaks(x_bins, closed="right"),
+        include_lowest=True,
+        ordered=True,
+    )
+    df[f"{y}_binned"] = pd.cut(
+        df[y],
+        bins=pd.IntervalIndex.from_breaks(y_bins, closed="right"),
+        include_lowest=True,
+        ordered=True,
+    )
 
     # Create complete IntervalIndex for both dimensions
     x_intervals = df[f"{x}_binned"].cat.categories
@@ -318,8 +366,8 @@ def compute_2d_histogram(
     df_stats = df_stats.reindex(full_index)
 
     # Determine coordinates
-    x_centers = x_intervals.mid
-    y_centers = y_intervals.mid
+    x_centers = np.array(x_intervals.mid)
+    y_centers = np.array(y_intervals.mid)
 
     # Use provided labels if available
     x_coords = x_labels if x_labels is not None else x_centers
@@ -337,6 +385,12 @@ def compute_2d_histogram(
     # Convert to dataset
     ds = df_stats.to_xarray()
 
+    # Convert Categorical coordinates to float if possible
+    if np.issubdtype(x_coords.dtype, np.number):
+        ds[f"{x}"] = ds[f"{x}"].astype(float)
+    if np.issubdtype(y_coords.dtype, np.number):
+        ds[f"{y}"] = ds[f"{y}"].astype(float)
+
     # Transpose arrays
     ds = ds.transpose(y, x)
     return ds
```
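The `+ log_step / 2` nudge makes `log_arange` include `stop` when it falls exactly on the logarithmic grid (previously `np.arange`'s half-open interval dropped it), while the new `log_linspace` fixes the number of points rather than the step. A quick numeric check:

```python
import numpy as np

from disdrodb.utils.dataframe import log_arange, log_linspace

# stop=10 now lands inside the arange interval: array([ 0.1,  1. , 10. ])
print(log_arange(0.1, 10, log_step=1))

# Five points evenly spaced in log10 space, endpoints included:
# [ 0.1, 0.316, 1., 3.162, 10. ]
print(np.round(log_linspace(0.1, 10, n_bins=5), 3))
```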
disdrodb/utils/directories.py
CHANGED
```diff
@@ -98,18 +98,29 @@ def _recursive_glob(dir_path, glob_pattern):
     return [str(path) for path in dir_path.rglob(glob_pattern)]
 
 
-def _list_paths(dir_path, glob_pattern, recursive=False):
+def _is_hidden(path):
+    """Return True if any component of path is hidden."""
+    return any(part.startswith(".") for part in path.split(os.sep))
+
+
+def _list_paths(dir_path, glob_pattern, recursive=False, skip_hidden=True):
     """Return a list of filepaths and directory paths based on a single glob pattern."""
     # If glob pattern has separators, disable recursive option
     if "/" in glob_pattern and "**" not in glob_pattern:
         recursive = False
     # Search paths
     if not recursive:
-        return glob.glob(os.path.join(dir_path, glob_pattern))
-    return _recursive_glob(dir_path, glob_pattern)
+        matches = glob.glob(os.path.join(dir_path, glob_pattern))
+    else:
+        matches = _recursive_glob(dir_path, glob_pattern)
 
+    # Filter out anything with a hidden component
+    if skip_hidden:
+        matches = [p for p in matches if not _is_hidden(os.path.relpath(p, dir_path))]
+    return matches
 
-def list_paths(dir_path, glob_pattern, recursive=False):
+
+def list_paths(dir_path, glob_pattern, recursive=False, skip_hidden=True):
     """Return a list of filepaths and directory paths.
 
     This function accept also a list of glob patterns !
@@ -119,35 +130,41 @@ def list_paths(dir_path, glob_pattern, recursive=False):
     # Search path for specified glob patterns
     paths = flatten_list(
         [
-            _list_paths(dir_path=dir_path, glob_pattern=glob_pattern, recursive=recursive)
+            _list_paths(dir_path=dir_path, glob_pattern=glob_pattern, recursive=recursive, skip_hidden=skip_hidden)
             for glob_pattern in glob_patterns
         ],
     )
     return paths
 
 
-def list_files(dir_path, glob_pattern, recursive=False):
+def list_files(dir_path, glob_pattern="*", recursive=False, skip_hidden=True, return_paths=True):
     """Return a list of filepaths (exclude directory paths)."""
-    paths = list_paths(dir_path, glob_pattern, recursive=recursive)
+    paths = list_paths(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden)
     filepaths = [f for f in paths if os.path.isfile(f)]
+    # If return_paths is False, return only files names
+    if not return_paths:
+        filepaths = [os.path.basename(f) for f in filepaths]
     return filepaths
 
 
-def list_directories(dir_path, glob_pattern, recursive=False):
+def list_directories(dir_path, glob_pattern="*", recursive=False, skip_hidden=True, return_paths=True):
     """Return a list of directory paths (exclude file paths)."""
-    paths = list_paths(dir_path, glob_pattern, recursive=recursive)
+    paths = list_paths(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden)
     dir_paths = [f for f in paths if os.path.isdir(f)]
+    # If return_paths is False, return only directory names
+    if not return_paths:
+        dir_paths = [os.path.basename(f) for f in dir_paths]
     return dir_paths
 
 
-def count_files(dir_path, glob_pattern, recursive=False):
+def count_files(dir_path, glob_pattern="*", recursive=False, skip_hidden=True):
     """Return the number of files (exclude directories)."""
-    return len(list_files(dir_path, glob_pattern, recursive=recursive))
+    return len(list_files(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden))
 
 
-def count_directories(dir_path, glob_pattern, recursive=False):
+def count_directories(dir_path, glob_pattern="*", recursive=False, skip_hidden=True):
     """Return the number of files (exclude directories)."""
-    return len(list_directories(dir_path, glob_pattern, recursive=recursive))
+    return len(list_directories(dir_path, glob_pattern, recursive=recursive, skip_hidden=skip_hidden))
 
 
 def check_directory_exists(dir_path):
@@ -177,7 +194,7 @@ def create_required_directory(dir_path, dir_name, exist_ok=True):
     create_directory(path=new_dir_path, exist_ok=exist_ok)
 
 
-def is_empty_directory(path):
+def is_empty_directory(path, skip_hidden=True):
     """Check if a directory path is empty.
 
     Return ``False`` if path is a file or non-empty directory.
@@ -187,8 +204,11 @@ def is_empty_directory(path):
         raise OSError(f"{path} does not exist.")
     if not os.path.isdir(path):
         return False
-
     paths = os.listdir(path)
+
+    # If skip_hidden is True, filter out hidden files/directories
+    if skip_hidden:
+        paths = [f for f in paths if not f.startswith(".")]
     return len(paths) == 0
 
 
```
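All listing and counting helpers now default the glob pattern to `"*"`, skip hidden files and directories by default (`skip_hidden=True`), and can return bare names via `return_paths=False`. A usage sketch with an illustrative archive path:

```python
from disdrodb.utils.directories import count_files, list_files

archive_dir = "/tmp/DISDRODB"  # illustrative path

# Recursive search; entries under hidden components (e.g. ".git/") are filtered out
filepaths = list_files(archive_dir, glob_pattern="*.parquet", recursive=True)

# Bare file names instead of full paths
filenames = list_files(archive_dir, glob_pattern="*.parquet", recursive=True, return_paths=False)

# Counting helpers accept the same options; use skip_hidden=False to include hidden entries
n_files = count_files(archive_dir, recursive=True)
```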