climarraykit 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# climarraykit package initialiser: declares the distribution version and
# the public submodule list.

# Package version string (keep in sync with the wheel metadata).
__version__ = "0.2.0"

# Public submodules exposed by `from climarraykit import *`.
__all__ = [
    "conversions",
    "data_manipulation",
    "file_utils",
    "patterns",
    "xarray_obj_handler",
]
@@ -0,0 +1,208 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ #----------------#
5
+ # Import modules #
6
+ #----------------#
7
+
8
+ import xarray as xr
9
+ from pathlib import Path
10
+
11
+ #------------------------#
12
+ # Import project modules #
13
+ #------------------------#
14
+
15
+ from climarraykit.xarray_obj_handler import _save_ds_as_nc
16
+ from paramlib.global_parameters import CLIMATE_FILE_EXTENSIONS
17
+ from pygenutils.arrays_and_lists.conversions import flatten_to_string
18
+ from pygenutils.arrays_and_lists.data_manipulation import flatten_list
19
+ from pygenutils.operative_systems.os_operations import exit_info, run_system_command
20
+ from pygenutils.strings.string_handler import (
21
+ find_substring_index,
22
+ get_obj_specs,
23
+ modify_obj_specs
24
+ )
25
+
26
+ #------------------#
27
+ # Define functions #
28
+ #------------------#
29
+
30
+ # Xarray objects #
31
+ #----------------#
32
+
33
def grib2nc(
    grib_file_list: str | list[str],
    on_shell: bool = False,
    option_str: str | None = None,
    capture_output: bool = False,
    return_output_name: bool = False,
    encoding: str = "utf-8",
    shell: bool = True) -> None:
    """
    Converts a GRIB file or list of GRIB files to netCDF format. The conversion
    can be executed either via shell commands or programmatically using xarray.

    Parameters
    ----------
    grib_file_list : str | list[str]
        The file path(s) of the GRIB file(s) to be converted.
    on_shell : bool, optional
        If True, the conversion will be handled through shell commands using
        the 'grib_to_netcdf' tool. If False, the conversion will be done
        programmatically using xarray.
    option_str : str, optional
        Additional options to pass to the shell command for 'grib_to_netcdf'.
        This parameter is only used if 'on_shell' is set to True.
    capture_output : bool, optional
        Whether to capture the command output. Default is False.
    return_output_name : bool, optional
        Whether to return file descriptor names. Default is False.
    encoding : str, optional
        Encoding to use when decoding command output. Default is "utf-8".
    shell : bool, optional
        Whether to execute the command through the shell. Default is True.

    Returns
    -------
    None
        Converts the GRIB file(s) to netCDF format and saves the output
        netCDF file(s).

    Raises
    ------
    TypeError
        If grib_file_list is not str or list of str.
    ValueError
        If any GRIB file path is invalid or empty.
    FileNotFoundError
        If any GRIB file doesn't exist.
    RuntimeError
        If the shell command execution fails.

    Notes
    -----
    - When 'on_shell' is True, the function builds and runs a shell command
      that calls the 'grib_to_netcdf' tool, with optional flags.
    - When 'on_shell' is False, xarray is used to directly open the GRIB file
      and convert it to netCDF format.
    - The function will prompt for input in the case of multiple GRIB files if
      'on_shell' is True.
    """
    # Parameter validation #
    #----------------------#
    if not isinstance(grib_file_list, (str, list)):
        raise TypeError("grib_file_list must be a string or list of strings")

    if isinstance(grib_file_list, list):
        # Flatten nested lists for defensive programming
        grib_file_list = flatten_list(grib_file_list)

        # Validate all items are strings
        if not all(isinstance(item, str) for item in grib_file_list):
            raise TypeError("All items in grib_file_list must be strings")

        # Check for empty strings
        if not all(item.strip() for item in grib_file_list):
            raise ValueError("All GRIB file paths must be non-empty strings")
    else:
        # Single string validation (the isinstance is already guaranteed above)
        if not grib_file_list.strip():
            raise ValueError("GRIB file path must be a non-empty string")

    # Check file existence
    files_to_check = [grib_file_list] if isinstance(grib_file_list, str) else grib_file_list
    for grib_file in files_to_check:
        if not Path(grib_file).exists():
            raise FileNotFoundError(f"GRIB file not found: {grib_file}")

        # Warn (without failing) when the extension does not look like GRIB
        if not any(grib_file.lower().endswith(ext.lower()) for ext in ['.grib', '.grb', '.grib2', '.grb2']):
            print(f"Warning: File {grib_file} may not be a GRIB file based on extension")

    # Shell-based conversion #
    #-#-#-#-#-#-#-#-#-#-#-#-#

    if on_shell:
        # Handle single GRIB file
        if isinstance(grib_file_list, str):
            nc_file_new = modify_obj_specs(grib_file_list, "ext", EXTENSIONS[0])
            # BUGFIX: the command template below interpolates the input
            # path(s). Previously this variable was only assigned in the
            # multi-file branch, so single-file shell conversions raised a
            # NameError before the command could run.
            grib_allfile_info_str = grib_file_list

        # Handle list of GRIB files
        else:
            grib_allfile_info_str = flatten_to_string(grib_file_list)

            # Prompt user for the netCDF file name without extension
            nc_file_new_noext = input("Please introduce a name "
                                      "for the netCDF file, "
                                      "WITHOUT THE EXTENSION: ")

            # Validate the file name using RegEx
            allowed_minimum_char_idx = find_substring_index(nc_file_new_noext,
                                                            REGEX_GRIB2NC,
                                                            advanced_search=True)

            while allowed_minimum_char_idx == -1:
                print("Invalid file name.\nIt can contain alphanumeric characters, "
                      "as well as the following non-word characters: {. _ -}")
                nc_file_new_noext = input("Please introduce a valid name: ")
                allowed_minimum_char_idx = find_substring_index(nc_file_new_noext,
                                                                REGEX_GRIB2NC,
                                                                advanced_search=True)

            # Modify the file name to have the .nc extension
            nc_file_new = modify_obj_specs(nc_file_new_noext,
                                           obj2modify="ext",
                                           new_obj=EXTENSIONS[0])

        # Construct the shell command for conversion
        grib2nc_template = "grib_to_netcdf "
        if option_str:
            grib2nc_template += f"{option_str} "
        grib2nc_template += f"-o {nc_file_new} {grib_allfile_info_str}"

        # Execute the shell command
        try:
            process_exit_info = run_system_command(
                grib2nc_template,
                capture_output=capture_output,
                return_output_name=return_output_name,
                encoding=encoding,
                shell=shell
            )
            # Call exit_info with parameters based on capture_output
            exit_info(
                process_exit_info,
                check_stdout=True,
                check_stderr=True,
                check_return_code=True
            )
        except Exception as e:
            # Chain the cause so the underlying command failure is visible
            raise RuntimeError(f"Shell command execution failed: {e}") from e

    # Programmatic conversion #
    #-#-#-#-#-#-#-#-#-#-#-#-#-#

    else:
        # Ensure grib_file_list is a list
        if isinstance(grib_file_list, str):
            grib_file_list = [grib_file_list]

        # Convert each GRIB file in the list to netCDF
        for grib_file in grib_file_list:
            try:
                grib_file_noext = get_obj_specs(grib_file, "name_noext", EXTENSIONS[0])
                ds = xr.open_dataset(grib_file, engine="cfgrib")
                _save_ds_as_nc(ds, grib_file_noext)
                print(f"Successfully converted {grib_file} to netCDF format")
            except Exception as e:
                print(f"Error converting {grib_file}: {e}")
                raise
200
#--------------------------#
# Parameters and constants #
#--------------------------#

# Valid file extensions #
EXTENSIONS = CLIMATE_FILE_EXTENSIONS[::3]

# RegEx control for GRIB-to-netCDF single file name #
# BUGFIX: the character class needs the '+' quantifier. The original
# pattern r"^[a-zA-Z\d\._-]$" anchored a SINGLE character between ^ and $,
# so every file name longer than one character was rejected as invalid.
REGEX_GRIB2NC = r"^[a-zA-Z\d\._-]+$"
@@ -0,0 +1,386 @@
1
+ #! /usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ #----------------#
5
+ # Import modules #
6
+ #----------------#
7
+
8
+ import os
9
+ from pathlib import Path
10
+
11
+ #------------------------#
12
+ # Import project modules #
13
+ #------------------------#
14
+
15
+ from filewise.file_operations.ops_handler import move_files
16
+ from filewise.file_operations.path_utils import find_dirs_with_files, find_files
17
+ from climarraykit.file_utils import ncfile_integrity_status
18
+ from climarraykit.patterns import (
19
+ find_coordinate_variables,
20
+ get_latlon_bounds,
21
+ get_latlon_deltas,
22
+ get_times
23
+ )
24
+ from paramlib.global_parameters import CLIMATE_FILE_EXTENSIONS
25
+ from pygenutils.strings.text_formatters import format_string, string_underliner
26
+ from pygenutils.time_handling.date_and_time_utils import find_dt_key
27
+
28
+ #-------------------------#
29
+ # Define custom functions #
30
+ #-------------------------#
31
+
32
+ # Data extractors #
33
+ #-----------------#
34
+
35
def extract_latlon_bounds(delta_decimal_places: int, value_decimal_places: int) -> None:
    """
    Extract latitude and longitude bounds from netCDF files.

    Scans every directory (under the invocation directory) that contains
    netCDF files and writes a per-directory report listing each file's
    latitude/longitude arrays, their dimensions and grid deltas.

    Parameters
    ----------
    delta_decimal_places : int
        Number of decimal places to round off the delta between latitude and longitude points.
    value_decimal_places : int
        Number of decimal places to round off the latitude and longitude values.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If decimal places parameters are not integers.
    ValueError
        If decimal places parameters are negative.

    Notes
    -----
    - The extracted latitude and longitude arrays, their dimensions,
      and deltas are saved in a report file.
    - If any files are faulty or cannot be processed, relevant error information
      is recorded in the report.
    """
    # Parameter validation
    if not isinstance(delta_decimal_places, int):
        raise TypeError("delta_decimal_places must be an integer")

    if not isinstance(value_decimal_places, int):
        raise TypeError("value_decimal_places must be an integer")

    if delta_decimal_places < 0:
        raise ValueError("delta_decimal_places must be non-negative")

    if value_decimal_places < 0:
        raise ValueError("value_decimal_places must be non-negative")

    # Directories below the caller's working directory holding netCDF files
    nc_dirs = find_dirs_with_files(EXTENSIONS[0], search_path=CODE_CALL_DIR)

    for dir_num, dir_name in enumerate(nc_dirs, start=1):
        nc_files = find_files(EXTENSIONS[0], dir_name, match_type="ext", top_path_only=True)

        # The report is created in the current directory and moved into the
        # scanned directory afterwards.
        with open(COORD_INFO_FNAME, "w") as report:
            if nc_files:
                for file_num, nc_file in enumerate(nc_files, start=1):
                    print(f"Processing file {file_num} out of {len(nc_files)} "
                          f"in directory {dir_num} out of {len(nc_dirs)}...")
                    # Directory header written once per file (intentional or
                    # not — preserved as-is; confirm with the report consumers)
                    report.write(format_string(string_underliner(DIR_INFO_TEMPLATE, dir_name), "+"))

                    # Integrity check first: corrupt files are logged and skipped
                    try:
                        ncfile_integrity_status(nc_file)
                    except Exception as ncf_err:
                        report.write(f"FAULTY FILE '{nc_file}': {ncf_err}\n")
                    else:
                        # Locate the lat/lon coordinate variable names
                        try:
                            coord_vars = find_coordinate_variables(nc_file)
                        except Exception as coord_err:
                            report.write(f"ERROR IN FILE '{nc_file}': {coord_err}\n")
                        else:
                            # Extract bounds, deltas and array sizes, then
                            # render them into the report template
                            try:
                                lats, lons = get_latlon_bounds(nc_file, coord_vars[0], coord_vars[1], value_decimal_places)
                                lat_delta, lon_delta = get_latlon_deltas(lats, lons, delta_decimal_places)

                                format_args_latlon_bounds = (
                                    nc_file,
                                    lats,
                                    lons,
                                    len(lats),
                                    len(lons),
                                    lat_delta,
                                    lon_delta
                                )

                                report.write(format_string(LATLON_INFO_TEMPLATE, format_args_latlon_bounds))
                            except Exception as e:
                                report.write(f"ERROR PROCESSING COORDINATES IN FILE '{nc_file}': {e}\n")

                # NOTE(review): the report is moved while the handle is still
                # open — works on POSIX, may fail on Windows; confirm.
                move_files(COORD_INFO_FNAME,
                           input_directories=".",
                           destination_directories=dir_name,
                           match_type="glob")
            else:
                report.write(f"No netCDF files in directory {dir_name}\n")
                move_files(COORD_INFO_FNAME,
                           input_directories=".",
                           destination_directories=dir_name,
                           match_type="glob")
127
+
128
+
129
def extract_time_bounds() -> None:
    """
    Extract the time bounds (start and end times) from netCDF files.

    Walks every directory below the invocation directory that contains
    netCDF files and writes, per directory, a report with each file's
    first/last timestamps and the total number of time records.

    Returns
    -------
    None

    Notes
    -----
    - The time range (start and end times) and the total number of time records
      are saved in a report file.
    - If any files are faulty or cannot be processed, relevant error information
      is recorded in the report.
    """
    nc_dirs = find_dirs_with_files(EXTENSIONS[0], search_path=CODE_CALL_DIR)

    for dir_idx, directory in enumerate(nc_dirs, start=1):
        nc_files = find_files(EXTENSIONS[0], directory, match_type="ext", top_path_only=True)

        with open(DATE_RANGE_INFO_FNAME, "w") as report:
            # Empty directory: record the fact and ship the report out.
            if not nc_files:
                report.write(f"No netCDF files in directory {directory}\n")
                move_files(DATE_RANGE_INFO_FNAME,
                           input_directories=".",
                           destination_directories=directory,
                           match_type="glob")
                continue

            for file_idx, nc_file in enumerate(nc_files, start=1):
                print(f"Processing file {file_idx} out of {len(nc_files)} "
                      f"in directory {dir_idx} out of {len(nc_dirs)}...")
                report.write(format_string(string_underliner(DIR_INFO_TEMPLATE, directory), "+"))

                # Guard 1: skip files that fail the integrity check.
                try:
                    ncfile_integrity_status(nc_file)
                except Exception as ncf_err:
                    report.write(f"FAULTY FILE '{nc_file}': {ncf_err}\n")
                    continue

                # Guard 2: locate the datetime coordinate key.
                try:
                    time_var = find_dt_key(nc_file)
                except Exception as time_err:
                    report.write(f"ERROR IN FILE '{nc_file}': {time_err}\n")
                    continue

                # Read the time array and report its bounds and length.
                try:
                    times = get_times(nc_file, time_var)
                    report.write(format_string(
                        PERIOD_INFO_TEMPLATE,
                        (nc_file, times[0].values, times[-1].values, len(times))
                    ))
                except Exception as e:
                    report.write(f"ERROR PROCESSING TIME DATA IN FILE '{nc_file}': {e}\n")

            # Relocate the finished report into the scanned directory.
            move_files(DATE_RANGE_INFO_FNAME,
                       input_directories=".",
                       destination_directories=directory,
                       match_type="glob")
193
+
194
+
195
def extract_time_formats() -> None:
    """
    Extract the time formats from netCDF files.

    Scans every directory (under the invocation directory) that contains
    netCDF files and writes a per-directory report with each file's raw
    time array and its length.

    Parameters
    ----------
    None

    Returns
    -------
    None

    Notes
    -----
    - The extracted time formats and the total number of time records are saved
      in a report file.
    - If any files are faulty or cannot be processed, relevant error information
      is recorded in the report.
    """

    # Directories below the caller's working directory holding netCDF files
    nc_dirs = find_dirs_with_files(EXTENSIONS[0], search_path=CODE_CALL_DIR)

    for dir_num, dir_name in enumerate(nc_dirs, start=1):
        nc_files = find_files(EXTENSIONS[0], dir_name, match_type="ext", top_path_only=True)

        # Report written locally, then moved into the scanned directory
        with open(TIME_FORMATS_FILE_NAME, "w") as report:
            if nc_files:
                for file_num, nc_file in enumerate(nc_files, start=1):
                    print(f"Processing file {file_num} out of {len(nc_files)} "
                          f"in directory {dir_num} out of {len(nc_dirs)}...")
                    report.write(format_string(string_underliner(DIR_INFO_TEMPLATE, dir_name), "+"))

                    # Integrity check first: corrupt files are logged and skipped
                    try:
                        ncfile_integrity_status(nc_file)
                    except Exception as ncf_err:
                        report.write(f"FAULTY FILE '{nc_file}': {ncf_err}\n")
                    else:
                        # Locate the datetime coordinate key
                        try:
                            time_var = find_dt_key(nc_file)
                        except Exception as time_err:
                            report.write(f"ERROR IN FILE '{nc_file}': {time_err}\n")
                        else:
                            # Dump the whole time array plus its length
                            try:
                                times = get_times(nc_file, time_var)
                                format_args_time_formats = (
                                    nc_file,
                                    times.values,
                                    len(times)
                                )
                                report.write(format_string(TIME_FORMAT_INFO_TEMPLATE, format_args_time_formats))
                            except Exception as e:
                                report.write(f"ERROR PROCESSING TIME FORMATS IN FILE '{nc_file}': {e}\n")

                # NOTE(review): the report is moved while the handle is still
                # open — works on POSIX, may fail on Windows; confirm.
                move_files(TIME_FORMATS_FILE_NAME,
                           input_directories=".",
                           destination_directories=dir_name,
                           match_type="glob")
            else:
                report.write(f"No netCDF files in directory {dir_name}\n")
                move_files(TIME_FORMATS_FILE_NAME,
                           input_directories=".",
                           destination_directories=dir_name,
                           match_type="glob")
258
+
259
+ # File regridding #
260
+ #-----------------#
261
+
262
def netcdf_regridder(ds_in: 'xr.Dataset', ds_image: 'xr.Dataset', regrid_method: str = "bilinear") -> 'xr.Dataset':
    """
    Regrid an xarray Dataset onto the grid of another Dataset.

    Similar in spirit to CDO remapping, but more intuitive and easier to
    drive programmatically, supported by Python (via xesmf).

    Parameters
    ----------
    ds_in : xarray.Dataset
        Input xarray data set.
    ds_image : xarray.Dataset
        Xarray data set with grid specifications to which apply on ds_in.
    regrid_method : {'bilinear', 'conservative', 'conservative_normed', 'nearest_s2d', 'nearest_d2s', 'patch'}
        Regridding method. Defaults 'bilinear'.

    Returns
    -------
    ds_out : xarray.Dataset
        Output data set regridded according to the grid specs of ds_image.
        (DOCFIX: the output follows the *target* grid, not ds_in's.)

    Raises
    ------
    TypeError
        If input datasets are not xarray.Dataset objects.
    ValueError
        If regrid_method is not valid.
    ImportError
        If the xarray or xesmf packages are not available.
    RuntimeError
        If the regridding operation fails.
    """
    # Validate the cheap string argument before importing heavy dependencies
    if not isinstance(regrid_method, str):
        raise TypeError("regrid_method must be a string")

    if regrid_method not in REGRID_METHOD_LIST:
        raise ValueError("Invalid regridding method.\n"
                         f"Choose one from {REGRID_METHOD_LIST}.")

    # Deferred import: the module stays importable without the optional
    # regridding stack installed.
    try:
        import xarray as xr
    except ImportError as err:
        # FIX: chain the original exception so the real import failure
        # (e.g. a broken binary dependency) is not masked.
        raise ImportError("xarray package is required but not available") from err

    if not isinstance(ds_in, xr.Dataset):
        raise TypeError("ds_in must be an xarray.Dataset")

    if not isinstance(ds_image, xr.Dataset):
        raise TypeError("ds_image must be an xarray.Dataset")

    try:
        import xesmf as xe
    except ImportError as err:
        raise ImportError("xesmf package is required for regridding but not available") from err

    try:
        regridder = xe.Regridder(ds_in, ds_image, regrid_method)
        ds_out = regridder(ds_in)
        return ds_out
    except Exception as e:
        # Chain the cause so xesmf's diagnostic is preserved
        raise RuntimeError(f"Regridding operation failed: {e}") from e
322
+
323
#--------------------------#
# Parameters and constants #
#--------------------------#

# Directory from where this code is being called #
# Captured once at import time; the extractors scan below this directory.
CODE_CALL_DIR = os.getcwd()

# File extensions #
# Every third entry of the shared climate-extension list; index 0 is used
# as the netCDF extension throughout this module — TODO confirm against
# paramlib.global_parameters.CLIMATE_FILE_EXTENSIONS ordering.
EXTENSIONS = CLIMATE_FILE_EXTENSIONS[::3]

# Main file names #
COORD_INFO_FNAME = "latlon_bounds.txt"       # lat/lon bounds report
DATE_RANGE_INFO_FNAME = "period_bounds.txt"  # time-range report
TIME_FORMATS_FILE_NAME = "time_formats.txt"  # raw time-array report

# Regridding method options #
# Accepted values for netcdf_regridder's regrid_method argument.
REGRID_METHOD_LIST = [
    "bilinear",
    "conservative",
    "conservative_normed",
    "nearest_s2d",
    "nearest_d2s",
    "patch"
]

# Template strings #
#------------------#

# Main parameter scanning info strings #
# Filled positionally via format_string: file, lats, lons, n_lat, n_lon,
# lat_delta, lon_delta.
LATLON_INFO_TEMPLATE = \
"""=========================================================
·File: {}

·Latitudes:
{}

·Longitudes:
{}

-Latitude-longitude array dimensions = {} x {}
-Latitude-longitude array delta = ({}, {})

"""

# Filled positionally: file, first time, last time, record count.
PERIOD_INFO_TEMPLATE = \
"""=========================================================
·File: {}
·Time range: {} -- {}
-Range length = {}

"""

# Filled positionally: file, full time array, record count.
TIME_FORMAT_INFO_TEMPLATE = \
"""=========================================================
·File: {}

·Time array:
{}

-Array length = {}
"""

# File scanning progress information strings #
# Per-directory header written into every report.
DIR_INFO_TEMPLATE = """\nDirectory: {}"""