climarraykit 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,616 @@
1
+ #! /usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ #----------------#
5
+ # Import modules #
6
+ #----------------#
7
+
8
+ import numpy as np
9
+ import xarray as xr
10
+ from pathlib import Path
11
+
12
+ #------------------------#
13
+ # Import project modules #
14
+ #------------------------#
15
+
16
+ from climarraykit.file_utils import ncfile_integrity_status
17
+ from paramlib.global_parameters import COMMON_DELIMITER_LIST
18
+ from pygenutils.arrays_and_lists.data_manipulation import flatten_list
19
+
20
+ #-------------------------#
21
+ # Define custom functions #
22
+ #-------------------------#
23
+
24
+ # Dimension handlers #
25
+ #--------------------#
26
+
27
def rename_xarray_dimension(
    obj: xr.Dataset | xr.DataArray,
    old_dim: str,
    new_dim: str,
) -> xr.Dataset | xr.DataArray:
    """
    Rename a dimension in an xarray :class:`~xarray.Dataset` or :class:`~xarray.DataArray`.

    Tries ``rename_dims`` followed by ``rename`` on the same object, then falls back to
    a single ``swap_dims`` call when those steps fail (e.g. awkward coordinate setups).

    Parameters
    ----------
    obj : xarray.Dataset | xarray.DataArray
        Object whose dimension should be renamed.
    old_dim : str
        Current dimension name.
    new_dim : str
        Target dimension name.

    Returns
    -------
    xarray.Dataset | xarray.DataArray
        Object after renaming attempts (may be unchanged if all strategies fail).
    """
    try:
        # rename_dims changes the dimension name; rename then updates the
        # coordinate/variable that may still carry the old name.
        obj = obj.rename_dims({old_dim: new_dim})
        obj = obj.rename({old_dim: new_dim})
    except Exception:
        try:
            # BUGFIX: the original called swap_dims({old_dim: new_dim}) twice;
            # the second call always raised (old_dim is no longer a dimension
            # after the first swap) and the error was swallowed, so a single
            # call is behaviourally equivalent and removes the dead raise.
            obj = obj.swap_dims({old_dim: new_dim})
        except Exception:
            # Best-effort helper: leave obj unchanged if every strategy fails.
            pass
    return obj
62
+
63
+
64
+ # Main functions #
65
+ #-#-#-#-#-#-#-#-#-
66
+
67
def get_file_dimensions(nc_file: str | xr.Dataset | Path) -> list[str] | str:
    """
    Return the dimension names of a netCDF file or xarray.Dataset.

    Some datasets expose dimensions that are not backed by a coordinate
    variable; only dimensions that also appear among the variables are kept.

    Parameters
    ----------
    nc_file : str | Path | xarray.Dataset
        Path to a netCDF file or an already opened xarray.Dataset.

    Returns
    -------
    str | list[str]
        The dimension names, collapsed to a single string when exactly one
        dimension is found.

    Raises
    ------
    TypeError
        If the input is not a string, Path, or xarray.Dataset object.
    ValueError
        If the file path is empty or invalid.
    FileNotFoundError
        If the file doesn't exist.
    """
    must_close = False
    if isinstance(nc_file, (str, Path)):
        if not str(nc_file).strip():
            raise ValueError("File path cannot be empty")

        nc_path = Path(nc_file)
        if not nc_path.exists():
            raise FileNotFoundError(f"NetCDF file not found: {nc_path}")

        # Verify file integrity before opening it ourselves.
        ncfile_integrity_status(nc_file)
        ds = xr.open_dataset(nc_file)
        must_close = True
    elif isinstance(nc_file, xr.Dataset):
        ds = nc_file
    else:
        raise TypeError("Unsupported data file type. Expected str, Path, or xarray.Dataset.")

    try:
        variable_names = set(ds.variables)
        # Keep only dimensions that are also present among the variables.
        coord_dims = [dim for dim in ds.dims if dim in variable_names]
        return coord_dims[0] if len(coord_dims) == 1 else coord_dims
    finally:
        # Only close datasets we opened here; caller-owned objects stay open.
        if must_close:
            ds.close()
120
+
121
+
122
def get_file_variables(nc_file: str | xr.Dataset | Path) -> list[str] | str:
    """
    Return the variable names of a netCDF file or xarray.Dataset.

    Dimension-backed coordinate variables are excluded, since dimensions may
    also appear in the variable list.

    Parameters
    ----------
    nc_file : str | xarray.Dataset | Path
        Path to a netCDF file or an already opened xarray.Dataset.

    Returns
    -------
    str | list[str]
        The variable names, collapsed to a single string when exactly one
        variable is found.

    Raises
    ------
    TypeError
        If the input is not a string, Path, or xarray.Dataset object.
    ValueError
        If the file path is empty or invalid.
    FileNotFoundError
        If the file doesn't exist.
    """
    must_close = False
    if isinstance(nc_file, (str, Path)):
        if not str(nc_file).strip():
            raise ValueError("File path cannot be empty")

        nc_path = Path(nc_file)
        if not nc_path.exists():
            raise FileNotFoundError(f"NetCDF file not found: {nc_path}")

        # Verify file integrity before opening it ourselves.
        ncfile_integrity_status(nc_file)
        ds = xr.open_dataset(nc_file)
        must_close = True
    elif isinstance(nc_file, xr.Dataset):
        ds = nc_file
    else:
        raise TypeError("Unsupported data file type. Expected str, Path, or xarray.Dataset.")

    try:
        dim_names = set(ds.dims)
        # Drop dimension names from the variable list.
        data_vars = [var for var in ds.variables if var not in dim_names]
        return data_vars[0] if len(data_vars) == 1 else data_vars
    finally:
        # Only close datasets we opened here; caller-owned objects stay open.
        if must_close:
            ds.close()
174
+
175
+
176
def get_model_list(path_list: list[str], split_pos: int, SPLIT_DELIM: str = "_") -> list[str]:
    """
    Extract unique model names from a list of file paths or file names.

    Each file name (the part after the last '/') is split by ``SPLIT_DELIM``
    and the token at position ``split_pos`` is taken as the model name.

    Parameters
    ----------
    path_list : list[str]
        File paths (absolute or relative) or bare file names.
    split_pos : int
        Index of the token that holds the model name after splitting.
    SPLIT_DELIM : str, optional
        Delimiter used to split the file name. Default is "_".

    Returns
    -------
    list[str]
        Unique model names (sorted, as produced by ``numpy.unique``).

    Raises
    ------
    TypeError
        If path_list is not a list or contains non-string elements.
    ValueError
        If path_list is empty, split_pos is negative, or SPLIT_DELIM is empty.
    IndexError
        If split_pos is beyond the available splits for any file.
    """
    if not isinstance(path_list, list):
        raise TypeError("path_list must be a list")

    # Defensive flattening: tolerate accidentally nested input lists.
    flat_paths = flatten_list(path_list)

    if not flat_paths:
        raise ValueError("path_list cannot be empty")

    if any(not isinstance(entry, str) for entry in flat_paths):
        raise TypeError("All items in path_list must be strings")

    if any(not entry.strip() for entry in flat_paths):
        raise ValueError("All file paths must be non-empty strings")

    if not isinstance(split_pos, int) or split_pos < 0:
        raise ValueError("split_pos must be a non-negative integer")

    if not isinstance(SPLIT_DELIM, str) or not SPLIT_DELIM:
        raise ValueError("SPLIT_DELIM must be a non-empty string")

    # Strip any directory component, then pick the model token per file.
    model_names = []
    for file_name in (entry.split("/")[-1] for entry in flat_paths):
        tokens = file_name.split(SPLIT_DELIM)
        if split_pos >= len(tokens):
            raise IndexError(f"split_pos {split_pos} is beyond available splits for file {file_name}")
        model_names.append(tokens[split_pos])

    # numpy.unique de-duplicates and sorts in one pass.
    return np.unique(model_names).tolist()
242
+
243
+
244
def get_latlon_bounds(nc_file: str | xr.Dataset | Path,
                      lat_dimension_name: str,
                      lon_dimension_name: str,
                      decimal_places: int = 3) -> tuple[np.ndarray, np.ndarray]:
    """
    Fetch latitude and longitude values from a netCDF source, rounded to a
    given decimal precision.

    Parameters
    ----------
    nc_file : str | xarray.Dataset | Path
        Path to the netCDF file or an already opened xarray.Dataset object.
    lat_dimension_name : str
        Name of the latitude dimension in the dataset.
    lon_dimension_name : str
        Name of the longitude dimension in the dataset.
    decimal_places : int, optional
        Number of decimal places to round to. Default is 3.

    Returns
    -------
    tuple of numpy.ndarray
        Rounded latitude and longitude arrays.

    Raises
    ------
    TypeError
        If nc_file is not str, Path, or xarray.Dataset, or if dimension names are not strings.
    ValueError
        If file path, dimension names are empty, or decimal_places is negative.
    FileNotFoundError
        If the file doesn't exist.
    KeyError
        If the specified dimensions don't exist in the dataset.
    """
    # Validate inputs up front, in a fixed order.
    if not isinstance(nc_file, (str, xr.Dataset, Path)):
        raise TypeError("nc_file must be a string, Path, or xarray.Dataset object")

    if not isinstance(lat_dimension_name, str) or not lat_dimension_name.strip():
        raise ValueError("lat_dimension_name must be a non-empty string")

    if not isinstance(lon_dimension_name, str) or not lon_dimension_name.strip():
        raise ValueError("lon_dimension_name must be a non-empty string")

    if not isinstance(decimal_places, int) or decimal_places < 0:
        raise ValueError("decimal_places must be a non-negative integer")

    opened_here = isinstance(nc_file, (str, Path))
    if opened_here:
        if not str(nc_file).strip():
            raise ValueError("File path cannot be empty")

        nc_path = Path(nc_file)
        if not nc_path.exists():
            raise FileNotFoundError(f"NetCDF file not found: {nc_path}")

        # Verify file integrity before opening it ourselves.
        ncfile_integrity_status(nc_file)
        ds = xr.open_dataset(nc_file)
    else:
        ds = nc_file

    try:
        # Both requested names must be present among the dataset's variables.
        for dim_name, label in ((lat_dimension_name, "Latitude"),
                                (lon_dimension_name, "Longitude")):
            if dim_name not in ds.variables:
                raise KeyError(f"{label} dimension '{dim_name}' not found in dataset")

        lat_values = np.round(ds[lat_dimension_name].values, decimal_places)
        lon_values = np.round(ds[lon_dimension_name].values, decimal_places)
        return lat_values, lon_values
    finally:
        # Only close datasets we opened here; caller-owned objects stay open.
        if opened_here:
            ds.close()
324
+
325
+
326
def get_latlon_deltas(lat_values: np.ndarray,
                      lon_values: np.ndarray,
                      decimal_places: int = 3) -> tuple[str, str]:
    """
    Compute the grid spacing implied by the first two latitude and longitude
    values, returned as fixed-precision strings.

    Parameters
    ----------
    lat_values : numpy.ndarray
        Array of latitude values.
    lon_values : numpy.ndarray
        Array of longitude values.
    decimal_places : int, optional
        Number of decimal places in the formatted deltas. Default is 3.

    Returns
    -------
    tuple of str
        Absolute latitude and longitude deltas, formatted with exactly
        ``decimal_places`` digits after the decimal point.

    Raises
    ------
    TypeError
        If lat_values or lon_values are not numpy arrays.
    ValueError
        If arrays are empty, have less than 2 elements, or decimal_places is negative.
    """
    # Type checks first (both arrays), then size checks, then precision.
    for label, arr in (("lat_values", lat_values), ("lon_values", lon_values)):
        if not isinstance(arr, np.ndarray):
            raise TypeError(f"{label} must be a numpy array")

    for label, arr in (("lat_values", lat_values), ("lon_values", lon_values)):
        if arr.size < 2:
            raise ValueError(f"{label} must contain at least 2 elements")

    if not isinstance(decimal_places, int) or decimal_places < 0:
        raise ValueError("decimal_places must be a non-negative integer")

    # Delta = |second - first|, formatted to the requested precision.
    lat_delta, lon_delta = (
        f"{abs(arr[1] - arr[0]):.{decimal_places}f}"
        for arr in (lat_values, lon_values)
    )
    return lat_delta, lon_delta
373
+
374
+
375
def get_times(nc_file: str | xr.Dataset | Path, time_dimension_name: str) -> xr.DataArray:
    """
    Return the time coordinate of a netCDF file or xarray.Dataset.

    Parameters
    ----------
    nc_file : str | xarray.Dataset | Path
        Path to the netCDF file or an already opened xarray.Dataset object.
    time_dimension_name : str
        Name of the time dimension in the dataset.

    Returns
    -------
    xarray.DataArray
        The time values.

    Raises
    ------
    TypeError
        If nc_file is not str, Path, or xarray.Dataset, or time_dimension_name is not string.
    ValueError
        If file path or time_dimension_name is empty.
    FileNotFoundError
        If the file doesn't exist.
    KeyError
        If the specified time dimension doesn't exist in the dataset.
    """
    if not isinstance(nc_file, (str, xr.Dataset, Path)):
        raise TypeError("nc_file must be a string, Path, or xarray.Dataset object")

    if not isinstance(time_dimension_name, str) or not time_dimension_name.strip():
        raise ValueError("time_dimension_name must be a non-empty string")

    opened_here = isinstance(nc_file, (str, Path))
    if opened_here:
        if not str(nc_file).strip():
            raise ValueError("File path cannot be empty")

        nc_path = Path(nc_file)
        if not nc_path.exists():
            raise FileNotFoundError(f"NetCDF file not found: {nc_path}")

        # Verify file integrity before opening it ourselves.
        ncfile_integrity_status(nc_file)
        ds = xr.open_dataset(nc_file)
    else:
        ds = nc_file

    try:
        if time_dimension_name not in ds.variables:
            raise KeyError(f"Time dimension '{time_dimension_name}' not found in dataset")
        return ds[time_dimension_name]
    finally:
        # Only close datasets we opened here; caller-owned objects stay open.
        # NOTE(review): when opened_here is True the returned DataArray belongs
        # to a closed dataset — presumably values were loaded eagerly; confirm.
        if opened_here:
            ds.close()
437
+
438
+
439
+ # Particular functions #
440
+ #-#-#-#-#-#-#-#-#-#-#-#-
441
+
442
def find_coordinate_variables(nc_file: str | xr.Dataset | Path) -> list[str]:
    """
    Search for coordinate dimensions or variables
    ('latitude', 'longitude', 'x', 'y') in an xarray Dataset.

    The coordinates should ideally be located among dimensions, but they might
    also appear among variables, so both 'get_file_dimensions' and
    'get_file_variables' are consulted.

    Parameters
    ----------
    nc_file : str | xarray.Dataset | Path
        String of the data file path, Path object, or the dataset itself.

    Returns
    -------
    list[str]
        Unique coordinate dimension/variable names, in first-seen order
        (dimensions before variables).

    Raises
    ------
    TypeError
        If nc_file is not str, Path, or xarray.Dataset.
    ValueError
        If no coordinate dimensions or variables are found, or file path is empty.
    FileNotFoundError
        If the file doesn't exist.
    """
    # Parameter validation
    if not isinstance(nc_file, (str, xr.Dataset, Path)):
        raise TypeError("nc_file must be a string, Path, or xarray.Dataset object")

    if isinstance(nc_file, (str, Path)) and not str(nc_file).strip():
        raise ValueError("File path cannot be empty")

    # Retrieve the dimension and variable lists
    dims = get_file_dimensions(nc_file)
    vars_ = get_file_variables(nc_file)

    # Both helpers collapse single-element results to a bare string.
    if isinstance(dims, str):
        dims = [dims]
    if isinstance(vars_, str):
        vars_ = [vars_]

    # Search for coordinate-related elements in dimensions and variables.
    # NOTE(review): prefix matching on 'y'/'x' is broad — e.g. a 'year'
    # dimension would also match 'y'; confirm this is intended.
    coord_keys = [key for key in dims + vars_
                  if key.lower().startswith(('lat', 'y', 'lon', 'x'))]

    if not coord_keys:
        raise ValueError("No 'latitude' or 'longitude' coordinates found "
                         f"in file '{nc_file}'.")

    # BUGFIX: list(set(...)) returned the unique keys in arbitrary hash order,
    # making the result nondeterministic across runs — callers such as
    # find_nearest_coordinates index [0]/[1] of this list. dict.fromkeys
    # de-duplicates while preserving first-seen (insertion) order.
    return list(dict.fromkeys(coord_keys))
497
+
498
+
499
def find_nearest_coordinates(nc_file: str | xr.Dataset | Path,
                             lats_obs: list[float] | np.ndarray,
                             lons_obs: list[float] | np.ndarray,
                             decimal_places: int = 3) -> tuple[np.ndarray, np.ndarray]:
    """
    Match observed latitude/longitude values against the coordinate grid of a
    netCDF file or xarray.Dataset, returning the nearest grid coordinates.

    Parameters
    ----------
    nc_file : str | xarray.Dataset | Path
        Path to the netCDF file, Path object, or an already opened
        xarray.Dataset containing latitude and longitude coordinates.
    lats_obs : list[float] | numpy.ndarray
        Observed latitude values to compare.
    lons_obs : list[float] | numpy.ndarray
        Observed longitude values to compare.
    decimal_places : int, optional
        Number of decimal places to round the results to. Default is 3.

    Returns
    -------
    tuple of numpy.ndarray
        Nearest latitude and longitude grid values for each observation,
        rounded to ``decimal_places``.

    Raises
    ------
    TypeError
        If parameters have incorrect types.
    ValueError
        If coordinate arrays are empty or have mismatched lengths, or if no coordinate variables are found.
    FileNotFoundError
        If the file doesn't exist.
    """
    # Validate inputs up front, in a fixed order.
    if not isinstance(nc_file, (str, xr.Dataset, Path)):
        raise TypeError("nc_file must be a string, Path, or xarray.Dataset object")

    if not isinstance(lats_obs, (list, np.ndarray)):
        raise TypeError("lats_obs must be a list or numpy array")

    if not isinstance(lons_obs, (list, np.ndarray)):
        raise TypeError("lons_obs must be a list or numpy array")

    if not isinstance(decimal_places, int) or decimal_places < 0:
        raise ValueError("decimal_places must be a non-negative integer")

    # Normalise observations to float64 arrays before size checks.
    lats_obs = np.array(lats_obs, dtype='d')
    lons_obs = np.array(lons_obs, dtype='d')

    if lats_obs.size == 0:
        raise ValueError("lats_obs cannot be empty")

    if lons_obs.size == 0:
        raise ValueError("lons_obs cannot be empty")

    if lats_obs.size != lons_obs.size:
        raise ValueError("lats_obs and lons_obs must have the same length")

    # Names of the latitude/longitude coordinate variables in the dataset.
    coord_varlist = find_coordinate_variables(nc_file)

    # Accept both file paths and already opened xarray.Dataset objects.
    if isinstance(nc_file, (str, Path)):
        if not str(nc_file).strip():
            raise ValueError("File path cannot be empty")

        nc_path = Path(nc_file)
        if not nc_path.exists():
            raise FileNotFoundError(f"NetCDF file not found: {nc_path}")

        ncfile_integrity_status(nc_file)
        ds = xr.open_dataset(nc_file)
        close_ds = True
    elif isinstance(nc_file, xr.Dataset):
        ds = nc_file
        close_ds = False
    else:
        raise TypeError("Input must be a file path (str/Path) or an xarray.Dataset object.")

    try:
        # NOTE(review): assumes coord_varlist[0] is latitude and [1] is
        # longitude; this ordering comes from find_coordinate_variables —
        # confirm it is stable for the datasets in use.
        lat_grid = np.array(ds[coord_varlist[0]], dtype='d')
        lon_grid = np.array(ds[coord_varlist[1]], dtype='d')

        # For each observation, pick the grid value minimising |obs - grid|.
        matched_lats = [lat_grid[np.abs(lat_grid - lat).argmin()] for lat in lats_obs]
        matched_lons = [lon_grid[np.abs(lon_grid - lon).argmin()] for lon in lons_obs]

        return (np.round(matched_lats, decimal_places),
                np.round(matched_lons, decimal_places))
    finally:
        # Only close datasets we opened here; caller-owned objects stay open.
        if close_ds:
            ds.close()
609
+
610
+
611
#--------------------------#
# Parameters and constants #
#--------------------------#

# String splitting character #
# Module-level default delimiter, taken from paramlib's shared list.
# NOTE(review): presumably "_" (matching get_model_list's default) — confirm
# against paramlib.global_parameters.COMMON_DELIMITER_LIST.
SPLIT_DELIM = COMMON_DELIMITER_LIST[0]