climarraykit 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,309 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Created on Wed Oct 16 13:54:11 2024
5
+
6
+ @author: jonander
7
+ """
8
+
9
+ #! /usr/bin/env python3
10
+ # -*- coding: utf-8 -*-
11
+
12
+ #----------------#
13
+ # Import modules #
14
+ #----------------#
15
+
16
+ import numpy as np
17
+ import xarray as xr
18
+ import os
19
+ from pathlib import Path
20
+
21
+ #------------------------#
22
+ # Import project modules #
23
+ #------------------------#
24
+
25
+ from filewise.file_operations.path_utils import find_files
26
+ from pygenutils.arrays_and_lists.data_manipulation import flatten_list
27
+ from pygenutils.strings.string_handler import get_obj_specs
28
+ from pygenutils.strings.text_formatters import (
29
+ format_string,
30
+ print_format_string,
31
+ string_underliner
32
+ )
33
+
34
+ #-------------------------#
35
+ # Define custom functions #
36
+ #-------------------------#
37
+
38
+ # Internal #
39
+ #----------#
40
+
41
+ def _unique_sorted(items: list) -> list:
42
+ """
43
+ Returns a sorted list of unique items.
44
+
45
+ Parameters
46
+ ----------
47
+ items : list
48
+ List of items to deduplicate and sort.
49
+
50
+ Returns
51
+ -------
52
+ list
53
+ Sorted list of unique items.
54
+
55
+ Raises
56
+ ------
57
+ TypeError
58
+ If items is not a list.
59
+ """
60
+ if not isinstance(items, list):
61
+ raise TypeError("Input must be a list")
62
+
63
+ return sorted(set(items))
64
+
65
+ # Public #
66
+ #--------#
67
+
68
+ # netCDF file searching #
69
+ #~~~~~~~~~~~~~~~~~~~~~~~#
70
+
71
+ # Main function #
72
+ #-#-#-#-#-#-#-#-#
73
+
74
+ def scan_ncfiles(search_path: str | list[str] | Path) -> dict[str, int | list[str]]:
75
+ """
76
+ Scans directories for netCDF (.nc) files, checks file integrity,
77
+ and generates a report for faulty files. Returns comprehensive information
78
+ about netCDF files and their status.
79
+
80
+ Parameters
81
+ ----------
82
+ search_path : str | list[str] | Path
83
+ The directory or list of directories to scan for .nc files.
84
+
85
+ Returns
86
+ -------
87
+ dict
88
+ A dictionary containing:
89
+ - 'total_dirs': Number of directories containing faulty files
90
+ - 'total_files': Total number of netCDF files scanned
91
+ - 'faulty_files': List of faulty netCDF file paths
92
+ - 'faulty_count': Total number of faulty netCDF files
93
+ - 'faulty_by_dir': Dictionary mapping directories to their faulty files
94
+
95
+ Raises
96
+ ------
97
+ TypeError
98
+ If search_path is not str, list, or Path.
99
+ ValueError
100
+ If search_path is empty or contains invalid paths.
101
+ FileNotFoundError
102
+ If any specified search path doesn't exist.
103
+
104
+ Example
105
+ -------
106
+ # Example: Scan and check file integrity, generate a report for faulty files
107
+ result = scan_ncfiles("/path/to/scan")
108
+ print(f"Faulty files: {result['faulty_files']}, Count: {result['faulty_count']}")
109
+ """
110
+ # Parameter validation
111
+ if not isinstance(search_path, (str, list, Path)):
112
+ raise TypeError("search_path must be a string, list of strings, or Path object")
113
+
114
+ # Convert to list and flatten if necessary
115
+ if isinstance(search_path, (str, Path)):
116
+ search_paths = [str(search_path)]
117
+ else:
118
+ search_paths = flatten_list([str(p) for p in search_path])
119
+
120
+ # Validate paths
121
+ if not search_paths:
122
+ raise ValueError("search_path cannot be empty")
123
+
124
+ for path in search_paths:
125
+ if not isinstance(path, str) or not path.strip():
126
+ raise ValueError("All search paths must be non-empty strings")
127
+
128
+ if not Path(path).exists():
129
+ raise FileNotFoundError(f"Search path does not exist: {path}")
130
+
131
+ if not Path(path).is_dir():
132
+ raise ValueError(f"Search path must be a directory: {path}")
133
+
134
+ # Import here to avoid circular imports
135
+ from paramlib.global_parameters import CLIMATE_FILE_EXTENSIONS
136
+
137
+ # Step 1: Search for all netCDF files #
138
+ #######################################
139
+ all_files = []
140
+ for path in search_paths:
141
+ files_in_path = find_files(CLIMATE_FILE_EXTENSIONS[0], path)
142
+ all_files.extend(files_in_path)
143
+
144
+ if not all_files:
145
+ print("No netCDF files found in the specified directories")
146
+ return {
147
+ 'total_dirs': 0,
148
+ 'total_files': 0,
149
+ 'faulty_files': [],
150
+ 'faulty_count': 0,
151
+ 'faulty_by_dir': {}
152
+ }
153
+
154
+ # Step 2: Check each file's integrity and collect faulty files #
155
+ #################################################################
156
+ file_vs_err_list = []
157
+ for idx, file in enumerate(all_files, start=1):
158
+ format_args_scan_progress = (idx, len(all_files), file)
159
+ print_format_string(SCAN_PROGRESS_TEMPLATE, format_args_scan_progress)
160
+ try:
161
+ ncfile_integrity_status(file)
162
+ except Exception as ncf_err:
163
+ err_tuple = (file, str(ncf_err))
164
+ file_vs_err_list.append(err_tuple)
165
+
166
+ # Step 3: Find directories containing faulty files #
167
+ ####################################################
168
+ dir_list = _unique_sorted([get_obj_specs(err_tuple[0], "parent") for err_tuple in file_vs_err_list])
169
+
170
+ # Step 4: Group faulty files by directory
171
+ file_vs_errs_dict = {dirc: [err_tuple for err_tuple in file_vs_err_list
172
+ if get_obj_specs(err_tuple[0], "parent")==dirc]
173
+ for dirc in dir_list}
174
+
175
+ # Step 5: Generate report #
176
+ ###########################
177
+
178
+ # Statistics #
179
+ total_dirs = len(dir_list)
180
+ total_files = len(all_files)
181
+ total_faulties = sum(len(lst) for lst in file_vs_errs_dict.values())
182
+
183
+ # Report generation #
184
+ with open(REPORT_FILE_PATH, "w") as report:
185
+ report.write(REPORT_INFO_TEMPLATE.format(*(total_dirs, total_files, total_faulties)))
186
+
187
+ for dirc in file_vs_errs_dict.keys():
188
+ format_args_dir_info = (dirc, len(file_vs_errs_dict[dirc]))
189
+ report.write(format_string(string_underliner(DIR_INFO_TEMPLATE, format_args_dir_info), "="))
190
+ for values in file_vs_errs_dict[dirc]:
191
+ report.write(format_string(FILE_INFO_WRITING_TEMPLATE, values))
192
+
193
+ # Return comprehensive results
194
+ return {
195
+ 'total_dirs': total_dirs,
196
+ 'total_files': total_files,
197
+ 'faulty_files': [err_tuple[0] for err_tuple in file_vs_err_list],
198
+ 'faulty_count': total_faulties,
199
+ 'faulty_by_dir': file_vs_errs_dict
200
+ }
201
+
202
+ # Auxiliary functions #
203
+ #-#-#-#-#-#-#-#-#-#-#-#
204
+
205
+ def ncfile_integrity_status(ncfile_name: str | Path) -> xr.Dataset:
206
+ """
207
+ Checks the integrity of a given netCDF file by attempting to open it with xarray.
208
+
209
+ This function tries to open the specified netCDF file using `xarray.open_dataset`.
210
+ If the file is successfully opened, it returns the dataset before closing it.
211
+ If an error occurs during this process, it delegates the exception
212
+ raise to the output of xarray.dataset class.
213
+
214
+ Parameters
215
+ ----------
216
+ ncfile_name : str | Path
217
+ Path to the netCDF file to be checked.
218
+
219
+ Returns
220
+ -------
221
+ xarray.Dataset
222
+ The opened dataset if successful.
223
+
224
+ Raises
225
+ ------
226
+ TypeError
227
+ If ncfile_name is not str or Path.
228
+ ValueError
229
+ If the file path is empty or invalid.
230
+ FileNotFoundError
231
+ If the file doesn't exist.
232
+ OSError
233
+ Raised if the file cannot be found, opened, or there are issues with file permissions.
234
+ ValueError
235
+ Raised if the file is successfully opened but is not a valid netCDF file or has
236
+ an unsupported format.
237
+ RuntimeError
238
+ Raised for internal errors within the netCDF4 or h5py libraries, such as when
239
+ reading compressed data fails.
240
+ IOError
241
+ Raised for input/output errors at the system level, such as file corruption
242
+ or disk read failures.
243
+ KeyError
244
+ Raised in rare cases when essential variables or attributes required for reading
245
+ the file are missing or invalid.
246
+ """
247
+ # Parameter validation
248
+ if not isinstance(ncfile_name, (str, Path)):
249
+ raise TypeError("ncfile_name must be a string or Path object")
250
+
251
+ ncfile_path = Path(ncfile_name)
252
+
253
+ if not str(ncfile_path).strip():
254
+ raise ValueError("File path cannot be empty")
255
+
256
+ if not ncfile_path.exists():
257
+ raise FileNotFoundError(f"NetCDF file not found: {ncfile_path}")
258
+
259
+ if not ncfile_path.is_file():
260
+ raise ValueError(f"Path must be a file, not a directory: {ncfile_path}")
261
+
262
+ # Check file extension
263
+ if not str(ncfile_path).lower().endswith(('.nc', '.netcdf')):
264
+ print(f"Warning: File {ncfile_path} may not be a netCDF file based on extension")
265
+
266
+ try:
267
+ ds = xr.open_dataset(ncfile_path)
268
+ return ds
269
+ except Exception as e:
270
+ raise type(e)(f"Failed to open netCDF file {ncfile_path}: {str(e)}")
271
+ finally:
272
+ if 'ds' in locals():
273
+ ds.close()
274
+
275
+ #--------------------------#
276
+ # Parameters and constants #
277
+ #--------------------------#
278
+
279
# Directory from where this code is being called #
# Evaluated once, when this module-level statement runs; later changes of the
# working directory do not affect it.
CODE_CALL_DIR = os.getcwd()

# Template strings #
#------------------#

# File scanning progress information strings #
# Placeholders, in order: index of the current file, total number of files,
# and the file being checked (filled in by scan_ncfiles).
SCAN_PROGRESS_TEMPLATE =\
"""
File number: {} out of {}
File name: {}
"""

# Per-directory header of the report: directory, number of faulty files in it.
DIR_INFO_TEMPLATE = """\nDirectory: {} | Faulty files in this directory: {}"""
# Per-file line of the report: faulty file, error message recorded for it.
FILE_INFO_WRITING_TEMPLATE = """\nFile: {} -> {}\n"""

# Report results
# The report is written next to CODE_CALL_DIR under a fixed file name.
REPORT_FN_NOEXT = "faulty_netcdf_file_report"
REPORT_FILE_PATH = f"{CODE_CALL_DIR}/{REPORT_FN_NOEXT}.txt"
# Report preamble; placeholders, in order: total_dirs, total_files and
# total_faulties as computed by scan_ncfiles.
# NOTE(review): the first value supplied by scan_ncfiles is the number of
# directories that contain faulty files, while the label reads
# "directories scanned" - confirm which one is intended.
REPORT_INFO_TEMPLATE =\
"""
+--------------------------------+
|Faulty NETCDF format file report|
+--------------------------------+
·Total directories scanned : {}
·Total files scanned: {}
·Total faulty files: {}

Faulty files
+----------+
"""