climarraykit 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- climarraykit/__init__.py +12 -0
- climarraykit/conversions.py +208 -0
- climarraykit/data_manipulation.py +386 -0
- climarraykit/file_utils.py +309 -0
- climarraykit/patterns.py +616 -0
- climarraykit/xarray_obj_handler.py +575 -0
- climarraykit-0.2.0.dist-info/METADATA +86 -0
- climarraykit-0.2.0.dist-info/RECORD +11 -0
- climarraykit-0.2.0.dist-info/WHEEL +5 -0
- climarraykit-0.2.0.dist-info/licenses/LICENSE +21 -0
- climarraykit-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Created on Wed Oct 16 13:54:11 2024
|
|
5
|
+
|
|
6
|
+
@author: jonander
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
#! /usr/bin/env python3
|
|
10
|
+
# -*- coding: utf-8 -*-
|
|
11
|
+
|
|
12
|
+
#----------------#
|
|
13
|
+
# Import modules #
|
|
14
|
+
#----------------#
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
import xarray as xr
|
|
18
|
+
import os
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
#------------------------#
|
|
22
|
+
# Import project modules #
|
|
23
|
+
#------------------------#
|
|
24
|
+
|
|
25
|
+
from filewise.file_operations.path_utils import find_files
|
|
26
|
+
from pygenutils.arrays_and_lists.data_manipulation import flatten_list
|
|
27
|
+
from pygenutils.strings.string_handler import get_obj_specs
|
|
28
|
+
from pygenutils.strings.text_formatters import (
|
|
29
|
+
format_string,
|
|
30
|
+
print_format_string,
|
|
31
|
+
string_underliner
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
#-------------------------#
|
|
35
|
+
# Define custom functions #
|
|
36
|
+
#-------------------------#
|
|
37
|
+
|
|
38
|
+
# Internal #
|
|
39
|
+
#----------#
|
|
40
|
+
|
|
41
|
+
def _unique_sorted(items: list) -> list:
|
|
42
|
+
"""
|
|
43
|
+
Returns a sorted list of unique items.
|
|
44
|
+
|
|
45
|
+
Parameters
|
|
46
|
+
----------
|
|
47
|
+
items : list
|
|
48
|
+
List of items to deduplicate and sort.
|
|
49
|
+
|
|
50
|
+
Returns
|
|
51
|
+
-------
|
|
52
|
+
list
|
|
53
|
+
Sorted list of unique items.
|
|
54
|
+
|
|
55
|
+
Raises
|
|
56
|
+
------
|
|
57
|
+
TypeError
|
|
58
|
+
If items is not a list.
|
|
59
|
+
"""
|
|
60
|
+
if not isinstance(items, list):
|
|
61
|
+
raise TypeError("Input must be a list")
|
|
62
|
+
|
|
63
|
+
return sorted(set(items))
|
|
64
|
+
|
|
65
|
+
# Public #
|
|
66
|
+
#--------#
|
|
67
|
+
|
|
68
|
+
# netCDF file searching #
|
|
69
|
+
#~~~~~~~~~~~~~~~~~~~~~~~#
|
|
70
|
+
|
|
71
|
+
# Main function #
|
|
72
|
+
#-#-#-#-#-#-#-#-#
|
|
73
|
+
|
|
74
|
+
def scan_ncfiles(search_path: str | list[str] | Path) -> dict[str, int | list[str]]:
|
|
75
|
+
"""
|
|
76
|
+
Scans directories for netCDF (.nc) files, checks file integrity,
|
|
77
|
+
and generates a report for faulty files. Returns comprehensive information
|
|
78
|
+
about netCDF files and their status.
|
|
79
|
+
|
|
80
|
+
Parameters
|
|
81
|
+
----------
|
|
82
|
+
search_path : str | list[str] | Path
|
|
83
|
+
The directory or list of directories to scan for .nc files.
|
|
84
|
+
|
|
85
|
+
Returns
|
|
86
|
+
-------
|
|
87
|
+
dict
|
|
88
|
+
A dictionary containing:
|
|
89
|
+
- 'total_dirs': Number of directories containing faulty files
|
|
90
|
+
- 'total_files': Total number of netCDF files scanned
|
|
91
|
+
- 'faulty_files': List of faulty netCDF file paths
|
|
92
|
+
- 'faulty_count': Total number of faulty netCDF files
|
|
93
|
+
- 'faulty_by_dir': Dictionary mapping directories to their faulty files
|
|
94
|
+
|
|
95
|
+
Raises
|
|
96
|
+
------
|
|
97
|
+
TypeError
|
|
98
|
+
If search_path is not str, list, or Path.
|
|
99
|
+
ValueError
|
|
100
|
+
If search_path is empty or contains invalid paths.
|
|
101
|
+
FileNotFoundError
|
|
102
|
+
If any specified search path doesn't exist.
|
|
103
|
+
|
|
104
|
+
Example
|
|
105
|
+
-------
|
|
106
|
+
# Example: Scan and check file integrity, generate a report for faulty files
|
|
107
|
+
result = scan_ncfiles("/path/to/scan")
|
|
108
|
+
print(f"Faulty files: {result['faulty_files']}, Count: {result['faulty_count']}")
|
|
109
|
+
"""
|
|
110
|
+
# Parameter validation
|
|
111
|
+
if not isinstance(search_path, (str, list, Path)):
|
|
112
|
+
raise TypeError("search_path must be a string, list of strings, or Path object")
|
|
113
|
+
|
|
114
|
+
# Convert to list and flatten if necessary
|
|
115
|
+
if isinstance(search_path, (str, Path)):
|
|
116
|
+
search_paths = [str(search_path)]
|
|
117
|
+
else:
|
|
118
|
+
search_paths = flatten_list([str(p) for p in search_path])
|
|
119
|
+
|
|
120
|
+
# Validate paths
|
|
121
|
+
if not search_paths:
|
|
122
|
+
raise ValueError("search_path cannot be empty")
|
|
123
|
+
|
|
124
|
+
for path in search_paths:
|
|
125
|
+
if not isinstance(path, str) or not path.strip():
|
|
126
|
+
raise ValueError("All search paths must be non-empty strings")
|
|
127
|
+
|
|
128
|
+
if not Path(path).exists():
|
|
129
|
+
raise FileNotFoundError(f"Search path does not exist: {path}")
|
|
130
|
+
|
|
131
|
+
if not Path(path).is_dir():
|
|
132
|
+
raise ValueError(f"Search path must be a directory: {path}")
|
|
133
|
+
|
|
134
|
+
# Import here to avoid circular imports
|
|
135
|
+
from paramlib.global_parameters import CLIMATE_FILE_EXTENSIONS
|
|
136
|
+
|
|
137
|
+
# Step 1: Search for all netCDF files #
|
|
138
|
+
#######################################
|
|
139
|
+
all_files = []
|
|
140
|
+
for path in search_paths:
|
|
141
|
+
files_in_path = find_files(CLIMATE_FILE_EXTENSIONS[0], path)
|
|
142
|
+
all_files.extend(files_in_path)
|
|
143
|
+
|
|
144
|
+
if not all_files:
|
|
145
|
+
print("No netCDF files found in the specified directories")
|
|
146
|
+
return {
|
|
147
|
+
'total_dirs': 0,
|
|
148
|
+
'total_files': 0,
|
|
149
|
+
'faulty_files': [],
|
|
150
|
+
'faulty_count': 0,
|
|
151
|
+
'faulty_by_dir': {}
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
# Step 2: Check each file's integrity and collect faulty files #
|
|
155
|
+
#################################################################
|
|
156
|
+
file_vs_err_list = []
|
|
157
|
+
for idx, file in enumerate(all_files, start=1):
|
|
158
|
+
format_args_scan_progress = (idx, len(all_files), file)
|
|
159
|
+
print_format_string(SCAN_PROGRESS_TEMPLATE, format_args_scan_progress)
|
|
160
|
+
try:
|
|
161
|
+
ncfile_integrity_status(file)
|
|
162
|
+
except Exception as ncf_err:
|
|
163
|
+
err_tuple = (file, str(ncf_err))
|
|
164
|
+
file_vs_err_list.append(err_tuple)
|
|
165
|
+
|
|
166
|
+
# Step 3: Find directories containing faulty files #
|
|
167
|
+
####################################################
|
|
168
|
+
dir_list = _unique_sorted([get_obj_specs(err_tuple[0], "parent") for err_tuple in file_vs_err_list])
|
|
169
|
+
|
|
170
|
+
# Step 4: Group faulty files by directory
|
|
171
|
+
file_vs_errs_dict = {dirc: [err_tuple for err_tuple in file_vs_err_list
|
|
172
|
+
if get_obj_specs(err_tuple[0], "parent")==dirc]
|
|
173
|
+
for dirc in dir_list}
|
|
174
|
+
|
|
175
|
+
# Step 5: Generate report #
|
|
176
|
+
###########################
|
|
177
|
+
|
|
178
|
+
# Statistics #
|
|
179
|
+
total_dirs = len(dir_list)
|
|
180
|
+
total_files = len(all_files)
|
|
181
|
+
total_faulties = sum(len(lst) for lst in file_vs_errs_dict.values())
|
|
182
|
+
|
|
183
|
+
# Report generation #
|
|
184
|
+
with open(REPORT_FILE_PATH, "w") as report:
|
|
185
|
+
report.write(REPORT_INFO_TEMPLATE.format(*(total_dirs, total_files, total_faulties)))
|
|
186
|
+
|
|
187
|
+
for dirc in file_vs_errs_dict.keys():
|
|
188
|
+
format_args_dir_info = (dirc, len(file_vs_errs_dict[dirc]))
|
|
189
|
+
report.write(format_string(string_underliner(DIR_INFO_TEMPLATE, format_args_dir_info), "="))
|
|
190
|
+
for values in file_vs_errs_dict[dirc]:
|
|
191
|
+
report.write(format_string(FILE_INFO_WRITING_TEMPLATE, values))
|
|
192
|
+
|
|
193
|
+
# Return comprehensive results
|
|
194
|
+
return {
|
|
195
|
+
'total_dirs': total_dirs,
|
|
196
|
+
'total_files': total_files,
|
|
197
|
+
'faulty_files': [err_tuple[0] for err_tuple in file_vs_err_list],
|
|
198
|
+
'faulty_count': total_faulties,
|
|
199
|
+
'faulty_by_dir': file_vs_errs_dict
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
# Auxiliary functions #
|
|
203
|
+
#-#-#-#-#-#-#-#-#-#-#-#
|
|
204
|
+
|
|
205
|
+
def ncfile_integrity_status(ncfile_name: str | Path) -> xr.Dataset:
    """
    Checks the integrity of a given netCDF file by attempting to open it with xarray.

    This function tries to open the specified netCDF file using `xarray.open_dataset`.
    If the file is successfully opened, the dataset is closed again and then
    returned, so the caller receives an already-closed dataset object
    (data that was not loaded while it was open may no longer be readable).
    If an error occurs while opening, an exception of the same type is raised
    with the file path prepended to the message.

    Parameters
    ----------
    ncfile_name : str | Path
        Path to the netCDF file to be checked.

    Returns
    -------
    xarray.Dataset
        The dataset that was opened successfully (already closed on return).

    Raises
    ------
    TypeError
        If ncfile_name is not str or Path.
    ValueError
        If the file path is empty/blank or points to something that is not a file.
    FileNotFoundError
        If the file doesn't exist.
    OSError
        Raised if the file cannot be found, opened, or there are issues with file permissions.
    ValueError
        Raised if the file is successfully opened but is not a valid netCDF file or has
        an unsupported format.
    RuntimeError
        Raised for internal errors within the netCDF4 or h5py libraries, such as when
        reading compressed data fails.
    IOError
        Raised for input/output errors at the system level, such as file corruption
        or disk read failures.
    KeyError
        Raised in rare cases when essential variables or attributes required for reading
        the file are missing or invalid.
    """
    # Parameter validation
    if not isinstance(ncfile_name, (str, Path)):
        raise TypeError("ncfile_name must be a string or Path object")

    # Check emptiness on the raw input: Path("") normalises to ".", so an
    # empty string can no longer be detected after the conversion below.
    if not str(ncfile_name).strip():
        raise ValueError("File path cannot be empty")

    ncfile_path = Path(ncfile_name)

    if not ncfile_path.exists():
        raise FileNotFoundError(f"NetCDF file not found: {ncfile_path}")

    if not ncfile_path.is_file():
        raise ValueError(f"Path must be a file, not a directory: {ncfile_path}")

    # Check file extension (advisory only: the open attempt below is the real test)
    if not str(ncfile_path).lower().endswith(('.nc', '.netcdf')):
        print(f"Warning: File {ncfile_path} may not be a netCDF file based on extension")

    try:
        ds = xr.open_dataset(ncfile_path)
    except Exception as e:
        message = f"Failed to open netCDF file {ncfile_path}: {str(e)}"
        try:
            # Keep the original exception type so callers' except clauses still match
            wrapped = type(e)(message)
        except Exception:
            # Some exception classes cannot be built from a single message
            # (e.g. UnicodeDecodeError); fall back to the original exception.
            wrapped = None
        if wrapped is None:
            raise
        raise wrapped from e

    # Release the file handle before handing the dataset back
    ds.close()
    return ds
|
|
274
|
+
|
|
275
|
+
#--------------------------#
|
|
276
|
+
# Parameters and constants #
|
|
277
|
+
#--------------------------#
|
|
278
|
+
|
|
279
|
+
# Directory from where this code is being called #
|
|
280
|
+
# Working directory at import time; the report file is created here
CODE_CALL_DIR = os.getcwd()

# Template strings #
#------------------#

# File scanning progress information strings #
# Positional fields: current file index, total number of files, file name
SCAN_PROGRESS_TEMPLATE = """
File number: {} out of {}
File name: {}
"""

# Positional fields: directory, number of faulty files in that directory
DIR_INFO_TEMPLATE = """\nDirectory: {} | Faulty files in this directory: {}"""
# Positional fields: faulty file path, error message
FILE_INFO_WRITING_TEMPLATE = """\nFile: {} -> {}\n"""

# Report results
REPORT_FN_NOEXT = "faulty_netcdf_file_report"
# os.path.join avoids a doubled separator when the working directory is a root
REPORT_FILE_PATH = os.path.join(CODE_CALL_DIR, f"{REPORT_FN_NOEXT}.txt")
# Positional fields: directories containing faulty files, files scanned,
# faulty files. The first label names what the scanner actually counts
# (directories holding at least one faulty file, not every directory visited).
REPORT_INFO_TEMPLATE = """
+--------------------------------+
|Faulty NETCDF format file report|
+--------------------------------+
·Total directories with faulty files: {}
·Total files scanned: {}
·Total faulty files: {}

Faulty files
+----------+
"""
|