tfv_get_tools-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tfv_get_tools/__init__.py +4 -0
- tfv_get_tools/_standard_attrs.py +107 -0
- tfv_get_tools/atmos.py +167 -0
- tfv_get_tools/cli/_cli_base.py +173 -0
- tfv_get_tools/cli/atmos_cli.py +192 -0
- tfv_get_tools/cli/ocean_cli.py +204 -0
- tfv_get_tools/cli/tide_cli.py +118 -0
- tfv_get_tools/cli/wave_cli.py +183 -0
- tfv_get_tools/fvc/__init__.py +3 -0
- tfv_get_tools/fvc/_atmos.py +230 -0
- tfv_get_tools/fvc/_fvc.py +218 -0
- tfv_get_tools/fvc/_ocean.py +171 -0
- tfv_get_tools/fvc/_tide.py +195 -0
- tfv_get_tools/ocean.py +170 -0
- tfv_get_tools/providers/__init__.py +0 -0
- tfv_get_tools/providers/_custom_conversions.py +34 -0
- tfv_get_tools/providers/_downloader.py +566 -0
- tfv_get_tools/providers/_merger.py +520 -0
- tfv_get_tools/providers/_utilities.py +255 -0
- tfv_get_tools/providers/atmos/barra2.py +209 -0
- tfv_get_tools/providers/atmos/cfgs/barra2_c2.yaml +52 -0
- tfv_get_tools/providers/atmos/cfgs/barra2_r2.yaml +85 -0
- tfv_get_tools/providers/atmos/cfgs/barra2_re2.yaml +70 -0
- tfv_get_tools/providers/atmos/cfgs/cfsr.yaml +68 -0
- tfv_get_tools/providers/atmos/cfgs/era5.yaml +77 -0
- tfv_get_tools/providers/atmos/cfgs/era5_gcp.yaml +77 -0
- tfv_get_tools/providers/atmos/cfsr.py +207 -0
- tfv_get_tools/providers/atmos/era5.py +20 -0
- tfv_get_tools/providers/atmos/era5_gcp.py +20 -0
- tfv_get_tools/providers/ocean/cfgs/copernicus_blk.yaml +64 -0
- tfv_get_tools/providers/ocean/cfgs/copernicus_glo.yaml +67 -0
- tfv_get_tools/providers/ocean/cfgs/copernicus_nws.yaml +62 -0
- tfv_get_tools/providers/ocean/cfgs/hycom.yaml +73 -0
- tfv_get_tools/providers/ocean/copernicus_ocean.py +457 -0
- tfv_get_tools/providers/ocean/hycom.py +611 -0
- tfv_get_tools/providers/wave/cawcr.py +166 -0
- tfv_get_tools/providers/wave/cfgs/cawcr_aus_10m.yaml +39 -0
- tfv_get_tools/providers/wave/cfgs/cawcr_aus_4m.yaml +39 -0
- tfv_get_tools/providers/wave/cfgs/cawcr_glob_24m.yaml +39 -0
- tfv_get_tools/providers/wave/cfgs/cawcr_pac_10m.yaml +39 -0
- tfv_get_tools/providers/wave/cfgs/cawcr_pac_4m.yaml +39 -0
- tfv_get_tools/providers/wave/cfgs/copernicus_glo.yaml +56 -0
- tfv_get_tools/providers/wave/cfgs/copernicus_nws.yaml +51 -0
- tfv_get_tools/providers/wave/cfgs/era5.yaml +48 -0
- tfv_get_tools/providers/wave/cfgs/era5_gcp.yaml +48 -0
- tfv_get_tools/providers/wave/copernicus_wave.py +38 -0
- tfv_get_tools/providers/wave/era5.py +232 -0
- tfv_get_tools/providers/wave/era5_gcp.py +169 -0
- tfv_get_tools/tide/__init__.py +2 -0
- tfv_get_tools/tide/_nodestring.py +214 -0
- tfv_get_tools/tide/_tidal_base.py +568 -0
- tfv_get_tools/utilities/_tfv_bc.py +78 -0
- tfv_get_tools/utilities/horizontal_padding.py +89 -0
- tfv_get_tools/utilities/land_masking.py +93 -0
- tfv_get_tools/utilities/parsers.py +44 -0
- tfv_get_tools/utilities/warnings.py +38 -0
- tfv_get_tools/wave.py +179 -0
- tfv_get_tools-0.2.0.dist-info/METADATA +286 -0
- tfv_get_tools-0.2.0.dist-info/RECORD +62 -0
- tfv_get_tools-0.2.0.dist-info/WHEEL +5 -0
- tfv_get_tools-0.2.0.dist-info/entry_points.txt +5 -0
- tfv_get_tools-0.2.0.dist-info/top_level.txt +1 -0
tfv_get_tools/providers/wave/era5.py +232 -0
@@ -0,0 +1,232 @@
+"""
+ERA5 Wave downloader
+"""
+
+import shutil
+import zipfile
+from pathlib import Path
+from typing import List, Tuple
+
+import cdsapi
+import numpy as np
+import pandas as pd
+import xarray as xr
+from pandas.tseries.offsets import MonthEnd
+from tqdm import tqdm
+
+from tfv_get_tools.providers._downloader import BaseDownloader
+from tfv_get_tools.providers._merger import BaseMerger
+from tfv_get_tools.providers._utilities import todstr
+
+class DownloadERA5Wave(BaseDownloader):
+    """ERA5 Wave downloader via CDS API"""
+
+    def _init_specific(self, **kwargs):
+        """Set source and mode - matches original interface"""
+        self.source = "ERA5"
+        self.mode = "WAVE"
+        self._load_config()
+
+    def _get_output_filename(self, ts: pd.Timestamp, te: pd.Timestamp) -> Path:
+        """ERA5 Wave filename pattern (no variable in filename)"""
+        fname = f"{self.prefix}_{todstr(ts)}_{todstr(te)}.nc"
+        return self.outdir / fname
+
+    def _construct_cds_request(self, date: pd.Timestamp) -> dict:
+        """Construct CDS API request parameters"""
+        limstr = self._to_limstr(self.xlims, self.ylims)
+
+        year = date.year
+        month = date.month
+        times = [f"{x:02}:00" for x in range(0, 24, 1)]  # Hourly
+        days = [str(x) for x in range(1, 32)]
+
+        return {
+            "product_type": "reanalysis",
+            "variable": self.variables,
+            "year": [year],
+            "month": [month],
+            "day": days,
+            "time": times,
+            "area": limstr,
+            "data_format": "netcdf",
+            "download_format": "unarchived",
+        }
+
+    @staticmethod
+    def _to_limstr(x, y):
+        """Convert coordinate bounds to ERA5 area string format"""
+        return f"{y[1]}/{x[0]}/{y[0]}/{x[1]}"
+
+    def _download_single_file(self, temp_file: Path, final_file: Path, cds_request: dict) -> bool:
+        """Download single file via CDS API"""
+        try:
+            c = cdsapi.Client()
+
+            # Download to temporary file first
+            c.retrieve("reanalysis-era5-single-levels", cds_request, temp_file)
+
+            if temp_file.exists():
+                # ERA5 sometimes returns zip files, handle both cases
+                self._convert_split_netcdf_data(temp_file, final_file)
+                return True
+            else:
+                return False
+
+        except Exception as e:
+            error_msg = str(e).lower()
+
+            # Check for specific error types and provide helpful messages
+            if "missing/incomplete configuration file" in error_msg or ".cdsapirc" in error_msg:
+                print("\n" + "="*60)
+                print("CDS API CONFIGURATION ERROR")
+                print("="*60)
+                print("The CDS API configuration file is missing or incomplete.")
+                print("To fix this issue:")
+                print("1. Register at https://cds.climate.copernicus.eu/")
+                print("2. Go to your profile page and copy your API key")
+                print("3. Create a file called '.cdsapirc' in your home directory with:")
+                print("   url: https://cds.climate.copernicus.eu/api")
+                print("   key: YOUR_API_KEY_HERE")
+                print("="*60)
+
+            elif "authentication" in error_msg or "invalid key" in error_msg:
+                print("\n" + "="*60)
+                print("CDS API AUTHENTICATION ERROR")
+                print("="*60)
+                print("Your CDS API key appears to be invalid.")
+                print("Please check your .cdsapirc file and ensure your API key is correct.")
+                print("You can find your key at: https://cds.climate.copernicus.eu/user/")
+                print("="*60)
+
+            else:
+                # For any other errors, show the original message if verbose
+                if self.verbose:
+                    print(f"Failed to download via CDS API: {e}")
+                else:
+                    # Always show some info for unhandled errors
+                    print(f"\nCDS API Error: {e}")
+
+            return False
+
+    @staticmethod
+    def _convert_split_netcdf_data(file_handle_temp: Path, file_handle: Path) -> bool:
+        """
+        Handle ERA5 zip files or direct NetCDF files
+        """
+        file_path = Path(file_handle_temp)
+        file_path_out = Path(file_handle)
+
+        # Check if file is a zip file
+        if zipfile.is_zipfile(file_path):
+            datasets = []
+            with zipfile.ZipFile(file_path, 'r') as zip_ref:
+                # Extract all files to a temporary directory
+                temp_dir = file_path.parent / 'temp_netcdf'
+                temp_dir.mkdir(exist_ok=True)
+                zip_ref.extractall(temp_dir)
+
+                # Load all NetCDF files
+                for extracted_file in temp_dir.glob('*.nc'):
+                    with xr.open_dataset(extracted_file) as dsx:
+                        dsx.load()
+                        datasets.append(dsx)
+
+            # Clean up extracted files
+            for file in temp_dir.iterdir():
+                file.unlink()
+            temp_dir.rmdir()
+
+            if not datasets:
+                raise ValueError("No NetCDF files found in zip archive")
+
+            # Combine all datasets
+            ds = xr.merge(datasets)
+            ds.to_netcdf(file_path_out)
+
+            # Delete the zip file
+            file_path.unlink()
+
+        else:
+            # Move the NetCDF file to the output path
+            shutil.move(file_path, file_path_out)
+
+        return True
+
+    def download(self):
+        """ERA5 Wave-specific download loop - yields tasks for new base class"""
+        for ts in self.times:
+            te = ts + MonthEnd() + pd.Timedelta("23.9h")
+
+            final_file = self._get_output_filename(ts, te)
+            temp_file = self.outdir / '_temp_era5_file'
+            cds_request = self._construct_cds_request(ts)
+
+            yield {
+                'file_path': final_file,
+                'url': 'CDS_API',  # Not a URL but API call
+                'timestamp': ts,
+                'variable': 'all_variables',  # ERA5 downloads all vars together
+                'download_func': lambda tf=temp_file, ff=final_file, req=cds_request:
+                    self._download_single_file(tf, ff, req)
+            }
+
+
+class MergeERA5Wave(BaseMerger):
+    def _init_specific(self) -> None:
+        self.source = "ERA5"
+        self.mode = "WAVE"
+        self._load_config()
+
+    def _process_era5_dataset(self, ds: xr.Dataset) -> xr.Dataset:
+        """Apply ERA5-specific coordinate and dimension processing."""
+        # Rename time coordinate
+        if "valid_time" in ds.coords:
+            ds = ds.rename({"valid_time": "time"})
+
+        # Handle experiment version dimension
+        if "expver" in ds.dims:
+            ds = ds.mean(dim="expver", keep_attrs=True)
+            ds = ds.drop_vars("expver", errors='ignore')
+
+        # Standardise coordinate names
+        coord_mappings = {'lon': 'longitude', 'lng': 'longitude', 'lat': 'latitude'}
+        rename_dict = {old: new for old, new in coord_mappings.items() if old in ds.coords}
+        if rename_dict:
+            ds = ds.rename(rename_dict)
+
+        return ds
+
+    def merge_files(self, file_list: List[Path]) -> Tuple[xr.Dataset, List[Path]]:
+        """Merge ERA5 wave files with time concatenation."""
+        if not file_list:
+            raise ValueError("No files provided for merging")
+
+        datasets = []
+        skipped_files = []
+
+        for file_path in tqdm(file_list, disable=not self.verbose):
+            ds = self._open_subset_netcdf(file_path, time=("time", "valid_time"))
+            if ds is not None:
+                ds = self._process_era5_dataset(ds)
+                datasets.append(ds)
+            else:
+                skipped_files.append(file_path)
+
+        if not datasets:
+            raise ValueError("No valid datasets could be loaded")
+
+        # Concatenate and clean up
+        merged = xr.concat(datasets, dim="time", combine_attrs="override",
+                           data_vars="minimal", coords="minimal", compat="override")
+
+        # Remove duplicates and sort
+        merged = merged.sortby("time")
+        _, unique_idx = np.unique(merged["time"], return_index=True)
+        merged = merged.isel(time=np.sort(unique_idx))
+
+        # Sort latitudes south to north
+        if 'latitude' in merged.coords:
+            merged = merged.sortby("latitude")
+
+        return merged, skipped_files
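The request that DownloadERA5Wave._construct_cds_request assembles follows the CDS convention of whole months of hourly steps plus an `area` string ordered north/west/south/east. A minimal sketch of that request shape, using hypothetical bounds and an example wave variable rather than the values the package reads from era5.yaml:

import pandas as pd

def to_limstr(xlims, ylims):
    # CDS "area" is ordered North/West/South/East
    return f"{ylims[1]}/{xlims[0]}/{ylims[0]}/{xlims[1]}"

date = pd.Timestamp("2023-01-01")
xlims, ylims = (150.0, 155.0), (-30.0, -25.0)  # hypothetical lon/lat bounds
request = {
    "product_type": "reanalysis",
    "variable": ["significant_height_of_combined_wind_waves_and_swell"],  # example only
    "year": [date.year],
    "month": [date.month],
    "day": [str(d) for d in range(1, 32)],
    "time": [f"{h:02}:00" for h in range(24)],
    "area": to_limstr(xlims, ylims),  # "-25.0/150.0/-30.0/155.0"
    "data_format": "netcdf",
    "download_format": "unarchived",
}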
tfv_get_tools/providers/wave/era5_gcp.py +169 -0
@@ -0,0 +1,169 @@
+"""
+AEW - This is an alternative source ERA5 that we used for awhile while CDSAPI wasn't playing ball.
+It's marginally faster, requires no registration, but runs several months behind CDSAPI which
+got called out a few times, so we should stick with CDSAPI. Also generally believe it's good form to
+have users go through CDSAPI as it is their data.
+"""
+
+from pathlib import Path
+
+import numpy as np
+import xarray as xr
+import pandas as pd
+from tqdm import tqdm
+
+from pandas.tseries.offsets import MonthEnd
+
+from tfv_get_tools.providers._utilities import todstr
+from tfv_get_tools.providers._downloader import BaseDownloader
+from tfv_get_tools.providers._merger import BaseMerger
+
+
+class DownloadERA5WaveGCP(BaseDownloader):
+    """Updated version that makes use of the public GCP Zarr ERA5 release"""
+
+    def _init_specific(self, **kwargs):
+        """Set source and mode - matches original interface"""
+        self.source = "ERA5_GCP"
+        self.mode = "WAVE"
+        self._load_config()
+
+        # Initialize the Zarr dataset connection
+        self.era5_ds = None
+        self.last_valid = None
+
+    def _initialize_zarr_connection(self):
+        """Initialize connection to ERA5 Zarr dataset"""
+        if self.era5_ds is None:
+            if self.verbose:
+                print('Opening connection to ERA5 data - please wait')
+
+            self.era5_ds = xr.open_zarr(
+                self.base_url,
+                chunks=dict(time=744),  # 31 day chunk
+                storage_options=dict(token='anon'),
+            )
+
+            self.last_valid = pd.Timestamp(self.era5_ds.attrs['valid_time_stop']).floor('1d')
+
+            self.era5_ds = self.era5_ds.sel(time=slice(
+                pd.Timestamp(1940, 1, 1), self.last_valid
+            ))
+
+    def _get_output_filename(self, ts: pd.Timestamp, te: pd.Timestamp) -> Path:
+        """ERA5 GCP filename pattern (no variable in filename - downloads all together)"""
+        fname = f"{self.prefix}_{todstr(ts)}_{todstr(te)}.nc"
+        return self.outdir / fname
+
+    def _download_single_time_period(self, ts: pd.Timestamp, te: pd.Timestamp, output_file: Path) -> bool:
+        """Download single time period from Zarr dataset"""
+        try:
+            # Check if we're past the valid time range
+            if ts > self.last_valid:
+                if self.verbose:
+                    print(f'The final valid ERA5 time on this database is {self.last_valid.strftime("%Y-%m-%d")}.')
+                    print('Skipping this time period')
+                return False
+
+            # Apply slices and variable filters
+            dsx = self.era5_ds.sel(
+                time=slice(ts, te),
+                latitude=slice(*self.ylims[::-1]),
+                longitude=slice(*self.xlims),
+            )[self.variables]
+
+            if self.verbose:
+                print(f"... Downloading {output_file.name}")
+
+            # Wrap the cut-down piece back to -180 to 180
+            dsx = dsx.assign_coords({'longitude': (dsx['longitude'] + 180) % 360 - 180})
+            dsx = dsx.sortby('longitude')
+
+            dsx.to_netcdf(output_file)
+            return True
+
+        except Exception as e:
+            if self.verbose:
+                print(f"Failed to download {output_file.name}: {e}")
+            return False
+
+    def download(self):
+        """ERA5 GCP-specific download loop - yields tasks for new base class"""
+        # Initialize the Zarr connection once
+        self._initialize_zarr_connection()
+
+        if self.verbose:
+            print('Starting downloading loop')
+
+        for ts in self.times:
+            te = ts + MonthEnd() + pd.Timedelta("23.9h")
+
+            # Check if we're past the valid time range
+            if ts > self.last_valid:
+                if self.verbose:
+                    print(f'The final valid ERA5 time on this database is {self.last_valid.strftime("%Y-%m-%d")}.')
+                    print('Exiting download loop early')
+                break
+
+            output_file = self._get_output_filename(ts, te)
+
+            yield {
+                'file_path': output_file,
+                'url': f'zarr://{self.base_url}',  # Pseudo-URL for logging
+                'timestamp': ts,
+                'variable': 'all_variables',  # ERA5 downloads all vars together
+                'download_func': lambda start=ts, end=te, out=output_file:
+                    self._download_single_time_period(start, end, out)
+            }
+
+class MergeERA5WaveGCP(BaseMerger):
+    def _init_specific(self):
+        self.source = "ERA5"
+        self.mode = "WAVE"
+        self._load_config()
+
+    def merge_files(self, file_list):
+        """
+        ERA5 merging logic.
+
+        ERA5 names the time variable "valid_time" which we rename after opening.
+
+        Args:
+            file_list (list): list of path objects to open and concat.
+
+        Returns:
+            xr.Dataset: merged xarray dataset
+            list: files unable to be merged
+        """
+        dsset = []
+        skipped_list = []
+        for f in tqdm(file_list):
+            dsx = self._open_subset_netcdf(f, time=("time", "valid_time"))
+            if dsx is not None:
+                if "valid_time" in dsx:
+                    dsx = dsx.rename(valid_time="time")
+                if "expver" in dsx.dims:
+                    dsx = dsx.mean(dim="expver", keep_attrs=True)
+                    dsx = dsx.drop("expver", errors='ignore')
+                dsset.append(dsx)
+            else:
+                skipped_list.append(f)
+
+        print("Concatenating xarray dataset")
+        ds = xr.concat(
+            dsset,
+            dim="time",
+            combine_attrs="override",
+            data_vars="minimal",
+            coords="minimal",
+            compat="override",
+        )
+
+        # Sort by time and drop duplicates (from overlaps)
+        ds = ds.sortby("time")
+        _, idx = np.unique(ds["time"], return_index=True)
+        ds = ds.isel(time=idx)
+
+        ds = ds.sortby("latitude")  # Latitudes should go south to north
+
+        return ds, skipped_list
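DownloadERA5WaveGCP reads from a 0-360 degree longitude grid and wraps the subset back to -180..180 before writing NetCDF. A small, self-contained sketch of that wrap-and-sort step on a synthetic dataset (the real store URL and variable list come from era5_gcp.yaml, which is not shown here; "swh" is just a stand-in variable name):

import numpy as np
import xarray as xr

lon = np.arange(0.0, 360.0, 30.0)  # 0..330 on the stored 0-360 grid
ds = xr.Dataset(
    {"swh": ("longitude", np.arange(lon.size, dtype=float))},
    coords={"longitude": lon},
)

# Wrap 0-360 longitudes to -180..180, then sort so the axis is monotonic again
ds = ds.assign_coords({"longitude": (ds["longitude"] + 180) % 360 - 180})
ds = ds.sortby("longitude")

print(ds["longitude"].values)  # [-180. -150. ... -30. 0. 30. ... 150.]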
tfv_get_tools/tide/_nodestring.py +214 -0
@@ -0,0 +1,214 @@
+from pathlib import Path
+from typing import Union, List, Any
+
+import numpy as np
+import pandas as pd
+import geopandas as gpd
+from pyproj import CRS
+from geopandas import GeoDataFrame
+from shapely.geometry import LineString, Point
+
+
+def _convert_id(x: Any):
+    """Simple ID entry conversion
+
+    Try to make each ID an integer, but fall back on a string.
+
+    Args:
+        x (Any): ID Field (e.g., 2, 'NS1')
+
+    Returns:
+        Union[int, str]: ID as either an int or str, with preference on integer.
+    """
+    try:
+        x = int(x)
+    except:
+        x = str(x)
+    return x
+
+
+def load_nodestring_shapefile(
+    filename: Union[Path, str], crs: int = None, process_ids: Union[tuple, list] = None
+) -> GeoDataFrame:
+    """Load a TUFLOW FV nodestring shapefile as a GeoDataFrame.
+
+    The CRS will be read from the .prj file if present. Otherwise, `crs=X` can be passed as an EPSG integer code.
+
+    By default, all the features in the nodestring will be loaded. Use `process_ids` arg to filter to only certain features.
+
+    Args:
+        filename (Union[Path, str]): Path to the nodestring .shp file
+        crs (int, optional): Coordinate reference system EPSG code. Defaults to None.
+        process_ids (Union[tuple, list], optional): List of ID's to process. Defaults to None.
+
+    Returns:
+        GeoDataFrame: Frame containing the geometry and ID features ready for processing.
+    """
+
+    gdf = gpd.read_file(filename, columns=["ID"])
+
+    # Set the CRS of the geodataframe. Assume 4326 as backup.
+    if crs is None:
+        if not gdf.crs:
+            print(
+                "No CRS could be read from the shapefile. Assuming nodestring is in latitude/longitude (EPSG 4326)"
+            )
+            gdf.set_crs(4326)
+    else:
+        try:
+            crs = CRS.from_epsg(crs)
+            gdf = gdf.set_crs(crs)
+        except:
+            raise ValueError(
+                f"Supplied CRS `{crs}` is not valid. Please provide an EPSG code as an integer, e.g., 7856"
+            )
+
+    # Parse the process ID's
+    assert "ID" in gdf.columns, "There must be an `ID` column present in the shapefile"
+    shp_ids = gdf["ID"].apply(_convert_id).tolist()
+
+    if process_ids is None:
+        # If no id's are supplied, then we take everything except obvious nan's.
+        process_ids = [x for x in shp_ids if x != "nan"]
+    else:
+        # If they are supplied, then we first convert to INT or STR
+        process_ids = [_convert_id(x) for x in process_ids]
+        for x in process_ids:
+            assert (
+                x in shp_ids
+            ), f"Nodestring feature ID `{x}` was not found in the shapefile"
+
+    # Do a check on geometry before moving on
+    checked_ids = []
+    for x in process_ids:
+        idx = shp_ids.index(x)
+        geo = gdf.loc[idx, "geometry"]
+
+        if geo is None:
+            print(
+                f"Warning - No geometry detected for Nodestring ID {x}. Skipping this feature..."
+            )
+        elif not isinstance(geo, LineString):
+            print(
+                f"Warning - Invalid geometry detected for Nodestring ID {x}. Must be a Linestring type. Skipping this feature..."
+            )
+        else:
+            checked_ids.append(x)
+
+    msk = [shp_ids.index(x) for x in checked_ids]
+    gdf = gdf.loc[msk]
+
+    return gdf
+
+
+def process_nodestring_gdf(gdf: GeoDataFrame, spacing=2500.0) -> dict:
+    """Generates a dictionary containing a Nx2 array of lat/lon coordinates from a nodestring geodataframe.
+
+    The geodataframe must have a `crs` set. It does not need to be lat/lon.
+
+    All features in the geodataframe will be processed - it is assumed that filtering has already happened.
+
+    Args:
+        gdf (GeoDataFrame): nodestring geodataframe.
+        spacing (float, optional): Spacing in meters. Defaults to 2500.0
+
+    Returns:
+        dict: nodestring dictionary where key is the `ID` and values are a Nx2 np.ndarray.
+    """
+
+    if gdf.crs is None:
+        raise ValueError("The nodestring geodataframe must have a CRS defined.")
+
+    coords = sample_coordinates_along_linestring(gdf, spacing)
+
+    process_ids = [_convert_id(x) for x in gdf["ID"]]
+
+    ns_dat = {}
+    for i, k in enumerate(process_ids):
+        coord_array = np.squeeze(np.asarray([x.xy for x in coords.iloc[i]]))
+        ns_dat[k] = coord_array
+
+    return ns_dat
+
+
+def sample_coordinates_along_linestring(
+    gdf: gpd.GeoDataFrame, spacing: float
+) -> gpd.GeoSeries:
+    """
+    Sample coordinates along LineString geometries in a GeoDataFrame.
+
+    Args:
+        gdf (gpd.GeoDataFrame): The input GeoDataFrame containing LineString geometries.
+        spacing (float): The desired spacing between sampled points in meters.
+
+    Returns:
+        gpd.GeoSeries: A GeoSeries where each element is a list of sampled Points in WGS84 coordinates.
+    """
+    return gdf.apply(lambda row: process_row(row, spacing, gdf.crs), axis=1)
+
+
+def sample_linestring(line: LineString, spacing_meters: float) -> List[Point]:
+    """
+    Sample points along a LineString at a specified spacing.
+
+    Args:
+        line (LineString): The LineString to sample points from.
+        spacing_meters (float): The spacing between points in meters.
+
+    Returns:
+        List[Point]: A list of sampled points along the LineString.
+
+    Raises:
+        ValueError: If the line is empty or the spacing is not positive.
+    """
+    if line.is_empty:
+        raise ValueError("The LineString is empty.")
+    if spacing_meters <= 0:
+        raise ValueError("Spacing must be a positive number.")
+
+    line_length = line.length
+    num_points = int(line_length / spacing_meters) + 1
+    distances = np.linspace(0, line_length, num_points)
+    return [line.interpolate(distance) for distance in distances]
+
+
+def process_row(row: pd.Series, spacing: float, crs: CRS) -> List[Point]:
+    """
+    Process a single row of a GeoDataFrame, sampling points along its LineString geometry.
+
+    This function handles both geographic (lat/lon) and projected coordinate systems.
+    It always returns the sampled points in WGS84 (EPSG:4326) lat/lon coordinates.
+
+    Args:
+        row (pd.Series): A row from a GeoDataFrame containing a 'geometry' column
+            with a LineString object.
+        spacing (float): The desired spacing between sampled points in meters.
+        crs (CRS): The coordinate reference system of the input geometry.
+
+    Returns:
+        List[Point]: A list of sampled points along the LineString, in lat/lon coordinates.
+
+    Raises:
+        ValueError: If the input geometry is not a LineString.
+    """
+    if not isinstance(row.geometry, LineString):
+        raise ValueError("The geometry must be a LineString.")
+
+    if crs.is_geographic:
+        # Convert to UTM for accurate distance calculations
+        centroid = row.geometry.centroid
+        utm_zone = int((centroid.x + 180) // 6 + 1)
+        utm_crs = CRS.from_proj4(
+            f"+proj=utm +zone={utm_zone} +datum=WGS84 +units=m +no_defs"
+        )
+        utm_geometry = gpd.GeoSeries([row.geometry], crs=crs).to_crs(utm_crs)[0]
+        sampled_points = sample_linestring(utm_geometry, spacing)
+        points_gdf = gpd.GeoDataFrame(geometry=sampled_points, crs=utm_crs)
+    else:
+        # If already projected, sample directly
+        sampled_points = sample_linestring(row.geometry, spacing)
+        points_gdf = gpd.GeoDataFrame(geometry=sampled_points, crs=crs)
+
+    # Convert sampled points to lat/lon (WGS84)
+    points_gdf = points_gdf.to_crs(epsg=4326)
+    return points_gdf.geometry.tolist()
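The nodestring helpers compose in sequence: load (or construct) a GeoDataFrame with an ID column and LineString geometry, then process_nodestring_gdf resamples each feature at the requested spacing and returns coordinate arrays keyed by ID. A usage sketch with a hand-built single-feature nodestring (the import path follows the file listing above; a real workflow would call load_nodestring_shapefile on a .shp file instead):

import geopandas as gpd
from shapely.geometry import LineString

from tfv_get_tools.tide._nodestring import process_nodestring_gdf

# One nodestring feature defined directly in lon/lat (EPSG:4326)
gdf = gpd.GeoDataFrame(
    {"ID": [1]},
    geometry=[LineString([(153.0, -27.5), (153.2, -27.3)])],
    crs=4326,
)

ns_dat = process_nodestring_gdf(gdf, spacing=2500.0)
print(ns_dat[1].shape)  # (N, 2) coordinates resampled every ~2.5 km along the line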