tfv-get-tools 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tfv_get_tools/__init__.py +4 -0
- tfv_get_tools/_standard_attrs.py +107 -0
- tfv_get_tools/atmos.py +167 -0
- tfv_get_tools/cli/_cli_base.py +173 -0
- tfv_get_tools/cli/atmos_cli.py +192 -0
- tfv_get_tools/cli/ocean_cli.py +204 -0
- tfv_get_tools/cli/tide_cli.py +118 -0
- tfv_get_tools/cli/wave_cli.py +183 -0
- tfv_get_tools/fvc/__init__.py +3 -0
- tfv_get_tools/fvc/_atmos.py +230 -0
- tfv_get_tools/fvc/_fvc.py +218 -0
- tfv_get_tools/fvc/_ocean.py +171 -0
- tfv_get_tools/fvc/_tide.py +195 -0
- tfv_get_tools/ocean.py +170 -0
- tfv_get_tools/providers/__init__.py +0 -0
- tfv_get_tools/providers/_custom_conversions.py +34 -0
- tfv_get_tools/providers/_downloader.py +566 -0
- tfv_get_tools/providers/_merger.py +520 -0
- tfv_get_tools/providers/_utilities.py +255 -0
- tfv_get_tools/providers/atmos/barra2.py +209 -0
- tfv_get_tools/providers/atmos/cfgs/barra2_c2.yaml +52 -0
- tfv_get_tools/providers/atmos/cfgs/barra2_r2.yaml +85 -0
- tfv_get_tools/providers/atmos/cfgs/barra2_re2.yaml +70 -0
- tfv_get_tools/providers/atmos/cfgs/cfsr.yaml +68 -0
- tfv_get_tools/providers/atmos/cfgs/era5.yaml +77 -0
- tfv_get_tools/providers/atmos/cfgs/era5_gcp.yaml +77 -0
- tfv_get_tools/providers/atmos/cfsr.py +207 -0
- tfv_get_tools/providers/atmos/era5.py +20 -0
- tfv_get_tools/providers/atmos/era5_gcp.py +20 -0
- tfv_get_tools/providers/ocean/cfgs/copernicus_blk.yaml +64 -0
- tfv_get_tools/providers/ocean/cfgs/copernicus_glo.yaml +67 -0
- tfv_get_tools/providers/ocean/cfgs/copernicus_nws.yaml +62 -0
- tfv_get_tools/providers/ocean/cfgs/hycom.yaml +73 -0
- tfv_get_tools/providers/ocean/copernicus_ocean.py +457 -0
- tfv_get_tools/providers/ocean/hycom.py +611 -0
- tfv_get_tools/providers/wave/cawcr.py +166 -0
- tfv_get_tools/providers/wave/cfgs/cawcr_aus_10m.yaml +39 -0
- tfv_get_tools/providers/wave/cfgs/cawcr_aus_4m.yaml +39 -0
- tfv_get_tools/providers/wave/cfgs/cawcr_glob_24m.yaml +39 -0
- tfv_get_tools/providers/wave/cfgs/cawcr_pac_10m.yaml +39 -0
- tfv_get_tools/providers/wave/cfgs/cawcr_pac_4m.yaml +39 -0
- tfv_get_tools/providers/wave/cfgs/copernicus_glo.yaml +56 -0
- tfv_get_tools/providers/wave/cfgs/copernicus_nws.yaml +51 -0
- tfv_get_tools/providers/wave/cfgs/era5.yaml +48 -0
- tfv_get_tools/providers/wave/cfgs/era5_gcp.yaml +48 -0
- tfv_get_tools/providers/wave/copernicus_wave.py +38 -0
- tfv_get_tools/providers/wave/era5.py +232 -0
- tfv_get_tools/providers/wave/era5_gcp.py +169 -0
- tfv_get_tools/tide/__init__.py +2 -0
- tfv_get_tools/tide/_nodestring.py +214 -0
- tfv_get_tools/tide/_tidal_base.py +568 -0
- tfv_get_tools/utilities/_tfv_bc.py +78 -0
- tfv_get_tools/utilities/horizontal_padding.py +89 -0
- tfv_get_tools/utilities/land_masking.py +93 -0
- tfv_get_tools/utilities/parsers.py +44 -0
- tfv_get_tools/utilities/warnings.py +38 -0
- tfv_get_tools/wave.py +179 -0
- tfv_get_tools-0.2.0.dist-info/METADATA +286 -0
- tfv_get_tools-0.2.0.dist-info/RECORD +62 -0
- tfv_get_tools-0.2.0.dist-info/WHEEL +5 -0
- tfv_get_tools-0.2.0.dist-info/entry_points.txt +5 -0
- tfv_get_tools-0.2.0.dist-info/top_level.txt +1 -0
tfv_get_tools/providers/ocean/hycom.py
@@ -0,0 +1,611 @@
from datetime import datetime, timedelta
from pathlib import Path
from time import sleep
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
import xarray as xr
from tqdm.auto import tqdm

from tfv_get_tools.providers._downloader import BaseDownloader
from tfv_get_tools.providers._merger import BaseMerger


class DownloadHycom(BaseDownloader):
    """Downloader class for HYCOM oceanographic data.

    Handles both single-database (pre-2024-08) and multi-database (post-2024-08) formats.
    """

    def _init_specific(self, **kwargs):
        """Initialize HYCOM specific attributes"""
        self.source = "HYCOM"
        self.mode = "OCEAN"
        self._load_config()

        # Dictionary to cache database coordinate information
        self.db_cache = {}
        # Track which variables are 2D (no depth dimension)
        self.two_dimensional_vars = ["surf_el"]

        # Convert time_interval to string if "best" is specified in model field
        if self.model and self.model.lower() == "best":
            self.time_interval = "best"
            # Reset model to default to avoid confusion in filename
            self.model = "default"
            if self.verbose:
                print("Using 'best' time interval: downloading all available timesteps")

    def _get_output_filename(self, date: datetime, db_name: str = None) -> Path:
        """Generate output filename based on date, time interval, and database name"""
        date_str = date.strftime('%Y%m%d')

        # Format time interval part of filename
        if self.time_interval == "best":
            interval_str = "best"
        else:
            interval_str = f"{self.time_interval:02d}h"

        # Construct filename
        if db_name:
            fname = f"{self.prefix}_{date_str}_{interval_str}_{db_name}.nc"
        else:
            fname = f"{self.prefix}_{date_str}_{interval_str}.nc"

        return self.outdir / fname
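    # Illustrative note (hypothetical values, not part of hycom.py): with
    # prefix="HYCOM", date=2024-01-15 and time_interval=24,
    # _get_output_filename() would yield
    #   HYCOM_20240115_24h.nc
    # and, in the post-2024-08 multi-database case with db_name="sur",
    #   HYCOM_20240115_24h_sur.nc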

    def _get_database(self, date: datetime) -> Union[str, Dict[str, List[str]], None]:
        """Get database URL or mapping for a date"""
        if not isinstance(date, datetime):
            raise ValueError("Input must be a datetime object")

        # Sort the dates in ascending order
        sorted_dates = sorted(
            self.dsmap.keys(), key=lambda x: datetime.strptime(x, "%Y-%m-%d")
        )

        for i, start_date_str in enumerate(sorted_dates):
            start_date = datetime.strptime(start_date_str, "%Y-%m-%d")

            # If it's the last item, or if the date is within this range
            if i == len(sorted_dates) - 1 or date < datetime.strptime(
                sorted_dates[i + 1], "%Y-%m-%d"
            ):
                if date >= start_date:
                    database_info = self.dsmap[start_date_str]

                    # Check if this is a dictionary with multiple databases (new format)
                    if isinstance(database_info, dict) and not isinstance(next(iter(database_info.values())), str):
                        # New format: mapping of database URLs to variable lists
                        result = {}
                        for db_url, var_list in database_info.items():
                            # Check if we need to filter variables based on user request
                            if self._custom_variables:
                                # Only include databases that have variables we need
                                filtered_vars = [v for v in var_list if v in self.variables]
                                if filtered_vars:
                                    formatted_url = db_url.format(year=date.year) if "{year}" in db_url else db_url
                                    result[formatted_url] = filtered_vars
                            else:
                                # Include all databases and their variables
                                formatted_url = db_url.format(year=date.year) if "{year}" in db_url else db_url
                                result[formatted_url] = var_list
                        return result
                    else:
                        # Original format: single database URL
                        database_url = next(iter(database_info.keys()))
                        if "{year}" in database_url:
                            return database_url.format(year=date.year)
                        return database_url
                break

        if self.verbose:
            print(f"No data for {date}")
        return None
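    # Illustrative note (schematic only, not part of hycom.py): _get_database()
    # accepts two shapes for a self.dsmap entry; the real mapping is read from
    # tfv_get_tools/providers/ocean/cfgs/hycom.yaml and the keys/URLs below are
    # placeholders:
    #   pre-2024-08, single database (dict values are strings):
    #     "1994-01-01": {"GLBv0.08/expt_53.X/data/{year}": "reanalysis"}
    #   post-2024-08, multiple databases (dict values are variable lists):
    #     "2024-08-10": {".../sur/{year}": ["surf_el"],
    #                    ".../t3z/{year}": ["water_temp"]}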

    def _initialize_database_coords(self, date: datetime, db_url: str, is_2d: bool):
        """Initialize coordinates for a database if not already cached"""
        db_key = f"{db_url}_{'2d' if is_2d else '3d'}"

        if db_key in self.db_cache:
            return True

        # Adjust longitude limits for post-2017 databases
        adjusted_xlims = self._adjust_longitude_limits(date, self.xlims)

        # Define bounding box
        if is_2d:
            bbox = {
                "xmin": adjusted_xlims[0],
                "xmax": adjusted_xlims[1],
                "ymin": self.ylims[0],
                "ymax": self.ylims[1],
            }
        else:
            bbox = {
                "xmin": adjusted_xlims[0],
                "xmax": adjusted_xlims[1],
                "ymin": self.ylims[0],
                "ymax": self.ylims[1],
                "zmin": self.zlims[0],
                "zmax": self.zlims[1],
            }

        return self._get_database_coords(date, db_url, bbox, not is_2d)

    def _download_hycom_file(self, date: datetime, db_url: str, var_list: List[str], output_file: Path) -> bool:
        """Download a single HYCOM file for given database and variables"""
        try:
            # Determine if this is a 2D database
            is_2d_db = all(var in self.two_dimensional_vars for var in var_list)

            # Initialize coordinates for this database
            if not self._initialize_database_coords(date, db_url, is_2d_db):
                return False

            # Get indices for this database
            idx_set = self._get_idxs(date, db_url, is_2d=is_2d_db)

            # Skip if no valid time indices
            if idx_set[0] == "0:1:0":
                if self.verbose:
                    print(f"No time indices found for {date} in {db_url}, skipping...")
                return False

            # Construct OpenDAP URL
            url = self._construct_opendap_url(var_list, idx_set, date, db_url, is_2d=is_2d_db)

            if self.verbose:
                self.log(url)

            # Download and save file
            return self._download_single_file(output_file, url)

        except Exception as e:
            if self.verbose:
                print(f"Error downloading from {db_url} for date {date}: {e}")
            return False

    def download(self):
        """HYCOM-specific download loop - yields tasks for new base class"""
        for date in self.times:
            # Get database mapping for this date
            db_info = self._get_database(date)
            if db_info is None:
                continue

            # Check if we have a multi-database situation (new format)
            is_multi_db = isinstance(db_info, dict)

            if is_multi_db:
                # Multi-database format: yield one task per database
                for db_url, var_list in db_info.items():
                    db_name = db_url.split('/')[-1]
                    output_file = self._get_output_filename(date, db_name)

                    yield {
                        'file_path': output_file,
                        'url': db_url,
                        'timestamp': date,
                        'variable': f"{len(var_list)}_vars",
                        'download_func': lambda d=date, url=db_url, vars=var_list, out=output_file:
                            self._download_hycom_file(d, url, vars, out)
                    }
            else:
                # Single database format
                output_file = self._get_output_filename(date)

                yield {
                    'file_path': output_file,
                    'url': db_info,
                    'timestamp': date,
                    'variable': f"{len(self.variables)}_vars",
                    'download_func': lambda d=date, url=db_info, vars=self.variables, out=output_file:
                        self._download_hycom_file(d, url, vars, out)
                }
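    # Illustrative note (hypothetical values, not part of hycom.py): each task
    # yielded by download() is a plain dict consumed by the base class, roughly:
    #   {'file_path': Path('HYCOM_20230101_24h.nc'), 'url': '<database id>',
    #    'timestamp': datetime(2023, 1, 1), 'variable': '4_vars',
    #    'download_func': <lambda calling _download_hycom_file>}
    # The prefix, variable count and database id above are placeholders.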

    # All the complex HYCOM-specific methods remain the same
    def _adjust_longitude_limits(self, date: datetime, xlims: Tuple[float, float]) -> Tuple[float, float]:
        """Adjust longitude limits if needed based on date"""
        xmin, xmax = xlims
        # HYCOM-specific logic: xmin/xmax need to be shifted to the 0-360 convention after this date
        if date >= datetime(2017, 10, 1):
            xmin = xmin + 360.0 if xmin < 0 else xmin
            xmax = xmax + 360.0 if xmax <= 0 else xmax
        return (xmin, xmax)
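    # Illustrative note (worked example, not part of hycom.py): a request with
    # xlims=(-72.5, -60.0) on or after 2017-10-01 is remapped to (287.5, 300.0),
    # matching the 0-360 longitude axis of the later HYCOM grids; earlier dates
    # keep the -180..180 values as given.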

    def _get_database_coords(self, date: datetime, database: str, bbox: Dict[str, float], include_depth: bool) -> bool:
        """Get coordinates for a database"""
        # Create key for database cache
        db_key = f"{database}_{'3d' if include_depth else '2d'}"

        # Check if coordinates are already cached
        if db_key in self.db_cache:
            return True

        # Construct base URL for coordinates
        if include_depth:
            baseurl = f"https://tds.hycom.org/thredds/dodsC/{database}?lat,lon,time,depth"
        else:
            baseurl = f"https://tds.hycom.org/thredds/dodsC/{database}?lat,lon,time"

        if self.verbose:
            print(f"-- Getting coordinates for database: {database} ({'3D' if include_depth else '2D'}) --")

        try:
            ds = xr.open_dataset(baseurl)

            # Extract coordinates
            lon = ds["lon"].values
            lat = ds["lat"].values
            times = ds["time"].values

            # Find indices within bounds
            lat_idxs = np.where((lat >= bbox["ymin"]) & (lat <= bbox["ymax"]))[0]
            lon_idxs = np.where((lon >= bbox["xmin"]) & (lon <= bbox["xmax"]))[0]

            # Create cache entry
            cache_entry = {
                'times': times,
                'lon_idxs': lon_idxs,
                'lat_idxs': lat_idxs,
            }

            # Add depth if needed
            if include_depth:
                dep = ds["depth"].values
                dep_idxs = np.where((dep >= bbox["zmin"]) & (dep <= bbox["zmax"]))[0]
                cache_entry['dep_idxs'] = dep_idxs

            # Store in cache
            self.db_cache[db_key] = cache_entry

            if self.verbose:
                print(f"Coordinates cached for {db_key}")
                print(f" Time shape: {times.shape}")
                print(f" Lon indices: {len(lon_idxs)}")
                print(f" Lat indices: {len(lat_idxs)}")
                if include_depth:
                    print(f" Depth indices: {len(dep_idxs)}")

            return True

        except Exception as e:
            print(f"Error retrieving coordinates for database {database}: {e}")
            # Create an empty entry to prevent repeated attempts
            self.db_cache[db_key] = {
                'times': np.array([]),
                'lon_idxs': np.array([]),
                'lat_idxs': np.array([]),
            }
            if include_depth:
                self.db_cache[db_key]['dep_idxs'] = np.array([])

            return False

    def _find_time_indices(self, date: datetime, times: np.ndarray) -> np.ndarray:
        """Find indices in times array that match the given date at specified time intervals"""
        # Convert to datetime for consistent handling
        date_dt = pd.Timestamp(date).floor('us').to_pydatetime()
        # Create a date-only object for comparison
        date_only = date_dt.replace(hour=0, minute=0, second=0, microsecond=0)
        next_day = date_only + timedelta(days=1)

        # Get all timesteps within this day
        day_indices = []
        for i, t in enumerate(times):
            t_dt = pd.Timestamp(t).floor('us').to_pydatetime()
            # Check if the timestamp is on the same date
            if date_only <= t_dt < next_day:
                day_indices.append((i, t_dt))

        # If no timestamps found for this day, return empty array
        if not day_indices:
            return np.array([])

        # Handle "best" time interval - return all available timesteps
        if self.time_interval == "best":
            return np.array([idx for idx, _ in day_indices])

        # For daily interval (24 hours), try to find a timestamp closest to noon
        if self.time_interval >= 24:
            # Target time is noon
            target_time = date_only + timedelta(hours=12)

            # Find timestamp closest to noon
            closest_idx = min(day_indices, key=lambda x: abs((x[1] - target_time).total_seconds()))[0]
            return np.array([closest_idx])

        # For sub-daily intervals (e.g., 3, 6, 12 hours)
        selected_indices = []
        for hour in range(0, 24, self.time_interval):
            target_time = date_only + timedelta(hours=hour)  # Target start of interval

            # Find indices within this interval
            interval_indices = [
                (idx, dt) for idx, dt in day_indices
                if date_only + timedelta(hours=hour) <= dt < date_only + timedelta(hours=hour + self.time_interval)
            ]

            if interval_indices:
                # Find closest to the start of the interval
                closest_idx = min(interval_indices, key=lambda x: abs((x[1] - target_time).total_seconds()))[0]
                selected_indices.append(closest_idx)

        return np.array(selected_indices)
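    # Illustrative note (worked example, not part of hycom.py): with 3-hourly
    # HYCOM timestamps (00, 03, ..., 21) and time_interval=6,
    # _find_time_indices() keeps the steps nearest 00, 06, 12 and 18; with
    # time_interval=24 it keeps the single step nearest noon; with
    # time_interval="best" it keeps every step in the day.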

    def _format_opendap_slice(self, indices: np.ndarray) -> str:
        """Convert array of indices into OpenDAP slice format (start:step:stop)"""
        if len(indices) == 0:
            return "0:1:0"  # Empty slice

        if len(indices) == 1:
            return f"{indices[0]}:1:{indices[0]}"

        # Calculate differences between consecutive indices
        diff = np.diff(indices)

        # Check if we have a constant step size
        unique_steps = np.unique(diff)

        if len(unique_steps) == 1:
            # Continuous sequence with constant step
            step = unique_steps[0]
            return f"{indices[0]}:{step}:{indices[-1]}"
        else:
            # Try to find the most common step size
            step = np.bincount(diff).argmax()
            return f"{indices[0]}:{step}:{indices[-1]}"
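    # Illustrative note (worked examples, not part of hycom.py):
    #   indices [5, 8, 11, 14] -> "5:3:14"   (constant step of 3)
    #   indices [7]            -> "7:1:7"    (single index)
    #   indices []             -> "0:1:0"    (sentinel for "nothing to fetch")
    # When the step is not constant, the most common step between indices is
    # used, so the resulting slice is an approximation of the index set.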

    def _get_idxs(self, date: datetime, database: str, is_2d: bool = False) -> Tuple[str, str, str, Optional[str]]:
        """Get indices for the given database"""
        # Create key for database lookup
        db_key = f"{database}_{'2d' if is_2d else '3d'}"

        # Check if database has been initialized
        if db_key not in self.db_cache:
            if self.verbose:
                print(f"Warning: Database {db_key} not initialized. Unable to get indices.")
            return "0:1:0", "0:1:0", "0:1:0", None if is_2d else "0:1:0"

        # Get database data
        db_data = self.db_cache[db_key]

        # Check if database has valid data
        if len(db_data['times']) == 0:
            if self.verbose:
                print(f"Warning: No time data for {db_key}. Unable to get indices.")
            return "0:1:0", "0:1:0", "0:1:0", None if is_2d else "0:1:0"

        # Get the database-specific indices and times
        times = db_data['times']
        lon_idxs = db_data['lon_idxs']
        lat_idxs = db_data['lat_idxs']

        # Format basic indices for OpenDAP
        lon_idx = self._format_opendap_slice(lon_idxs)
        lat_idx = self._format_opendap_slice(lat_idxs)

        # For 2D variables (like surf_el), don't include depth
        if is_2d:
            dep_idx = None
        else:
            dep_idxs = db_data['dep_idxs']
            dep_idx = self._format_opendap_slice(dep_idxs)

        # Find time indices for this date
        try:
            time_idxs = self._find_time_indices(date, times)
            if len(time_idxs) == 0:
                if self.verbose:
                    print(f"Warning: No time indices found for {date} in {db_key}.")
                    print(f"Available times range: {pd.Timestamp(times[0])} to {pd.Timestamp(times[-1])}")
                return "0:1:0", lon_idx, lat_idx, dep_idx

            time_idx = self._format_opendap_slice(time_idxs)
            return time_idx, lon_idx, lat_idx, dep_idx
        except Exception as e:
            print(f"Error finding time indices for {date} in {db_key}: {e}")
            return "0:1:0", lon_idx, lat_idx, dep_idx

    def _construct_opendap_url(self, variables: List[str], idx_set: Tuple[str, str, str, Optional[str]],
                               date: datetime, database: str, is_2d: bool = False) -> str:
        """Construct OpenDAP URL for the given variables and indices"""
        time_idx, lon_idx, lat_idx, dep_idx = idx_set

        # For future dates (forecasts), use a different URL format
        if date >= datetime.now():
            date2 = datetime.now() - timedelta(days=1)
            base_url = (
                f"https://tds.hycom.org/thredds/dodsC/{database}/FMRC/runs/GLBy0.08_930_FMRC_RUN_"
                + f'{date2.strftime("%Y-%m-%dT12:00:00Z")}?'
            )
        else:
            base_url = f"https://tds.hycom.org/thredds/dodsC/{database}?"

        # Add coordinate subsetting
        if is_2d:
            # For 2D variables (no depth)
            url = base_url + f"lat[{lat_idx}],lon[{lon_idx}],time[{time_idx}]"
        else:
            # For 3D variables (with depth)
            url = base_url + f"lat[{lat_idx}],lon[{lon_idx}],depth[{dep_idx}],time[{time_idx}]"

        # Add variable subsetting
        for v in variables:
            if v == "surf_el" or is_2d:
                # 2D variable (no depth dimension)
                url += f",{v}[{time_idx}][{lat_idx}][{lon_idx}]"
            else:
                # 3D variable (with depth dimension)
                url += f",{v}[{time_idx}][{dep_idx}][{lat_idx}][{lon_idx}]"

        return url
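    # Illustrative note (placeholder indices, not part of hycom.py): a
    # constructed URL has the general shape
    #   https://tds.hycom.org/thredds/dodsC/<database>?lat[100:1:220],lon[300:1:420],
    #   depth[0:1:33],time[8:1:8],water_temp[8:1:8][0:1:33][100:1:220][300:1:420]
    # i.e. the coordinate vectors plus each requested variable, all subset with
    # the OpenDAP index slices computed above.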

    def _download_single_file(self, fname: Path, url: str) -> bool:
        """Download a single HYCOM subset file"""
        assert fname.parent.exists(), "Output folder does not exist. Please create this first"

        try:
            ds = xr.open_dataset(url)
            ds.to_netcdf(fname, format="NETCDF4")
            assert fname.exists()
            return True
        except Exception as e:
            if self.verbose:
                print(f"File download failed: {e}")
            sleep(self.timeout)

            # Retry with decreasing tries
            if self.max_tries > 1:
                if self.verbose:
                    print(f"Retrying... {self.max_tries-1} tries left")
                self.max_tries -= 1
                return self._download_single_file(fname, url)
            return False

class MergeHYCOM(BaseMerger):
    def _init_specific(self) -> None:
        self.source = "HYCOM"
        self.mode = "OCEAN"
        self._load_config()

    def _extract_target_coordinates(self, datasets: List[xr.Dataset]) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]]:
        """Extract coordinates from the most complete dataset (prefer later models)."""
        # Use last dataset (reverse sorted) as it has extended coordinates
        for ds in reversed(datasets):
            if ds is not None:
                lon = ds['lon'].values
                lat = ds['lat'].values
                depth = ds['depth'].values if 'depth' in ds else None
                return lon, lat, depth

        raise ValueError("No valid datasets found for coordinate extraction")

    def _process_file(self, file_path: Path) -> Optional[xr.Dataset]:
        """Load and apply basic HYCOM processing."""
        ds = self._open_subset_netcdf(file_path, chunks=dict(time=24))
        if ds is not None:
            # Apply longitude wrapping (crucial for HYCOM because of mixed grids!!)
            ds = ds.assign_coords(lon=((ds.lon + 180) % 360) - 180)
        return ds

    def _check_sub_daily_data(self, ds: xr.Dataset) -> bool:
        """Check if dataset contains sub-daily (< 24 hour) temporal resolution."""
        if ds is not None and len(ds['time']) > 1:
            time_diffs = np.diff(ds['time'].values).astype('timedelta64[h]').astype(int)
            return np.any(time_diffs < 24)
        return False

    def _apply_tidal_filtering(self, ds: xr.Dataset) -> xr.Dataset:
        """Apply 25-hour centered rolling mean to remove tidal signal from post-2024-08-09 data."""
        cutoff_time = pd.Timestamp('2024-08-09 23:59:00')

        # Filter time indices after cutoff
        post_cutoff_mask = ds['time'] > cutoff_time

        if not post_cutoff_mask.any():
            return ds  # No data after cutoff, return unchanged

        # Apply rolling mean only to post-cutoff data
        ds_filtered = ds.copy()

        # Process all time-dependent variables at once to avoid repeated rolling operations
        time_vars = [var for var in ds.data_vars if 'time' in ds[var].dims]

        if time_vars:
            # Create single rolling object and apply to all variables
            rolling_ds = ds[time_vars].rolling(time=25, center=True).reduce(np.nanmean)

            for var_name in time_vars:
                # Only replace post-cutoff values
                ds_filtered[var_name] = xr.where(
                    post_cutoff_mask,
                    rolling_ds[var_name],
                    ds[var_name]
                )

        return ds_filtered
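    # Illustrative note (editorial, not part of hycom.py): the 25-hour centred
    # window spans roughly two semi-diurnal (~12.42 h) tidal cycles and one
    # diurnal cycle, so averaging over it strongly damps the tidal signal that
    # the hourly ESPC-D-V02 output resolves while preserving slower variability.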

    def merge_files(self, file_list: List[Path]) -> Tuple[xr.Dataset, List[Path]]:
        """Merge HYCOM files: group by startdate, merge variables, then concat time."""
        if not file_list:
            raise ValueError("No files provided for merging")

        # Sort reverse to get extended coordinates from later models
        file_list.sort(reverse=True)

        # Group by start date
        startdates = [f.stem.split('_')[2] for f in file_list]
        grouped_files = {date: [] for date in np.unique(startdates)}

        # Load files and group them
        all_datasets = []
        skipped_files = []

        for i, file_path in enumerate(tqdm(file_list, disable=not self.verbose)):
            ds = self._process_file(file_path)
            if ds is not None:
                grouped_files[startdates[i]].append(ds)
                all_datasets.append(ds)
            else:
                skipped_files.append(file_path)

        if not all_datasets:
            raise ValueError("No valid datasets could be loaded")

        # Extract target coordinates
        target_lon, target_lat, target_depth = self._extract_target_coordinates(all_datasets)

        # Check if we have post-2024-08-10 data and if it's sub-daily
        cutoff_date = pd.Timestamp('2024-08-10')
        has_post_cutoff_data = False
        apply_tidal_filtering = False

        # Check latest startdate (files are reverse sorted)
        if startdates:
            latest_startdate = pd.Timestamp(startdates[0])
            if latest_startdate >= cutoff_date:
                has_post_cutoff_data = True
                # Check the most recent dataset for sub-daily data
                if all_datasets and self._check_sub_daily_data(all_datasets[0]):
                    apply_tidal_filtering = True

        if self.verbose:
            print("Concatenating and interpolating xarray dataset")
            if has_post_cutoff_data and apply_tidal_filtering:
                print('... Dataset contains sub-daily data post-2024-08-10 (HYCOM ESPC-D-V02), applying tidal filtering.')

        # Merge variables for each start date group
        merged_by_date = []
        for date_group in grouped_files.values():
            if date_group:
                interpolated = []
                for ds in date_group:
                    # Interpolate to common grid (2D or 3D as appropriate)
                    if target_depth is not None and 'depth' in ds.dims:
                        ds_interp = ds.interp(lon=target_lon, lat=target_lat, depth=target_depth,
                                              method='nearest', kwargs=dict(fill_value='extrapolate'))
                    else:
                        ds_interp = ds.interp(lon=target_lon, lat=target_lat,
                                              method='nearest', kwargs=dict(fill_value='extrapolate'))
                    interpolated.append(ds_interp)

                merged_by_date.append(xr.merge(interpolated, compat='override'))

        # Concatenate along time dimension
        merged = xr.concat(merged_by_date, dim="time", combine_attrs="override",
                           data_vars="minimal", coords="minimal", compat="override")

        # Apply tidal filtering to the merged dataset if needed
        if apply_tidal_filtering:
            merged = self._apply_tidal_filtering(merged)

        # Final cleanup
        merged = merged.rename({'lon': 'longitude', 'lat': 'latitude'})
        merged = merged.sortby('time')
        _, unique_idx = np.unique(merged['time'], return_index=True)
        merged = merged.isel(time=unique_idx)

        return merged, skipped_files
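
A minimal, self-contained sketch of the 25-hour de-tiding step used by MergeHYCOM._apply_tidal_filtering, run on synthetic hourly data. It assumes only the numpy/pandas/xarray stack that hycom.py already imports; the variable name surf_el simply mirrors the 2D variable tracked by the downloader and the dates are arbitrary.

import numpy as np
import pandas as pd
import xarray as xr

# Synthetic hourly sea-surface elevation: a slow trend plus a ~12.42 h tide
times = pd.date_range("2024-08-11", periods=240, freq="h")
hours = np.arange(times.size)
surf_el = 0.001 * hours + 0.5 * np.sin(2 * np.pi * hours / 12.42)
ds = xr.Dataset({"surf_el": ("time", surf_el)}, coords={"time": times})

# 25-hour centred rolling mean, as applied in _apply_tidal_filtering
filtered = ds.rolling(time=25, center=True).reduce(np.nanmean)

# The tidal oscillation is strongly damped while the slow trend survives
print("raw std:     ", float(ds["surf_el"].std()))
print("filtered std:", float(filtered["surf_el"].std()))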