tfv-get-tools 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. tfv_get_tools/__init__.py +4 -0
  2. tfv_get_tools/_standard_attrs.py +107 -0
  3. tfv_get_tools/atmos.py +167 -0
  4. tfv_get_tools/cli/_cli_base.py +173 -0
  5. tfv_get_tools/cli/atmos_cli.py +192 -0
  6. tfv_get_tools/cli/ocean_cli.py +204 -0
  7. tfv_get_tools/cli/tide_cli.py +118 -0
  8. tfv_get_tools/cli/wave_cli.py +183 -0
  9. tfv_get_tools/fvc/__init__.py +3 -0
  10. tfv_get_tools/fvc/_atmos.py +230 -0
  11. tfv_get_tools/fvc/_fvc.py +218 -0
  12. tfv_get_tools/fvc/_ocean.py +171 -0
  13. tfv_get_tools/fvc/_tide.py +195 -0
  14. tfv_get_tools/ocean.py +170 -0
  15. tfv_get_tools/providers/__init__.py +0 -0
  16. tfv_get_tools/providers/_custom_conversions.py +34 -0
  17. tfv_get_tools/providers/_downloader.py +566 -0
  18. tfv_get_tools/providers/_merger.py +520 -0
  19. tfv_get_tools/providers/_utilities.py +255 -0
  20. tfv_get_tools/providers/atmos/barra2.py +209 -0
  21. tfv_get_tools/providers/atmos/cfgs/barra2_c2.yaml +52 -0
  22. tfv_get_tools/providers/atmos/cfgs/barra2_r2.yaml +85 -0
  23. tfv_get_tools/providers/atmos/cfgs/barra2_re2.yaml +70 -0
  24. tfv_get_tools/providers/atmos/cfgs/cfsr.yaml +68 -0
  25. tfv_get_tools/providers/atmos/cfgs/era5.yaml +77 -0
  26. tfv_get_tools/providers/atmos/cfgs/era5_gcp.yaml +77 -0
  27. tfv_get_tools/providers/atmos/cfsr.py +207 -0
  28. tfv_get_tools/providers/atmos/era5.py +20 -0
  29. tfv_get_tools/providers/atmos/era5_gcp.py +20 -0
  30. tfv_get_tools/providers/ocean/cfgs/copernicus_blk.yaml +64 -0
  31. tfv_get_tools/providers/ocean/cfgs/copernicus_glo.yaml +67 -0
  32. tfv_get_tools/providers/ocean/cfgs/copernicus_nws.yaml +62 -0
  33. tfv_get_tools/providers/ocean/cfgs/hycom.yaml +73 -0
  34. tfv_get_tools/providers/ocean/copernicus_ocean.py +457 -0
  35. tfv_get_tools/providers/ocean/hycom.py +611 -0
  36. tfv_get_tools/providers/wave/cawcr.py +166 -0
  37. tfv_get_tools/providers/wave/cfgs/cawcr_aus_10m.yaml +39 -0
  38. tfv_get_tools/providers/wave/cfgs/cawcr_aus_4m.yaml +39 -0
  39. tfv_get_tools/providers/wave/cfgs/cawcr_glob_24m.yaml +39 -0
  40. tfv_get_tools/providers/wave/cfgs/cawcr_pac_10m.yaml +39 -0
  41. tfv_get_tools/providers/wave/cfgs/cawcr_pac_4m.yaml +39 -0
  42. tfv_get_tools/providers/wave/cfgs/copernicus_glo.yaml +56 -0
  43. tfv_get_tools/providers/wave/cfgs/copernicus_nws.yaml +51 -0
  44. tfv_get_tools/providers/wave/cfgs/era5.yaml +48 -0
  45. tfv_get_tools/providers/wave/cfgs/era5_gcp.yaml +48 -0
  46. tfv_get_tools/providers/wave/copernicus_wave.py +38 -0
  47. tfv_get_tools/providers/wave/era5.py +232 -0
  48. tfv_get_tools/providers/wave/era5_gcp.py +169 -0
  49. tfv_get_tools/tide/__init__.py +2 -0
  50. tfv_get_tools/tide/_nodestring.py +214 -0
  51. tfv_get_tools/tide/_tidal_base.py +568 -0
  52. tfv_get_tools/utilities/_tfv_bc.py +78 -0
  53. tfv_get_tools/utilities/horizontal_padding.py +89 -0
  54. tfv_get_tools/utilities/land_masking.py +93 -0
  55. tfv_get_tools/utilities/parsers.py +44 -0
  56. tfv_get_tools/utilities/warnings.py +38 -0
  57. tfv_get_tools/wave.py +179 -0
  58. tfv_get_tools-0.2.0.dist-info/METADATA +286 -0
  59. tfv_get_tools-0.2.0.dist-info/RECORD +62 -0
  60. tfv_get_tools-0.2.0.dist-info/WHEEL +5 -0
  61. tfv_get_tools-0.2.0.dist-info/entry_points.txt +5 -0
  62. tfv_get_tools-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,611 @@
+ from datetime import datetime, timedelta
+ from pathlib import Path
+ from time import sleep
+ from typing import Dict, List, Optional, Tuple, Union
+
+ import numpy as np
+ import pandas as pd
+ import xarray as xr
+ from tqdm.auto import tqdm
+
+ from tfv_get_tools.providers._downloader import BaseDownloader
+ from tfv_get_tools.providers._merger import BaseMerger
+
+
+ class DownloadHycom(BaseDownloader):
+     """Downloader class for HYCOM oceanographic data.
+
+     Handles both single-database (pre-2024-08) and multi-database (post-2024-08) formats.
+     """
+
+     def _init_specific(self, **kwargs):
+         """Initialize HYCOM-specific attributes."""
+         self.source = "HYCOM"
+         self.mode = "OCEAN"
+         self._load_config()
+
+         # Dictionary to cache database coordinate information
+         self.db_cache = {}
+         # Track which variables are 2D (no depth dimension)
+         self.two_dimensional_vars = ["surf_el"]
+
+         # Switch to the "best" time interval if requested via the model field
+         if self.model and self.model.lower() == "best":
+             self.time_interval = "best"
+             # Reset model to default to avoid confusion in the filename
+             self.model = "default"
+             if self.verbose:
+                 print("Using 'best' time interval: downloading all available timesteps")
+
+     def _get_output_filename(self, date: datetime, db_name: str = None) -> Path:
+         """Generate output filename based on date, time interval, and database name."""
+         date_str = date.strftime('%Y%m%d')
+
+         # Format the time-interval part of the filename
+         if self.time_interval == "best":
+             interval_str = "best"
+         else:
+             interval_str = f"{self.time_interval:02d}h"
+
+         # Construct filename
+         if db_name:
+             fname = f"{self.prefix}_{date_str}_{interval_str}_{db_name}.nc"
+         else:
+             fname = f"{self.prefix}_{date_str}_{interval_str}.nc"
+
+         return self.outdir / fname
+
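+     # Illustrative outputs (the prefix is config-driven; "HYCOM_OCEAN" and the
+     # database name "ts3z" are assumed here): a 2024-08-12 request at a 24-hour
+     # interval yields "HYCOM_OCEAN_20240812_24h.nc", or
+     # "HYCOM_OCEAN_20240812_24h_ts3z.nc" when a database name is given; with
+     # time_interval == "best" the interval part is simply "best".
+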
+     def _get_database(self, date: datetime) -> Union[str, Dict[str, List[str]], None]:
+         """Get database URL or mapping for a date."""
+         if not isinstance(date, datetime):
+             raise ValueError("Input must be a datetime object")
+
+         # Sort the dates in ascending order
+         sorted_dates = sorted(
+             self.dsmap.keys(), key=lambda x: datetime.strptime(x, "%Y-%m-%d")
+         )
+
+         for i, start_date_str in enumerate(sorted_dates):
+             start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
+
+             # If it's the last item, or if the date is within this range
+             if i == len(sorted_dates) - 1 or date < datetime.strptime(
+                 sorted_dates[i + 1], "%Y-%m-%d"
+             ):
+                 if date >= start_date:
+                     database_info = self.dsmap[start_date_str]
+
+                     # Check if this is a dictionary with multiple databases (new format)
+                     if isinstance(database_info, dict) and not isinstance(next(iter(database_info.values())), str):
+                         # New format: mapping of database URLs to variable lists
+                         result = {}
+                         for db_url, var_list in database_info.items():
+                             # Check if we need to filter variables based on user request
+                             if self._custom_variables:
+                                 # Only include databases that have variables we need
+                                 filtered_vars = [v for v in var_list if v in self.variables]
+                                 if filtered_vars:
+                                     formatted_url = db_url.format(year=date.year) if "{year}" in db_url else db_url
+                                     result[formatted_url] = filtered_vars
+                             else:
+                                 # Include all databases and their variables
+                                 formatted_url = db_url.format(year=date.year) if "{year}" in db_url else db_url
+                                 result[formatted_url] = var_list
+                         return result
+                     else:
+                         # Original format: single database URL
+                         database_url = next(iter(database_info.keys()))
+                         if "{year}" in database_url:
+                             return database_url.format(year=date.year)
+                         return database_url
+                 break
+
+         if self.verbose:
+             print(f"No data for {date}")
+         return None
+
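+     # The two dsmap shapes this method distinguishes (illustrative entries only;
+     # the real URLs and variable lists come from the hycom.yaml config):
+     #     old format - one URL per start date, with string values:
+     #         {"1994-01-01": {"GLBv0.08/expt_53.X/data/{year}": "..."}}
+     #     new format - one variable list per database URL:
+     #         {"2024-08-10": {"ESPC-D-V02/t3z": ["water_temp"],
+     #                         "ESPC-D-V02/ssh": ["surf_el"]}}
+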
+     def _initialize_database_coords(self, date: datetime, db_url: str, is_2d: bool):
+         """Initialize coordinates for a database if not already cached."""
+         db_key = f"{db_url}_{'2d' if is_2d else '3d'}"
+
+         if db_key in self.db_cache:
+             return True
+
+         # Adjust longitude limits for post-2017 databases
+         adjusted_xlims = self._adjust_longitude_limits(date, self.xlims)
+
+         # Define bounding box
+         if is_2d:
+             bbox = {
+                 "xmin": adjusted_xlims[0],
+                 "xmax": adjusted_xlims[1],
+                 "ymin": self.ylims[0],
+                 "ymax": self.ylims[1],
+             }
+         else:
+             bbox = {
+                 "xmin": adjusted_xlims[0],
+                 "xmax": adjusted_xlims[1],
+                 "ymin": self.ylims[0],
+                 "ymax": self.ylims[1],
+                 "zmin": self.zlims[0],
+                 "zmax": self.zlims[1],
+             }
+
+         return self._get_database_coords(date, db_url, bbox, not is_2d)
+
+     def _download_hycom_file(self, date: datetime, db_url: str, var_list: List[str], output_file: Path) -> bool:
+         """Download a single HYCOM file for the given database and variables."""
+         try:
+             # Determine if this is a 2D database
+             is_2d_db = all(var in self.two_dimensional_vars for var in var_list)
+
+             # Initialize coordinates for this database
+             if not self._initialize_database_coords(date, db_url, is_2d_db):
+                 return False
+
+             # Get indices for this database
+             idx_set = self._get_idxs(date, db_url, is_2d=is_2d_db)
+
+             # Skip if no valid time indices
+             if idx_set[0] == "0:1:0":
+                 if self.verbose:
+                     print(f"No time indices found for {date} in {db_url}, skipping...")
+                 return False
+
+             # Construct OPeNDAP URL
+             url = self._construct_opendap_url(var_list, idx_set, date, db_url, is_2d=is_2d_db)
+
+             if self.verbose:
+                 self.log(url)
+
+             # Download and save file
+             return self._download_single_file(output_file, url)
+
+         except Exception as e:
+             if self.verbose:
+                 print(f"Error downloading from {db_url} for date {date}: {e}")
+             return False
+
+     def download(self):
+         """HYCOM-specific download loop - yields tasks for the new base class."""
+         for date in self.times:
+             # Get database mapping for this date
+             db_info = self._get_database(date)
+             if db_info is None:
+                 continue
+
+             # Check if we have a multi-database situation (new format)
+             is_multi_db = isinstance(db_info, dict)
+
+             if is_multi_db:
+                 # Multi-database format: yield one task per database. Loop
+                 # variables are bound as lambda defaults so each task captures
+                 # its own date/url/variables rather than the last iteration's.
+                 for db_url, var_list in db_info.items():
+                     db_name = db_url.split('/')[-1]
+                     output_file = self._get_output_filename(date, db_name)
+
+                     yield {
+                         'file_path': output_file,
+                         'url': db_url,
+                         'timestamp': date,
+                         'variable': f"{len(var_list)}_vars",
+                         'download_func': lambda d=date, url=db_url, vars=var_list, out=output_file:
+                             self._download_hycom_file(d, url, vars, out)
+                     }
+             else:
+                 # Single database format
+                 output_file = self._get_output_filename(date)
+
+                 yield {
+                     'file_path': output_file,
+                     'url': db_info,
+                     'timestamp': date,
+                     'variable': f"{len(self.variables)}_vars",
+                     'download_func': lambda d=date, url=db_info, vars=self.variables, out=output_file:
+                         self._download_hycom_file(d, url, vars, out)
+                 }
+
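+     # Each yielded task bundles everything needed to fetch one file; a consumer
+     # (presumably BaseDownloader's run loop, defined elsewhere) only has to call
+     # it, e.g. (illustrative sketch):
+     #     for task in downloader.download():
+     #         if not task['file_path'].exists():
+     #             success = task['download_func']()
+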
+     # HYCOM-specific helpers: coordinate handling, OPeNDAP indexing, and URL construction
+     def _adjust_longitude_limits(self, date: datetime, xlims: Tuple[float, float]) -> Tuple[float, float]:
+         """Adjust longitude limits if needed based on date."""
+         xmin, xmax = xlims
+         # HYCOM-specific: databases from this date onwards use 0-360 longitudes,
+         # so negative limits need to be shifted up by 360 degrees
+         if date >= datetime(2017, 10, 1):
+             xmin = xmin + 360.0 if xmin < 0 else xmin
+             xmax = xmax + 360.0 if xmax <= 0 else xmax
+         return (xmin, xmax)
+
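+     # E.g. (illustrative): xlims of (-70.5, -60.0) on 2018-01-01 become
+     # (289.5, 300.0), matching the 0-360 longitude convention of the newer
+     # grids; the same limits on 2015-01-01 are returned unchanged.
+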
+     def _get_database_coords(self, date: datetime, database: str, bbox: Dict[str, float], include_depth: bool) -> bool:
+         """Get coordinates for a database."""
+         # Create key for database cache
+         db_key = f"{database}_{'3d' if include_depth else '2d'}"
+
+         # Check if coordinates are already cached
+         if db_key in self.db_cache:
+             return True
+
+         # Construct base URL for coordinates
+         if include_depth:
+             baseurl = f"https://tds.hycom.org/thredds/dodsC/{database}?lat,lon,time,depth"
+         else:
+             baseurl = f"https://tds.hycom.org/thredds/dodsC/{database}?lat,lon,time"
+
+         if self.verbose:
+             print(f"-- Getting coordinates for database: {database} ({'3D' if include_depth else '2D'}) --")
+
+         try:
+             ds = xr.open_dataset(baseurl)
+
+             # Extract coordinates
+             lon = ds["lon"].values
+             lat = ds["lat"].values
+             times = ds["time"].values
+
+             # Find indices within bounds
+             lat_idxs = np.where((lat >= bbox["ymin"]) & (lat <= bbox["ymax"]))[0]
+             lon_idxs = np.where((lon >= bbox["xmin"]) & (lon <= bbox["xmax"]))[0]
+
+             # Create cache entry
+             cache_entry = {
+                 'times': times,
+                 'lon_idxs': lon_idxs,
+                 'lat_idxs': lat_idxs,
+             }
+
+             # Add depth if needed
+             if include_depth:
+                 dep = ds["depth"].values
+                 dep_idxs = np.where((dep >= bbox["zmin"]) & (dep <= bbox["zmax"]))[0]
+                 cache_entry['dep_idxs'] = dep_idxs
+
+             # Store in cache
+             self.db_cache[db_key] = cache_entry
+
+             if self.verbose:
+                 print(f"Coordinates cached for {db_key}")
+                 print(f"  Time shape: {times.shape}")
+                 print(f"  Lon indices: {len(lon_idxs)}")
+                 print(f"  Lat indices: {len(lat_idxs)}")
+                 if include_depth:
+                     print(f"  Depth indices: {len(dep_idxs)}")
+
+             return True
+
+         except Exception as e:
+             print(f"Error retrieving coordinates for database {database}: {e}")
+             # Create an empty entry to prevent repeated attempts
+             self.db_cache[db_key] = {
+                 'times': np.array([]),
+                 'lon_idxs': np.array([]),
+                 'lat_idxs': np.array([]),
+             }
+             if include_depth:
+                 self.db_cache[db_key]['dep_idxs'] = np.array([])
+
+             return False
+
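+     # Illustrative cache entry (made-up database name and sizes): after a
+     # successful fetch, self.db_cache["GLBy0.08/expt_93.0_3d"] looks like
+     #     {'times': <array of datetime64>, 'lon_idxs': array([2000, ..., 2100]),
+     #      'lat_idxs': array([1000, ..., 1100]), 'dep_idxs': array([0, ..., 30])}
+     # while the "_2d" variant omits 'dep_idxs'.
+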
+     def _find_time_indices(self, date: datetime, times: np.ndarray) -> np.ndarray:
+         """Find indices in times array that match the given date at specified time intervals."""
+         # Convert to datetime for consistent handling
+         date_dt = pd.Timestamp(date).floor('us').to_pydatetime()
+         # Create a date-only object for comparison
+         date_only = date_dt.replace(hour=0, minute=0, second=0, microsecond=0)
+         next_day = date_only + timedelta(days=1)
+
+         # Get all timesteps within this day
+         day_indices = []
+         for i, t in enumerate(times):
+             t_dt = pd.Timestamp(t).floor('us').to_pydatetime()
+             # Check if the timestamp is on the same date
+             if date_only <= t_dt < next_day:
+                 day_indices.append((i, t_dt))
+
+         # If no timestamps are found for this day, return an empty array
+         if not day_indices:
+             return np.array([])
+
+         # Handle "best" time interval - return all available timesteps
+         if self.time_interval == "best":
+             return np.array([idx for idx, _ in day_indices])
+
+         # For the daily interval (24 hours), find the timestamp closest to noon
+         if self.time_interval >= 24:
+             # Target time is noon
+             target_time = date_only + timedelta(hours=12)
+
+             # Find timestamp closest to noon
+             closest_idx = min(day_indices, key=lambda x: abs((x[1] - target_time).total_seconds()))[0]
+             return np.array([closest_idx])
+
+         # For sub-daily intervals (e.g. 3, 6, 12 hours)
+         selected_indices = []
+         for hour in range(0, 24, self.time_interval):
+             target_time = date_only + timedelta(hours=hour)  # Target start of interval
+
+             # Find indices within this interval
+             interval_indices = [
+                 (idx, dt) for idx, dt in day_indices
+                 if date_only + timedelta(hours=hour) <= dt < date_only + timedelta(hours=hour + self.time_interval)
+             ]
+
+             if interval_indices:
+                 # Find the timestamp closest to the start of the interval
+                 closest_idx = min(interval_indices, key=lambda x: abs((x[1] - target_time).total_seconds()))[0]
+                 selected_indices.append(closest_idx)
+
+         return np.array(selected_indices)
+
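+     # Illustrative behaviour: with time_interval == 6 and 3-hourly source data
+     # at 00, 03, 06, ..., 21 UTC, the selected indices correspond to 00, 06, 12
+     # and 18 UTC (the timestamp nearest the start of each 6-hour window).
+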
+     def _format_opendap_slice(self, indices: np.ndarray) -> str:
+         """Convert an array of indices into OPeNDAP slice format (start:step:stop)."""
+         if len(indices) == 0:
+             return "0:1:0"  # Empty slice
+
+         if len(indices) == 1:
+             return f"{indices[0]}:1:{indices[0]}"
+
+         # Calculate differences between consecutive indices
+         diff = np.diff(indices)
+
+         # Check if we have a constant step size
+         unique_steps = np.unique(diff)
+
+         if len(unique_steps) == 1:
+             # Continuous sequence with constant step
+             step = unique_steps[0]
+             return f"{indices[0]}:{step}:{indices[-1]}"
+         else:
+             # Irregular spacing: fall back to the most common step size (the
+             # resulting slice may not reproduce the exact index set)
+             step = np.bincount(diff).argmax()
+             return f"{indices[0]}:{step}:{indices[-1]}"
+
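+     # E.g. (illustrative): indices [4, 7, 10, 13] give "4:3:13"; a single index
+     # [9] gives "9:1:9"; irregular spacing such as [0, 1, 5] falls back to the
+     # most common step, here yielding "0:1:5", which over-selects indices.
+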
+     def _get_idxs(self, date: datetime, database: str, is_2d: bool = False) -> Tuple[str, str, str, Optional[str]]:
+         """Get indices for the given database."""
+         # Create key for database lookup
+         db_key = f"{database}_{'2d' if is_2d else '3d'}"
+
+         # Check if database has been initialized
+         if db_key not in self.db_cache:
+             if self.verbose:
+                 print(f"Warning: Database {db_key} not initialized. Unable to get indices.")
+             return "0:1:0", "0:1:0", "0:1:0", None if is_2d else "0:1:0"
+
+         # Get database data
+         db_data = self.db_cache[db_key]
+
+         # Check if database has valid data
+         if len(db_data['times']) == 0:
+             if self.verbose:
+                 print(f"Warning: No time data for {db_key}. Unable to get indices.")
+             return "0:1:0", "0:1:0", "0:1:0", None if is_2d else "0:1:0"
+
+         # Get the database-specific indices and times
+         times = db_data['times']
+         lon_idxs = db_data['lon_idxs']
+         lat_idxs = db_data['lat_idxs']
+
+         # Format basic indices for OPeNDAP
+         lon_idx = self._format_opendap_slice(lon_idxs)
+         lat_idx = self._format_opendap_slice(lat_idxs)
+
+         # For 2D variables (like surf_el), don't include depth
+         if is_2d:
+             dep_idx = None
+         else:
+             dep_idxs = db_data['dep_idxs']
+             dep_idx = self._format_opendap_slice(dep_idxs)
+
+         # Find time indices for this date
+         try:
+             time_idxs = self._find_time_indices(date, times)
+             if len(time_idxs) == 0:
+                 if self.verbose:
+                     print(f"Warning: No time indices found for {date} in {db_key}.")
+                     print(f"Available times range: {pd.Timestamp(times[0])} to {pd.Timestamp(times[-1])}")
+                 return "0:1:0", lon_idx, lat_idx, dep_idx
+
+             time_idx = self._format_opendap_slice(time_idxs)
+             return time_idx, lon_idx, lat_idx, dep_idx
+         except Exception as e:
+             print(f"Error finding time indices for {date} in {db_key}: {e}")
+             return "0:1:0", lon_idx, lat_idx, dep_idx
+
+     def _construct_opendap_url(self, variables: List[str], idx_set: Tuple[str, str, str, Optional[str]],
+                                date: datetime, database: str, is_2d: bool = False) -> str:
+         """Construct an OPeNDAP URL for the given variables and indices."""
+         time_idx, lon_idx, lat_idx, dep_idx = idx_set
+
+         # For future dates (forecasts), use a different URL format
+         if date >= datetime.now():
+             date2 = datetime.now() - timedelta(days=1)
+             base_url = (
+                 f"https://tds.hycom.org/thredds/dodsC/{database}/FMRC/runs/GLBy0.08_930_FMRC_RUN_"
+                 + f'{date2.strftime("%Y-%m-%dT12:00:00Z")}?'
+             )
+         else:
+             base_url = f"https://tds.hycom.org/thredds/dodsC/{database}?"
+
+         # Add coordinate subsetting
+         if is_2d:
+             # For 2D variables (no depth)
+             url = base_url + f"lat[{lat_idx}],lon[{lon_idx}],time[{time_idx}]"
+         else:
+             # For 3D variables (with depth)
+             url = base_url + f"lat[{lat_idx}],lon[{lon_idx}],depth[{dep_idx}],time[{time_idx}]"
+
+         # Add variable subsetting
+         for v in variables:
+             if v == "surf_el" or is_2d:
+                 # 2D variable (no depth dimension)
+                 url += f",{v}[{time_idx}][{lat_idx}][{lon_idx}]"
+             else:
+                 # 3D variable (with depth dimension)
+                 url += f",{v}[{time_idx}][{dep_idx}][{lat_idx}][{lon_idx}]"
+
+         return url
+
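+     # Shape of the result (made-up indices, illustrative database path) for one
+     # 3D variable:
+     #     https://tds.hycom.org/thredds/dodsC/GLBy0.08/expt_93.0?lat[1000:1:1100],lon[2000:1:2100],depth[0:1:30],time[8:1:15],water_temp[8:1:15][0:1:30][1000:1:1100][2000:1:2100]
+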
+     def _download_single_file(self, fname: Path, url: str) -> bool:
+         """Download a single HYCOM subset file."""
+         assert fname.parent.exists(), "Output folder does not exist. Please create this first"
+
+         try:
+             ds = xr.open_dataset(url)
+             ds.to_netcdf(fname, format="NETCDF4")
+             assert fname.exists()
+             return True
+         except Exception as e:
+             if self.verbose:
+                 print(f"File download failed: {e}")
+             sleep(self.timeout)
+
+             # Retry recursively; note the try counter is an instance attribute,
+             # so the remaining tries are shared across all downloads in this run
+             if self.max_tries > 1:
+                 if self.verbose:
+                     print(f"Retrying... {self.max_tries - 1} tries left")
+                 self.max_tries -= 1
+                 return self._download_single_file(fname, url)
+             return False
+
+
+ class MergeHYCOM(BaseMerger):
+     def _init_specific(self) -> None:
+         self.source = "HYCOM"
+         self.mode = "OCEAN"
+         self._load_config()
+
+     def _extract_target_coordinates(self, datasets: List[xr.Dataset]) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]]:
+         """Extract coordinates from the most complete dataset (prefer later models)."""
+         # Use the last dataset (reverse sorted) as it has the extended coordinates
+         for ds in reversed(datasets):
+             if ds is not None:
+                 lon = ds['lon'].values
+                 lat = ds['lat'].values
+                 depth = ds['depth'].values if 'depth' in ds else None
+                 return lon, lat, depth
+
+         raise ValueError("No valid datasets found for coordinate extraction")
+
+     def _process_file(self, file_path: Path) -> Optional[xr.Dataset]:
+         """Load and apply basic HYCOM processing."""
+         ds = self._open_subset_netcdf(file_path, chunks=dict(time=24))
+         if ds is not None:
+             # Wrap longitudes to [-180, 180) - crucial for HYCOM because the
+             # archive mixes 0-360 and -180-180 grids across model generations
+             ds = ds.assign_coords(lon=((ds.lon + 180) % 360) - 180)
+         return ds
+
+     def _check_sub_daily_data(self, ds: xr.Dataset) -> bool:
+         """Check if dataset contains sub-daily (< 24 hour) temporal resolution."""
+         if ds is not None and len(ds['time']) > 1:
+             time_diffs = np.diff(ds['time'].values).astype('timedelta64[h]').astype(int)
+             return np.any(time_diffs < 24)
+         return False
+
+     def _apply_tidal_filtering(self, ds: xr.Dataset) -> xr.Dataset:
+         """Apply a 25-hour centered rolling mean to remove the tidal signal from post-2024-08-09 data."""
+         cutoff_time = pd.Timestamp('2024-08-09 23:59:00')
+
+         # Mask time indices after the cutoff
+         post_cutoff_mask = ds['time'] > cutoff_time
+
+         if not post_cutoff_mask.any():
+             return ds  # No data after cutoff, return unchanged
+
+         # Apply the rolling mean only to post-cutoff data
+         ds_filtered = ds.copy()
+
+         # Process all time-dependent variables at once to avoid repeated rolling operations
+         time_vars = [var for var in ds.data_vars if 'time' in ds[var].dims]
+
+         if time_vars:
+             # Create a single rolling object and apply it to all variables
+             rolling_ds = ds[time_vars].rolling(time=25, center=True).reduce(np.nanmean)
+
+             for var_name in time_vars:
+                 # Only replace post-cutoff values
+                 ds_filtered[var_name] = xr.where(
+                     post_cutoff_mask,
+                     rolling_ds[var_name],
+                     ds[var_name]
+                 )
+
+         return ds_filtered
+
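+     # Rationale: on hourly ESPC-D-V02 output, a 25-hour centered window spans
+     # slightly more than the diurnal (~24 h) and semi-diurnal (~12.4 h) tidal
+     # periods, so the rolling mean largely averages the tidal constituents out
+     # while retaining the sub-tidal circulation.
+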
+     def merge_files(self, file_list: List[Path]) -> Tuple[xr.Dataset, List[Path]]:
+         """Merge HYCOM files: group by start date, merge variables, then concatenate over time."""
+         if not file_list:
+             raise ValueError("No files provided for merging")
+
+         # Sort in reverse to get extended coordinates from later models
+         file_list.sort(reverse=True)
+
+         # Group by start date (the third underscore-separated token of the filename)
+         startdates = [f.stem.split('_')[2] for f in file_list]
+         grouped_files = {date: [] for date in np.unique(startdates)}
+
+         # Load files and group them
+         all_datasets = []
+         skipped_files = []
+
+         for i, file_path in enumerate(tqdm(file_list, disable=not self.verbose)):
+             ds = self._process_file(file_path)
+             if ds is not None:
+                 grouped_files[startdates[i]].append(ds)
+                 all_datasets.append(ds)
+             else:
+                 skipped_files.append(file_path)
+
+         if not all_datasets:
+             raise ValueError("No valid datasets could be loaded")
+
+         # Extract target coordinates
+         target_lon, target_lat, target_depth = self._extract_target_coordinates(all_datasets)
+
+         # Check if we have post-2024-08-10 data and whether it is sub-daily
+         cutoff_date = pd.Timestamp('2024-08-10')
+         has_post_cutoff_data = False
+         apply_tidal_filtering = False
+
+         # Check the latest start date (files are reverse sorted)
+         if startdates:
+             latest_startdate = pd.Timestamp(startdates[0])
+             if latest_startdate >= cutoff_date:
+                 has_post_cutoff_data = True
+                 # Check the most recent dataset for sub-daily data
+                 if all_datasets and self._check_sub_daily_data(all_datasets[0]):
+                     apply_tidal_filtering = True
+
+         if self.verbose:
+             print("Concatenating and interpolating xarray dataset")
+             if has_post_cutoff_data and apply_tidal_filtering:
+                 print('... Dataset contains sub-daily data post-2024-08-10 (HYCOM ESPC-D-V02), applying tidal filtering.')
+
+         # Merge variables for each start date group
+         merged_by_date = []
+         for date_group in grouped_files.values():
+             if date_group:
+                 interpolated = []
+                 for ds in date_group:
+                     # Interpolate to the common grid (2D or 3D as appropriate)
+                     if target_depth is not None and 'depth' in ds.dims:
+                         ds_interp = ds.interp(lon=target_lon, lat=target_lat, depth=target_depth,
+                                               method='nearest', kwargs=dict(fill_value='extrapolate'))
+                     else:
+                         ds_interp = ds.interp(lon=target_lon, lat=target_lat,
+                                               method='nearest', kwargs=dict(fill_value='extrapolate'))
+                     interpolated.append(ds_interp)
+
+                 merged_by_date.append(xr.merge(interpolated, compat='override'))
+
+         # Concatenate along the time dimension
+         merged = xr.concat(merged_by_date, dim="time", combine_attrs="override",
+                            data_vars="minimal", coords="minimal", compat="override")
+
+         # Apply tidal filtering to the merged dataset if needed
+         if apply_tidal_filtering:
+             merged = self._apply_tidal_filtering(merged)
+
+         # Final cleanup: standardize coordinate names, sort, and drop duplicate timestamps
+         merged = merged.rename({'lon': 'longitude', 'lat': 'latitude'})
+         merged = merged.sortby('time')
+         _, unique_idx = np.unique(merged['time'], return_index=True)
+         merged = merged.isel(time=unique_idx)
+
+         return merged, skipped_files
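+
+ # Minimal usage sketch (illustrative; the constructor arguments live on the
+ # BaseMerger side and are assumed here, not shown in this file):
+ #     merger = MergeHYCOM(...)
+ #     merged, skipped = merger.merge_files(sorted(Path("raw").glob("*.nc")))
+ #     merged.to_netcdf("hycom_ocean_merged.nc")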