tfv_get_tools-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. tfv_get_tools/__init__.py +4 -0
  2. tfv_get_tools/_standard_attrs.py +107 -0
  3. tfv_get_tools/atmos.py +167 -0
  4. tfv_get_tools/cli/_cli_base.py +173 -0
  5. tfv_get_tools/cli/atmos_cli.py +192 -0
  6. tfv_get_tools/cli/ocean_cli.py +204 -0
  7. tfv_get_tools/cli/tide_cli.py +118 -0
  8. tfv_get_tools/cli/wave_cli.py +183 -0
  9. tfv_get_tools/fvc/__init__.py +3 -0
  10. tfv_get_tools/fvc/_atmos.py +230 -0
  11. tfv_get_tools/fvc/_fvc.py +218 -0
  12. tfv_get_tools/fvc/_ocean.py +171 -0
  13. tfv_get_tools/fvc/_tide.py +195 -0
  14. tfv_get_tools/ocean.py +170 -0
  15. tfv_get_tools/providers/__init__.py +0 -0
  16. tfv_get_tools/providers/_custom_conversions.py +34 -0
  17. tfv_get_tools/providers/_downloader.py +566 -0
  18. tfv_get_tools/providers/_merger.py +520 -0
  19. tfv_get_tools/providers/_utilities.py +255 -0
  20. tfv_get_tools/providers/atmos/barra2.py +209 -0
  21. tfv_get_tools/providers/atmos/cfgs/barra2_c2.yaml +52 -0
  22. tfv_get_tools/providers/atmos/cfgs/barra2_r2.yaml +85 -0
  23. tfv_get_tools/providers/atmos/cfgs/barra2_re2.yaml +70 -0
  24. tfv_get_tools/providers/atmos/cfgs/cfsr.yaml +68 -0
  25. tfv_get_tools/providers/atmos/cfgs/era5.yaml +77 -0
  26. tfv_get_tools/providers/atmos/cfgs/era5_gcp.yaml +77 -0
  27. tfv_get_tools/providers/atmos/cfsr.py +207 -0
  28. tfv_get_tools/providers/atmos/era5.py +20 -0
  29. tfv_get_tools/providers/atmos/era5_gcp.py +20 -0
  30. tfv_get_tools/providers/ocean/cfgs/copernicus_blk.yaml +64 -0
  31. tfv_get_tools/providers/ocean/cfgs/copernicus_glo.yaml +67 -0
  32. tfv_get_tools/providers/ocean/cfgs/copernicus_nws.yaml +62 -0
  33. tfv_get_tools/providers/ocean/cfgs/hycom.yaml +73 -0
  34. tfv_get_tools/providers/ocean/copernicus_ocean.py +457 -0
  35. tfv_get_tools/providers/ocean/hycom.py +611 -0
  36. tfv_get_tools/providers/wave/cawcr.py +166 -0
  37. tfv_get_tools/providers/wave/cfgs/cawcr_aus_10m.yaml +39 -0
  38. tfv_get_tools/providers/wave/cfgs/cawcr_aus_4m.yaml +39 -0
  39. tfv_get_tools/providers/wave/cfgs/cawcr_glob_24m.yaml +39 -0
  40. tfv_get_tools/providers/wave/cfgs/cawcr_pac_10m.yaml +39 -0
  41. tfv_get_tools/providers/wave/cfgs/cawcr_pac_4m.yaml +39 -0
  42. tfv_get_tools/providers/wave/cfgs/copernicus_glo.yaml +56 -0
  43. tfv_get_tools/providers/wave/cfgs/copernicus_nws.yaml +51 -0
  44. tfv_get_tools/providers/wave/cfgs/era5.yaml +48 -0
  45. tfv_get_tools/providers/wave/cfgs/era5_gcp.yaml +48 -0
  46. tfv_get_tools/providers/wave/copernicus_wave.py +38 -0
  47. tfv_get_tools/providers/wave/era5.py +232 -0
  48. tfv_get_tools/providers/wave/era5_gcp.py +169 -0
  49. tfv_get_tools/tide/__init__.py +2 -0
  50. tfv_get_tools/tide/_nodestring.py +214 -0
  51. tfv_get_tools/tide/_tidal_base.py +568 -0
  52. tfv_get_tools/utilities/_tfv_bc.py +78 -0
  53. tfv_get_tools/utilities/horizontal_padding.py +89 -0
  54. tfv_get_tools/utilities/land_masking.py +93 -0
  55. tfv_get_tools/utilities/parsers.py +44 -0
  56. tfv_get_tools/utilities/warnings.py +38 -0
  57. tfv_get_tools/wave.py +179 -0
  58. tfv_get_tools-0.2.0.dist-info/METADATA +286 -0
  59. tfv_get_tools-0.2.0.dist-info/RECORD +62 -0
  60. tfv_get_tools-0.2.0.dist-info/WHEEL +5 -0
  61. tfv_get_tools-0.2.0.dist-info/entry_points.txt +5 -0
  62. tfv_get_tools-0.2.0.dist-info/top_level.txt +1 -0
tfv_get_tools/providers/wave/era5.py
@@ -0,0 +1,232 @@
+ """
+ ERA5 Wave downloader
+ """
+
+ import shutil
+ import zipfile
+ from pathlib import Path
+ from typing import List, Tuple
+
+ import cdsapi
+ import numpy as np
+ import pandas as pd
+ import xarray as xr
+ from pandas.tseries.offsets import MonthEnd
+ from tqdm import tqdm
+
+ from tfv_get_tools.providers._downloader import BaseDownloader
+ from tfv_get_tools.providers._merger import BaseMerger
+ from tfv_get_tools.providers._utilities import todstr
+
+ class DownloadERA5Wave(BaseDownloader):
+     """ERA5 Wave downloader via CDS API"""
+
+     def _init_specific(self, **kwargs):
+         """Set source and mode - matches original interface"""
+         self.source = "ERA5"
+         self.mode = "WAVE"
+         self._load_config()
+
+     def _get_output_filename(self, ts: pd.Timestamp, te: pd.Timestamp) -> Path:
+         """ERA5 Wave filename pattern (no variable in filename)"""
+         fname = f"{self.prefix}_{todstr(ts)}_{todstr(te)}.nc"
+         return self.outdir / fname
+
+     def _construct_cds_request(self, date: pd.Timestamp) -> dict:
+         """Construct CDS API request parameters"""
+         limstr = self._to_limstr(self.xlims, self.ylims)
+
+         year = date.year
+         month = date.month
+         times = [f"{x:02}:00" for x in range(0, 24, 1)]  # Hourly
+         days = [str(x) for x in range(1, 32)]
+
+         return {
+             "product_type": "reanalysis",
+             "variable": self.variables,
+             "year": [year],
+             "month": [month],
+             "day": days,
+             "time": times,
+             "area": limstr,
+             "data_format": "netcdf",
+             "download_format": "unarchived",
+         }
+
+     @staticmethod
+     def _to_limstr(x, y):
+         """Convert coordinate bounds to ERA5 area string format"""
+         return f"{y[1]}/{x[0]}/{y[0]}/{x[1]}"
+
+     def _download_single_file(self, temp_file: Path, final_file: Path, cds_request: dict) -> bool:
+         """Download single file via CDS API"""
+         try:
+             c = cdsapi.Client()
+
+             # Download to temporary file first
+             c.retrieve("reanalysis-era5-single-levels", cds_request, temp_file)
+
+             if temp_file.exists():
+                 # ERA5 sometimes returns zip files, handle both cases
+                 self._convert_split_netcdf_data(temp_file, final_file)
+                 return True
+             else:
+                 return False
+
+         except Exception as e:
+             error_msg = str(e).lower()
+
+             # Check for specific error types and provide helpful messages
+             if "missing/incomplete configuration file" in error_msg or ".cdsapirc" in error_msg:
+                 print("\n" + "="*60)
+                 print("CDS API CONFIGURATION ERROR")
+                 print("="*60)
+                 print("The CDS API configuration file is missing or incomplete.")
+                 print("To fix this issue:")
+                 print("1. Register at https://cds.climate.copernicus.eu/")
+                 print("2. Go to your profile page and copy your API key")
+                 print("3. Create a file called '.cdsapirc' in your home directory with:")
+                 print("   url: https://cds.climate.copernicus.eu/api")
+                 print("   key: YOUR_API_KEY_HERE")
+                 print("="*60)
+
+             elif "authentication" in error_msg or "invalid key" in error_msg:
+                 print("\n" + "="*60)
+                 print("CDS API AUTHENTICATION ERROR")
+                 print("="*60)
+                 print("Your CDS API key appears to be invalid.")
+                 print("Please check your .cdsapirc file and ensure your API key is correct.")
+                 print("You can find your key at: https://cds.climate.copernicus.eu/user/")
+                 print("="*60)
+
+             else:
+                 # For any other errors, show the original message if verbose
+                 if self.verbose:
+                     print(f"Failed to download via CDS API: {e}")
+                 else:
+                     # Always show some info for unhandled errors
+                     print(f"\nCDS API Error: {e}")
+
+             return False
+
+     @staticmethod
+     def _convert_split_netcdf_data(file_handle_temp: Path, file_handle: Path) -> bool:
+         """
+         Handle ERA5 zip files or direct NetCDF files
+         """
+         file_path = Path(file_handle_temp)
+         file_path_out = Path(file_handle)
+
+         # Check if file is a zip file
+         if zipfile.is_zipfile(file_path):
+             datasets = []
+             with zipfile.ZipFile(file_path, 'r') as zip_ref:
+                 # Extract all files to a temporary directory
+                 temp_dir = file_path.parent / 'temp_netcdf'
+                 temp_dir.mkdir(exist_ok=True)
+                 zip_ref.extractall(temp_dir)
+
+                 # Load all NetCDF files
+                 for extracted_file in temp_dir.glob('*.nc'):
+                     with xr.open_dataset(extracted_file) as dsx:
+                         dsx.load()
+                         datasets.append(dsx)
+
+                 # Clean up extracted files
+                 for file in temp_dir.iterdir():
+                     file.unlink()
+                 temp_dir.rmdir()
+
+             if not datasets:
+                 raise ValueError("No NetCDF files found in zip archive")
+
+             # Combine all datasets
+             ds = xr.merge(datasets)
+             ds.to_netcdf(file_path_out)
+
+             # Delete the zip file
+             file_path.unlink()
+
+         else:
+             # Move the NetCDF file to the output path
+             shutil.move(file_path, file_path_out)
+
+         return True
+
+     def download(self):
+         """ERA5 Wave-specific download loop - yields tasks for new base class"""
+         for ts in self.times:
+             te = ts + MonthEnd() + pd.Timedelta("23.9h")
+
+             final_file = self._get_output_filename(ts, te)
+             temp_file = self.outdir / '_temp_era5_file'
+             cds_request = self._construct_cds_request(ts)
+
+             yield {
+                 'file_path': final_file,
+                 'url': 'CDS_API',  # Not a URL but API call
+                 'timestamp': ts,
+                 'variable': 'all_variables',  # ERA5 downloads all vars together
+                 'download_func': lambda tf=temp_file, ff=final_file, req=cds_request:
+                     self._download_single_file(tf, ff, req)
+             }
+
+
+ class MergeERA5Wave(BaseMerger):
+     def _init_specific(self) -> None:
+         self.source = "ERA5"
+         self.mode = "WAVE"
+         self._load_config()
+
+     def _process_era5_dataset(self, ds: xr.Dataset) -> xr.Dataset:
+         """Apply ERA5-specific coordinate and dimension processing."""
+         # Rename time coordinate
+         if "valid_time" in ds.coords:
+             ds = ds.rename({"valid_time": "time"})
+
+         # Handle experiment version dimension
+         if "expver" in ds.dims:
+             ds = ds.mean(dim="expver", keep_attrs=True)
+             ds = ds.drop_vars("expver", errors='ignore')
+
+         # Standardise coordinate names
+         coord_mappings = {'lon': 'longitude', 'lng': 'longitude', 'lat': 'latitude'}
+         rename_dict = {old: new for old, new in coord_mappings.items() if old in ds.coords}
+         if rename_dict:
+             ds = ds.rename(rename_dict)
+
+         return ds
+
+     def merge_files(self, file_list: List[Path]) -> Tuple[xr.Dataset, List[Path]]:
+         """Merge ERA5 wave files with time concatenation."""
+         if not file_list:
+             raise ValueError("No files provided for merging")
+
+         datasets = []
+         skipped_files = []
+
+         for file_path in tqdm(file_list, disable=not self.verbose):
+             ds = self._open_subset_netcdf(file_path, time=("time", "valid_time"))
+             if ds is not None:
+                 ds = self._process_era5_dataset(ds)
+                 datasets.append(ds)
+             else:
+                 skipped_files.append(file_path)
+
+         if not datasets:
+             raise ValueError("No valid datasets could be loaded")
+
+         # Concatenate and clean up
+         merged = xr.concat(datasets, dim="time", combine_attrs="override",
+                            data_vars="minimal", coords="minimal", compat="override")
+
+         # Remove duplicates and sort
+         merged = merged.sortby("time")
+         _, unique_idx = np.unique(merged["time"], return_index=True)
+         merged = merged.isel(time=np.sort(unique_idx))
+
+         # Sort latitudes south to north
+         if 'latitude' in merged.coords:
+             merged = merged.sortby("latitude")
+
+         return merged, skipped_files
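Two conventions in this file are easy to trip over: `_to_limstr` emits the CDS "area" bound in North/West/South/East order, and `download()` builds each monthly window as the month start plus `MonthEnd()` plus 23.9 hours. A minimal sketch replaying both, with illustrative bounds that are not taken from the package's configs:

    import pandas as pd
    from pandas.tseries.offsets import MonthEnd

    xlims = (110.0, 160.0)  # west, east (illustrative)
    ylims = (-45.0, -5.0)   # south, north (illustrative)

    # Mirrors DownloadERA5Wave._to_limstr: CDS expects "area" as N/W/S/E
    limstr = f"{ylims[1]}/{xlims[0]}/{ylims[0]}/{xlims[1]}"
    print(limstr)  # -5.0/110.0/-45.0/160.0

    # Mirrors the monthly window in download(): month start -> last hour of the month
    ts = pd.Timestamp("2020-01-01")
    te = ts + MonthEnd() + pd.Timedelta("23.9h")
    print(ts, te)  # 2020-01-01 00:00:00  2020-01-31 23:54:00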
tfv_get_tools/providers/wave/era5_gcp.py
@@ -0,0 +1,169 @@
+ """
+ AEW - This is an alternative ERA5 source that we used for a while while CDSAPI wasn't playing ball.
+ It's marginally faster and requires no registration, but runs several months behind CDSAPI, which
+ got called out a few times, so we should stick with CDSAPI. We also generally believe it's good form
+ to have users go through CDSAPI, as it is their data.
+ """
+
+ from pathlib import Path
+
+ import numpy as np
+ import xarray as xr
+ import pandas as pd
+ from tqdm import tqdm
+
+ from pandas.tseries.offsets import MonthEnd
+
+ from tfv_get_tools.providers._utilities import todstr
+ from tfv_get_tools.providers._downloader import BaseDownloader
+ from tfv_get_tools.providers._merger import BaseMerger
+
+
+ class DownloadERA5WaveGCP(BaseDownloader):
+     """Updated version that makes use of the public GCP Zarr ERA5 release"""
+
+     def _init_specific(self, **kwargs):
+         """Set source and mode - matches original interface"""
+         self.source = "ERA5_GCP"
+         self.mode = "WAVE"
+         self._load_config()
+
+         # Initialize the Zarr dataset connection
+         self.era5_ds = None
+         self.last_valid = None
+
+     def _initialize_zarr_connection(self):
+         """Initialize connection to ERA5 Zarr dataset"""
+         if self.era5_ds is None:
+             if self.verbose:
+                 print('Opening connection to ERA5 data - please wait')
+
+             self.era5_ds = xr.open_zarr(
+                 self.base_url,
+                 chunks=dict(time=744),  # 31 day chunk
+                 storage_options=dict(token='anon'),
+             )
+
+             self.last_valid = pd.Timestamp(self.era5_ds.attrs['valid_time_stop']).floor('1d')
+
+             self.era5_ds = self.era5_ds.sel(time=slice(
+                 pd.Timestamp(1940, 1, 1), self.last_valid
+             ))
+
+     def _get_output_filename(self, ts: pd.Timestamp, te: pd.Timestamp) -> Path:
+         """ERA5 GCP filename pattern (no variable in filename - downloads all together)"""
+         fname = f"{self.prefix}_{todstr(ts)}_{todstr(te)}.nc"
+         return self.outdir / fname
+
+     def _download_single_time_period(self, ts: pd.Timestamp, te: pd.Timestamp, output_file: Path) -> bool:
+         """Download single time period from Zarr dataset"""
+         try:
+             # Check if we're past the valid time range
+             if ts > self.last_valid:
+                 if self.verbose:
+                     print(f'The final valid ERA5 time on this database is {self.last_valid.strftime("%Y-%m-%d")}.')
+                     print('Skipping this time period')
+                 return False
+
+             # Apply slices and variable filters
+             dsx = self.era5_ds.sel(
+                 time=slice(ts, te),
+                 latitude=slice(*self.ylims[::-1]),
+                 longitude=slice(*self.xlims),
+             )[self.variables]
+
+             if self.verbose:
+                 print(f"... Downloading {output_file.name}")
+
+             # Wrap the cut-down piece back to -180 to 180
+             dsx = dsx.assign_coords({'longitude': (dsx['longitude'] + 180) % 360 - 180})
+             dsx = dsx.sortby('longitude')
+
+             dsx.to_netcdf(output_file)
+             return True
+
+         except Exception as e:
+             if self.verbose:
+                 print(f"Failed to download {output_file.name}: {e}")
+             return False
+
+     def download(self):
+         """ERA5 GCP-specific download loop - yields tasks for new base class"""
+         # Initialize the Zarr connection once
+         self._initialize_zarr_connection()
+
+         if self.verbose:
+             print('Starting downloading loop')
+
+         for ts in self.times:
+             te = ts + MonthEnd() + pd.Timedelta("23.9h")
+
+             # Check if we're past the valid time range
+             if ts > self.last_valid:
+                 if self.verbose:
+                     print(f'The final valid ERA5 time on this database is {self.last_valid.strftime("%Y-%m-%d")}.')
+                     print('Exiting download loop early')
+                 break
+
+             output_file = self._get_output_filename(ts, te)
+
+             yield {
+                 'file_path': output_file,
+                 'url': f'zarr://{self.base_url}',  # Pseudo-URL for logging
+                 'timestamp': ts,
+                 'variable': 'all_variables',  # ERA5 downloads all vars together
+                 'download_func': lambda start=ts, end=te, out=output_file:
+                     self._download_single_time_period(start, end, out)
+             }
+
+ class MergeERA5WaveGCP(BaseMerger):
+     def _init_specific(self):
+         self.source = "ERA5"
+         self.mode = "WAVE"
+         self._load_config()
+
+     def merge_files(self, file_list):
+         """
+         ERA5 merging logic.
+
+         ERA5 names the time variable "valid_time", which we rename after opening.
+
+         Args:
+             file_list (list): list of path objects to open and concat.
+
+         Returns:
+             xr.Dataset: merged xarray dataset
+             list: files unable to be merged
+         """
+         dsset = []
+         skipped_list = []
+         for f in tqdm(file_list):
+             dsx = self._open_subset_netcdf(f, time=("time", "valid_time"))
+             if dsx is not None:
+                 if "valid_time" in dsx:
+                     dsx = dsx.rename(valid_time="time")
+                 if "expver" in dsx.dims:
+                     dsx = dsx.mean(dim="expver", keep_attrs=True)
+                     dsx = dsx.drop_vars("expver", errors='ignore')
+                 dsset.append(dsx)
+             else:
+                 skipped_list.append(f)
+
+         print("Concatenating xarray dataset")
+         ds = xr.concat(
+             dsset,
+             dim="time",
+             combine_attrs="override",
+             data_vars="minimal",
+             coords="minimal",
+             compat="override",
+         )
+
+         # Sort by time and drop duplicates (from overlaps)
+         ds = ds.sortby("time")
+         _, idx = np.unique(ds["time"], return_index=True)
+         ds = ds.isel(time=idx)
+
+         ds = ds.sortby("latitude")  # Latitudes should go south to north
+
+         return ds, skipped_list
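One wrinkle in the GCP path worth calling out: the longitude wrap in `_download_single_time_period` implies the Zarr store publishes longitudes on a 0-360 grid, which the downloader shifts back to the -180-180 convention before writing NetCDF. A self-contained sketch of that shift, using toy coordinates rather than real ERA5 data:

    import numpy as np
    import xarray as xr

    # Toy dataset on a 0..360 longitude grid, standing in for the ERA5 Zarr store
    ds = xr.Dataset(coords={"longitude": np.array([0.0, 90.0, 180.0, 270.0])})

    # Same shift as the downloader: wrap 0..360 back to -180..180, then re-sort
    ds = ds.assign_coords(longitude=(ds["longitude"] + 180) % 360 - 180)
    ds = ds.sortby("longitude")
    print(ds["longitude"].values)  # [-180. -90. 0. 90.]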
tfv_get_tools/tide/__init__.py
@@ -0,0 +1,2 @@
+ from ._tidal_base import get_constituents, predict_waterlevel_timeseries, ExtractTide
+ from ._nodestring import load_nodestring_shapefile, process_nodestring_gdf
tfv_get_tools/tide/_nodestring.py
@@ -0,0 +1,214 @@
+ from pathlib import Path
+ from typing import Union, List, Any
+
+ import numpy as np
+ import pandas as pd
+ import geopandas as gpd
+ from pyproj import CRS
+ from geopandas import GeoDataFrame
+ from shapely.geometry import LineString, Point
+
+
+ def _convert_id(x: Any):
+     """Simple ID entry conversion
+
+     Try to make each ID an integer, but fall back on a string.
+
+     Args:
+         x (Any): ID Field (e.g., 2, 'NS1')
+
+     Returns:
+         Union[int, str]: ID as either an int or str, with preference for an integer.
+     """
+     try:
+         x = int(x)
+     except (TypeError, ValueError):
+         x = str(x)
+     return x
+
+
+ def load_nodestring_shapefile(
+     filename: Union[Path, str], crs: int = None, process_ids: Union[tuple, list] = None
+ ) -> GeoDataFrame:
+     """Load a TUFLOW FV nodestring shapefile as a GeoDataFrame.
+
+     The CRS will be read from the .prj file if present. Otherwise, `crs=X` can be passed as an EPSG integer code.
+
+     By default, all the features in the nodestring will be loaded. Use the `process_ids` arg to filter to only certain features.
+
+     Args:
+         filename (Union[Path, str]): Path to the nodestring .shp file
+         crs (int, optional): Coordinate reference system EPSG code. Defaults to None.
+         process_ids (Union[tuple, list], optional): List of IDs to process. Defaults to None.
+
+     Returns:
+         GeoDataFrame: Frame containing the geometry and ID features ready for processing.
+     """
+
+     gdf = gpd.read_file(filename, columns=["ID"])
+
+     # Set the CRS of the geodataframe. Assume 4326 as backup.
+     if crs is None:
+         if not gdf.crs:
+             print(
+                 "No CRS could be read from the shapefile. Assuming nodestring is in latitude/longitude (EPSG 4326)"
+             )
+             gdf = gdf.set_crs(4326)
+     else:
+         try:
+             crs = CRS.from_epsg(crs)
+             gdf = gdf.set_crs(crs)
+         except Exception:
+             raise ValueError(
+                 f"Supplied CRS `{crs}` is not valid. Please provide an EPSG code as an integer, e.g., 7856"
+             )
+
+     # Parse the process IDs
+     assert "ID" in gdf.columns, "There must be an `ID` column present in the shapefile"
+     shp_ids = gdf["ID"].apply(_convert_id).tolist()
+
+     if process_ids is None:
+         # If no IDs are supplied, then we take everything except obvious NaNs.
+         process_ids = [x for x in shp_ids if x != "nan"]
+     else:
+         # If they are supplied, then we first convert each to int or str
+         process_ids = [_convert_id(x) for x in process_ids]
+         for x in process_ids:
+             assert (
+                 x in shp_ids
+             ), f"Nodestring feature ID `{x}` was not found in the shapefile"
+
+     # Do a check on geometry before moving on
+     checked_ids = []
+     for x in process_ids:
+         idx = shp_ids.index(x)
+         geo = gdf.loc[idx, "geometry"]
+
+         if geo is None:
+             print(
+                 f"Warning - No geometry detected for Nodestring ID {x}. Skipping this feature..."
+             )
+         elif not isinstance(geo, LineString):
+             print(
+                 f"Warning - Invalid geometry detected for Nodestring ID {x}. Must be a LineString type. Skipping this feature..."
+             )
+         else:
+             checked_ids.append(x)
+
+     msk = [shp_ids.index(x) for x in checked_ids]
+     gdf = gdf.loc[msk]
+
+     return gdf
+
+
+ def process_nodestring_gdf(gdf: GeoDataFrame, spacing=2500.0) -> dict:
+     """Generates a dictionary containing an Nx2 array of lat/lon coordinates from a nodestring geodataframe.
+
+     The geodataframe must have a `crs` set. It does not need to be lat/lon.
+
+     All features in the geodataframe will be processed - it is assumed that filtering has already happened.
+
+     Args:
+         gdf (GeoDataFrame): nodestring geodataframe.
+         spacing (float, optional): Spacing in meters. Defaults to 2500.0.
+
+     Returns:
+         dict: nodestring dictionary where the key is the `ID` and the values are an Nx2 np.ndarray.
+     """
+
+     if gdf.crs is None:
+         raise ValueError("The nodestring geodataframe must have a CRS defined.")
+
+     coords = sample_coordinates_along_linestring(gdf, spacing)
+
+     process_ids = [_convert_id(x) for x in gdf["ID"]]
+
+     ns_dat = {}
+     for i, k in enumerate(process_ids):
+         coord_array = np.squeeze(np.asarray([x.xy for x in coords.iloc[i]]))
+         ns_dat[k] = coord_array
+
+     return ns_dat
+
+
+ def sample_coordinates_along_linestring(
+     gdf: gpd.GeoDataFrame, spacing: float
+ ) -> gpd.GeoSeries:
+     """
+     Sample coordinates along LineString geometries in a GeoDataFrame.
+
+     Args:
+         gdf (gpd.GeoDataFrame): The input GeoDataFrame containing LineString geometries.
+         spacing (float): The desired spacing between sampled points in meters.
+
+     Returns:
+         gpd.GeoSeries: A GeoSeries where each element is a list of sampled Points in WGS84 coordinates.
+     """
+     return gdf.apply(lambda row: process_row(row, spacing, gdf.crs), axis=1)
+
+
+ def sample_linestring(line: LineString, spacing_meters: float) -> List[Point]:
+     """
+     Sample points along a LineString at a specified spacing.
+
+     Args:
+         line (LineString): The LineString to sample points from.
+         spacing_meters (float): The spacing between points in meters.
+
+     Returns:
+         List[Point]: A list of sampled points along the LineString.
+
+     Raises:
+         ValueError: If the line is empty or the spacing is not positive.
+     """
+     if line.is_empty:
+         raise ValueError("The LineString is empty.")
+     if spacing_meters <= 0:
+         raise ValueError("Spacing must be a positive number.")
+
+     line_length = line.length
+     num_points = int(line_length / spacing_meters) + 1
+     distances = np.linspace(0, line_length, num_points)
+     return [line.interpolate(distance) for distance in distances]
+
+
+ def process_row(row: pd.Series, spacing: float, crs: CRS) -> List[Point]:
+     """
+     Process a single row of a GeoDataFrame, sampling points along its LineString geometry.
+
+     This function handles both geographic (lat/lon) and projected coordinate systems.
+     It always returns the sampled points in WGS84 (EPSG:4326) lat/lon coordinates.
+
+     Args:
+         row (pd.Series): A row from a GeoDataFrame containing a 'geometry' column
+             with a LineString object.
+         spacing (float): The desired spacing between sampled points in meters.
+         crs (CRS): The coordinate reference system of the input geometry.
+
+     Returns:
+         List[Point]: A list of sampled points along the LineString, in lat/lon coordinates.
+
+     Raises:
+         ValueError: If the input geometry is not a LineString.
+     """
+     if not isinstance(row.geometry, LineString):
+         raise ValueError("The geometry must be a LineString.")
+
+     if crs.is_geographic:
+         # Convert to UTM for accurate distance calculations
+         centroid = row.geometry.centroid
+         utm_zone = int((centroid.x + 180) // 6 + 1)
+         utm_crs = CRS.from_proj4(
+             f"+proj=utm +zone={utm_zone} +datum=WGS84 +units=m +no_defs"
+         )
+         utm_geometry = gpd.GeoSeries([row.geometry], crs=crs).to_crs(utm_crs)[0]
+         sampled_points = sample_linestring(utm_geometry, spacing)
+         points_gdf = gpd.GeoDataFrame(geometry=sampled_points, crs=utm_crs)
+     else:
+         # If already projected, sample directly
+         sampled_points = sample_linestring(row.geometry, spacing)
+         points_gdf = gpd.GeoDataFrame(geometry=sampled_points, crs=crs)
+
+     # Convert sampled points to lat/lon (WGS84)
+     points_gdf = points_gdf.to_crs(epsg=4326)
+     return points_gdf.geometry.tolist()
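Taken together with the `tide/__init__.py` re-exports above, a typical nodestring workflow would look like the following sketch. The shapefile path, EPSG code, and feature IDs are hypothetical placeholders; `spacing` is in metres, as per `process_nodestring_gdf`:

    # Hypothetical usage sketch - file path, EPSG code and IDs are placeholders
    from tfv_get_tools.tide import load_nodestring_shapefile, process_nodestring_gdf

    gdf = load_nodestring_shapefile("gis/nodestrings.shp", crs=7856, process_ids=[1, 2])
    ns_dat = process_nodestring_gdf(gdf, spacing=2500.0)

    for ns_id, coords in ns_dat.items():
        # Each value is an Nx2 array of WGS84 points sampled ~2.5 km apart
        print(ns_id, coords.shape)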