water-column-sonar-processing 25.3.1__py3-none-any.whl → 25.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (32)
  1. water_column_sonar_processing/aws/dynamodb_manager.py +6 -6
  2. water_column_sonar_processing/aws/s3_manager.py +95 -90
  3. water_column_sonar_processing/aws/s3fs_manager.py +5 -3
  4. water_column_sonar_processing/aws/sqs_manager.py +1 -1
  5. water_column_sonar_processing/cruise/__init__.py +2 -1
  6. water_column_sonar_processing/cruise/create_empty_zarr_store.py +49 -43
  7. water_column_sonar_processing/cruise/create_empty_zarr_store_level_3.py +161 -0
  8. water_column_sonar_processing/cruise/datatree_manager.py +21 -21
  9. water_column_sonar_processing/cruise/resample_regrid.py +57 -47
  10. water_column_sonar_processing/dataset/__init__.py +3 -0
  11. water_column_sonar_processing/dataset/dataset_manager.py +205 -0
  12. water_column_sonar_processing/dataset/feature_manager.py +32 -0
  13. water_column_sonar_processing/geometry/geometry_manager.py +11 -12
  14. water_column_sonar_processing/geometry/line_simplification.py +26 -1
  15. water_column_sonar_processing/geometry/pmtile_generation.py +211 -247
  16. water_column_sonar_processing/index/index_manager.py +18 -17
  17. water_column_sonar_processing/model/zarr_manager.py +504 -256
  18. water_column_sonar_processing/processing/__init__.py +3 -2
  19. water_column_sonar_processing/processing/batch_downloader.py +11 -11
  20. water_column_sonar_processing/processing/raw_to_netcdf.py +319 -0
  21. water_column_sonar_processing/processing/raw_to_zarr.py +41 -31
  22. water_column_sonar_processing/utility/__init__.py +9 -2
  23. water_column_sonar_processing/utility/cleaner.py +1 -2
  24. water_column_sonar_processing/utility/constants.py +26 -7
  25. water_column_sonar_processing/utility/timestamp.py +1 -0
  26. water_column_sonar_processing-25.8.0.dist-info/METADATA +162 -0
  27. water_column_sonar_processing-25.8.0.dist-info/RECORD +39 -0
  28. {water_column_sonar_processing-25.3.1.dist-info → water_column_sonar_processing-25.8.0.dist-info}/WHEEL +1 -1
  29. water_column_sonar_processing-25.3.1.dist-info/licenses/LICENSE → water_column_sonar_processing-25.8.0.dist-info/licenses/LICENSE-MIT +1 -1
  30. water_column_sonar_processing-25.3.1.dist-info/METADATA +0 -170
  31. water_column_sonar_processing-25.3.1.dist-info/RECORD +0 -34
  32. {water_column_sonar_processing-25.3.1.dist-info → water_column_sonar_processing-25.8.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,161 @@
+ import os
+ import tempfile
+
+ import numpy as np
+
+ from water_column_sonar_processing.aws import DynamoDBManager, S3Manager
+ from water_column_sonar_processing.model import ZarrManager
+ from water_column_sonar_processing.utility import Cleaner
+
+
+ class CreateEmptyZarrStoreLevel3:
+     #######################################################
+     def __init__(
+         self,
+     ):
+         self.__overwrite = True
+
+     #######################################################
+     # TODO: move this to the s3_manager
+     def upload_zarr_store_to_s3(
+         self,
+         output_bucket_name: str,
+         local_directory: str,
+         object_prefix: str,  # TODO: add level
+         cruise_name: str,
+     ) -> None:
+         print("uploading model store to s3")
+         s3_manager = S3Manager()
+         #
+         print("Starting upload with thread pool executor.")
+         # # 'all_files' is passed a list of lists: [[local_path, s3_key], [...], ...]
+         all_files = []
+         for subdir, dirs, files in os.walk(f"{local_directory}/{cruise_name}.zarr"):
+             for file in files:
+                 local_path = os.path.join(subdir, file)
+                 # TODO: find a better method for splitting strings here:
+                 # 'level_2/Henry_B._Bigelow/HB0806/EK60/HB0806.zarr/.zattrs'
+                 s3_key = f"{object_prefix}/{cruise_name}.zarr{local_path.split(f'{cruise_name}.zarr')[-1]}"
+                 all_files.append([local_path, s3_key])
+         #
+         # print(all_files)
+         s3_manager.upload_files_with_thread_pool_executor(
+             output_bucket_name=output_bucket_name,
+             all_files=all_files,
+         )
+         print("Done uploading with thread pool executor.")
+         # TODO: move to common place
+
+     #######################################################
+     def create_cruise_level_zarr_store_level_3(
+         self,
+         output_bucket_name: str,
+         ship_name: str,
+         cruise_name: str,
+         sensor_name: str,
+         table_name: str,
+     ) -> None:
+         tempdir = tempfile.TemporaryDirectory()
+         try:
+             dynamo_db_manager = DynamoDBManager()
+             s3_manager = S3Manager()
+             df = dynamo_db_manager.get_table_as_df(
+                 table_name=table_name,
+                 cruise_name=cruise_name,
+             )
+
+             # TODO: filter the dataframe just for enums >= LEVEL_1_PROCESSING
+
+             print(f"DataFrame shape: {df.shape}")
+             cruise_channels = list(
+                 set([i for sublist in df["CHANNELS"].dropna() for i in sublist])
+             )
+             cruise_channels.sort()
+
+             consolidated_zarr_width = np.sum(
+                 df["NUM_PING_TIME_DROPNA"].dropna().astype(int)
+             )
+
+             # [3] calculate the max/min measurement resolutions for the whole cruise
+             cruise_min_echo_range = np.min(
+                 (df["MIN_ECHO_RANGE"] + df["WATER_LEVEL"]).dropna().astype(float)
+             )
+
+             # [4] calculate the maximum of the max depth values
+             cruise_max_echo_range = np.max(
+                 (df["MAX_ECHO_RANGE"] + df["WATER_LEVEL"]).dropna().astype(float)
+             )
+             cruise_max_echo_range = np.ceil(cruise_max_echo_range)
+             cruise_min_epsilon = 1.0  # np.min(df["MIN_ECHO_RANGE"].dropna().astype(float))  # TODO: set to 1m
+
+             print(
+                 f"cruise_min_echo_range: {cruise_min_echo_range}, cruise_max_echo_range: {cruise_max_echo_range}"
+             )
+
+             # [5] get number of channels
+             cruise_frequencies = [
+                 float(i) for i in df["FREQUENCIES"].dropna().values.flatten()[0]
+             ]
+             print(cruise_frequencies)
+
+             new_width = int(consolidated_zarr_width)
+             print(f"new_width: {new_width}")
+             #################################################################
+             store_name = f"{cruise_name}.zarr"
+             print(store_name)
+             ################################################################
+             # Delete existing model store if it exists
+             zarr_prefix = os.path.join("level_3", ship_name, cruise_name, sensor_name)
+             child_objects = s3_manager.get_child_objects(
+                 bucket_name=output_bucket_name,
+                 sub_prefix=zarr_prefix,
+             )
+             if len(child_objects) > 0:
+                 s3_manager.delete_nodd_objects(
+                     bucket_name=output_bucket_name,
+                     objects=child_objects,
+                 )
+             ################################################################
+             # Create new model store
+             zarr_manager = ZarrManager()
+             new_height = len(
+                 zarr_manager.get_depth_values(
+                     # min_echo_range=cruise_min_echo_range,
+                     max_echo_range=cruise_max_echo_range,
+                     cruise_min_epsilon=cruise_min_epsilon,
+                 )
+             )
+             print(f"new_height: {new_height}")
+
+             zarr_manager.create_zarr_store_level_3(
+                 path=tempdir.name,  # TODO: need to use .name or problem
+                 ship_name=ship_name,
+                 cruise_name=cruise_name,
+                 sensor_name=sensor_name,
+                 frequencies=cruise_frequencies,
+                 width=new_width,
+                 min_echo_range=cruise_min_echo_range,
+                 max_echo_range=cruise_max_echo_range,
+                 cruise_min_epsilon=cruise_min_epsilon,
+                 calibration_status=True,
+             )
+             #################################################################
+             self.upload_zarr_store_to_s3(
+                 output_bucket_name=output_bucket_name,
+                 local_directory=tempdir.name,  # TODO: need to use .name or problem
+                 object_prefix=zarr_prefix,
+                 cruise_name=cruise_name,
+             )
+             print("Done creating cruise level zarr store.")
+             #################################################################
+         except Exception as err:
+             raise RuntimeError(
+                 f"Problem trying to create new cruise model store, {err}"
+             )
+         finally:
+             cleaner = Cleaner()
+             cleaner.delete_local_files()
+             print("Done creating cruise level model store")
+
+
+ ###########################################################
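The new create_empty_zarr_store_level_3.py module reads a cruise's file-level metadata from DynamoDB, derives the consolidated store dimensions (width from the summed NUM_PING_TIME_DROPNA counts, height from the generated depth vector), initializes an empty level-3 Zarr store in a temporary directory, and uploads it to S3. A minimal driver sketch, assuming the class is exported from the cruise subpackage (consistent with the cruise/__init__.py change listed above) and using placeholder bucket and table names:

    from water_column_sonar_processing.cruise import CreateEmptyZarrStoreLevel3

    creator = CreateEmptyZarrStoreLevel3()
    creator.create_cruise_level_zarr_store_level_3(
        output_bucket_name="example-output-bucket",  # placeholder, not from this package
        ship_name="Henry_B._Bigelow",
        cruise_name="HB0806",
        sensor_name="EK60",
        table_name="example-cruise-table",  # placeholder DynamoDB table name
    )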
@@ -1,21 +1,21 @@
- ### https://xarray-datatree.readthedocs.io/en/latest/data-structures.html
- import xarray as xr
- from datatree import DataTree
-
-
- class DatatreeManager:
-     #######################################################
-     def __init__(
-         self,
-     ):
-         self.dtype = "float32"
-
-     #################################################################
-     def create_datatree(
-         self,
-         input_ds,
-     ) -> None:
-         ds1 = xr.Dataset({"foo": "orange"})
-         dt = DataTree(name="root", data=ds1)  # create root node
-         # ds2 = xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])})
-         return dt
+ # ### https://xarray-datatree.readthedocs.io/en/latest/data-structures.html
+ # import xarray as xr
+ # from datatree import DataTree
+ #
+ #
+ # class DatatreeManager:
+ #     #######################################################
+ #     def __init__(
+ #         self,
+ #     ):
+ #         self.dtype = "float32"
+ #
+ #     #################################################################
+ #     def create_datatree(
+ #         self,
+ #         input_ds,
+ #     ) -> None:
+ #         ds1 = xr.Dataset({"foo": "orange"})
+ #         dt = DataTree(name="root", dataset=ds1)  # create root node
+ #         # ds2 = xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])})
+ #         return dt
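DatatreeManager is now fully commented out, but the one functional edit inside the commented code is the keyword change from DataTree(name="root", data=ds1) to DataTree(name="root", dataset=ds1), which tracks the rename that came with DataTree moving from the standalone datatree package into xarray itself. A sketch of the equivalent modern call, assuming xarray >= 2024.10 where DataTree is built in (this is context, not code from the package):

    import xarray as xr

    ds1 = xr.Dataset({"foo": "orange"})
    dt = xr.DataTree(name="root", dataset=ds1)  # root node; keyword is `dataset`, not `data`
    print(dt)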
@@ -1,4 +1,5 @@
  import gc
+ import warnings
  from pathlib import Path

  import numcodecs
@@ -10,13 +11,15 @@ from water_column_sonar_processing.aws import DynamoDBManager
  from water_column_sonar_processing.geometry import GeometryManager
  from water_column_sonar_processing.model import ZarrManager

+ warnings.simplefilter("ignore", category=RuntimeWarning)
+
  numcodecs.blosc.use_threads = False
  numcodecs.blosc.set_nthreads(1)


  # TODO: when ready switch to version 3 of model spec
  # ZARR_V3_EXPERIMENTAL_API = 1
- # creates the latlon data: foo = ep.consolidate.add_location(ds_Sv, echodata)
+ # creates the latlon dataset: foo = ep.consolidate.add_location(ds_Sv, echodata)


  class ResampleRegrid:
@@ -34,10 +37,13 @@ class ResampleRegrid:
          self,
          input_xr,
          ping_times,
-         all_cruise_depth_values,
-         water_level,
+         all_cruise_depth_values,  # includes water_level offset
+         water_level,  # this is the offset that will be added to each respective file
      ) -> np.ndarray:
-         print("Interpolating data.")
+         """
+         What gets passed into interpolate data
+         """
+         print("Interpolating dataset.")
          try:
              data = np.empty(
                  (
@@ -50,31 +56,38 @@ class ResampleRegrid:

              data[:] = np.nan

-             regrid_resample = xr.DataArray(
+             regrid_resample = xr.DataArray(  # where data will be written to
                  data=data,
                  dims=("depth", "time", "frequency"),
                  coords={
-                     "depth": all_cruise_depth_values,  # TODO: these should be on interval from 7.7 meters to 507 meters
+                     "depth": all_cruise_depth_values,
                      "time": ping_times,
                      "frequency": input_xr.frequency_nominal.values,
                  },
              )

+             # shift the input data by water_level
+             input_xr.echo_range.values = (
+                 input_xr.echo_range.values + water_level
+             )  # water_level  # TODO: change
+
              channels = input_xr.channel.values
              for channel in range(
                  len(channels)
              ):  # ?TODO: leaving off here, need to subset for just indices in time axis
                  gc.collect()
                  max_depths = np.nanmax(
-                     a=input_xr.echo_range.sel(channel=input_xr.channel[channel]).values
-                     + water_level,
+                     a=input_xr.echo_range.sel(channel=input_xr.channel[channel]).values,
+                     # + water_level,
                      axis=1,
                  )
-                 superset_of_max_depths = set(max_depths)
+                 superset_of_max_depths = set(
+                     max_depths
+                 )  # HB1501, D20150503-T102035.raw, TypeError: unhashable type: 'numpy.ndarray'
                  set_of_max_depths = list(
                      {x for x in superset_of_max_depths if x == x}
                  )  # removes nan's
-                 # iterate through partitions of data with similar depths and resample
+                 # iterate through partitions of dataset with similar depths and resample
                  for select_max_depth in set_of_max_depths:
                      # TODO: for nan just skip and leave all nan's
                      select_indices = [
@@ -120,9 +133,8 @@ class ResampleRegrid:
                  print(f"updated {len(times_select)} ping times")
                  gc.collect()
          except Exception as err:
-             print(f"Problem finding the dynamodb table: {err}")
-             raise err
-         print("Done interpolating data.")
+             raise RuntimeError(f"Problem finding the dynamodb table, {err}")
+         print("Done interpolating dataset.")
          return regrid_resample.values.copy()

      #################################################################
@@ -132,18 +144,18 @@ class ResampleRegrid:
          cruise_name,
          sensor_name,
          table_name,
-         # TODO: file_name?,
-         bucket_name,  # TODO: this is the same bucket
+         bucket_name,
          override_select_files=None,
+         # override_cruise_min_epsilon=None,
          endpoint_url=None,
      ) -> None:
          """
-         The goal here is to interpolate the data against the depth values already populated
+         The goal here is to interpolate the dataset against the depth values already populated
          in the existing file level model stores. We open the cruise-level store with model for
          read/write operations. We open the file-level store with Xarray to leverage tools for
-         resampling and subsetting the data.
+         resampling and subsetting the dataset.
          """
-         print("Resample Regrid, Interpolating data.")
+         print("Resample Regrid, Interpolating dataset.")
          try:
              zarr_manager = ZarrManager()
              geo_manager = GeometryManager()
@@ -192,7 +204,7 @@ class ResampleRegrid:
                  ]
              )

-             # Get input store
+             # Get input store — this is unadjusted for water_level
              input_xr_zarr_store = zarr_manager.open_s3_zarr_store_with_xarray(
                  ship_name=ship_name,
                  cruise_name=cruise_name,
@@ -202,12 +214,15 @@ class ResampleRegrid:
                  endpoint_url=endpoint_url,
              )

-             # This is the horizontal offset of the measurement.
+             # This is the vertical offset of the sensor related to the ocean surface
              # See https://echopype.readthedocs.io/en/stable/data-proc-additional.html
-             water_level = input_xr_zarr_store.water_level.values
+             if "water_level" in input_xr_zarr_store.keys():
+                 water_level = input_xr_zarr_store.water_level.values
+             else:
+                 water_level = 0.0
              #########################################################################
-             # [3] Get needed indices
-             # Offset from start index to insert new data. Note that missing values are excluded.
+             # [3] Get needed time indices — along the x-axis
+             # Offset from start index to insert new dataset. Note that missing values are excluded.
              ping_time_cumsum = np.insert(
                  np.cumsum(
                      cruise_df["NUM_PING_TIME_DROPNA"].dropna().to_numpy(dtype=int)
@@ -218,11 +233,6 @@ class ResampleRegrid:
              start_ping_time_index = ping_time_cumsum[index]
              end_ping_time_index = ping_time_cumsum[index + 1]

-             min_echo_range = np.min(
-                 (cruise_df["MIN_ECHO_RANGE"] + cruise_df["WATER_LEVEL"])
-                 .dropna()
-                 .astype(float)
-             )
              max_echo_range = np.max(
                  (cruise_df["MAX_ECHO_RANGE"] + cruise_df["WATER_LEVEL"])
                  .dropna()
@@ -233,9 +243,9 @@ class ResampleRegrid:
              )

              # Note: cruise dims (depth, time, frequency)
-             all_cruise_depth_values = zarr_manager.get_depth_values(
-                 min_echo_range=min_echo_range,
-                 max_echo_range=max_echo_range,
+             all_cruise_depth_values = zarr_manager.get_depth_values(  # needs to integrate water_level
+                 # min_echo_range=min_echo_range,
+                 max_echo_range=max_echo_range,  # does it here
                  cruise_min_epsilon=cruise_min_epsilon,  # remove this & integrate into min_echo_range
              )  # with offset of 7.5 meters, 0 meter measurement should now start at 7.5 meters

@@ -257,7 +267,9 @@ class ResampleRegrid:
                  output_bucket_name=bucket_name,
              )

-             input_xr = input_xr_zarr_store.isel(ping_time=indices)
+             input_xr = input_xr_zarr_store.isel(
+                 ping_time=indices
+             )  # Problem with HB200802-D20080310-T174959.zarr/

              ping_times = input_xr.ping_time.values
              # Date format: numpy.datetime64('2007-07-20T02:10:25.845073920') converts to "1184897425.845074"
@@ -270,13 +282,11 @@ class ResampleRegrid:
              )

              # --- UPDATING --- #
-             regrid_resample = (
-                 self.interpolate_data(  # TODO: need to add water_level here
-                     input_xr=input_xr,
-                     ping_times=ping_times,
-                     all_cruise_depth_values=all_cruise_depth_values,
-                     water_level=water_level,
-                 )
+             regrid_resample = self.interpolate_data(
+                 input_xr=input_xr,
+                 ping_times=ping_times,
+                 all_cruise_depth_values=all_cruise_depth_values,  # should accommodate the water_level already
+                 water_level=water_level,  # not applied to anything yet
              )

              print(
@@ -296,15 +306,16 @@ class ResampleRegrid:
              # TODO: Only checking the first channel for now. Need to average across all channels
              # in the future. See https://github.com/CI-CMG/water-column-sonar-processing/issues/11
              if "detected_seafloor_depth" in input_xr.variables:
-                 print("Found detected_seafloor_depth, adding data to output store.")
+                 print(
+                     "Found detected_seafloor_depth, adding dataset to output store."
+                 )
                  detected_seafloor_depth = input_xr.detected_seafloor_depth.values
                  detected_seafloor_depth[detected_seafloor_depth == 0.0] = np.nan
                  # TODO: problem here: Processing file: D20070711-T210709.

-                 detected_seafloor_depths = np.nanmean(
-                     a=detected_seafloor_depth, axis=0
-                 )
-                 # RuntimeWarning: Mean of empty slice detected_seafloor_depths = np.nanmean(detected_seafloor_depth, 0)
+                 # Use the lowest frequencies to determine bottom
+                 detected_seafloor_depths = detected_seafloor_depth[0, :]
+
                  detected_seafloor_depths[detected_seafloor_depths == 0.0] = np.nan
                  print(f"min depth measured: {np.nanmin(detected_seafloor_depths)}")
                  print(f"max depth measured: {np.nanmax(detected_seafloor_depths)}")
@@ -326,11 +337,10 @@ class ResampleRegrid:
              #########################################################################
              #########################################################################
          except Exception as err:
-             print(f"Problem with resample_regrid: {err}")
-             raise err
+             raise RuntimeError(f"Problem with resample_regrid, {err}")
          finally:
              print("Exiting resample_regrid.")
-             # TODO: read across times and verify data was written?
+             # TODO: read across times and verify dataset was written?

      #######################################################

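Worth noting in resample_regrid: the per-file write offsets come from a cumulative sum over the per-file ping counts, with a leading zero presumably inserted by np.insert so that consecutive pairs bracket each file's slice of the cruise-wide time axis. A standalone illustration with made-up counts (the np.insert arguments are inferred; the hunk above truncates before them):

    import numpy as np

    num_ping_time_dropna = np.array([100, 250, 175])  # hypothetical per-file ping counts
    ping_time_cumsum = np.insert(np.cumsum(num_ping_time_dropna), 0, 0)
    # -> array([  0, 100, 350, 525])
    index = 1  # second file in the cruise
    start_ping_time_index = ping_time_cumsum[index]    # 100
    end_ping_time_index = ping_time_cumsum[index + 1]  # 350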
@@ -0,0 +1,3 @@
+ from .dataset_manager import DatasetManager
+
+ __all__ = ["DatasetManager"]
@@ -0,0 +1,205 @@
+ from typing import Optional
+
+ import numpy as np
+ import xarray as xr
+ import xbatcher
+
+ from water_column_sonar_processing.aws import S3FSManager
+ from water_column_sonar_processing.utility.constants import BatchShape
+
+
+ class DatasetManager:
+     """
+     Dataset manager does three things.
+     1) Opens zarr store in s3 bucket with xarray and returns masked dataset
+     2) Loads Xarray DataSet with Xbatcher
+     3) Loads Xbatcher batches into tensorflow dataset
+     """
+
+     def __init__(
+         self,
+         bucket_name: str,
+         ship_name: str,
+         cruise_name: str,
+         sensor_name: str,
+         endpoint_url: Optional[str] = None,
+     ):
+         self.bucket_name = bucket_name
+         self.ship_name = ship_name
+         self.cruise_name = cruise_name
+         self.sensor_name = sensor_name
+         self.endpoint_url = endpoint_url
+         self.dtype = "float32"
+
+     def open_xarray_dataset(
+         self,
+         mask: bool = True,
+     ) -> xr.Dataset:
+         # Opens Zarr store in s3 bucket as Xarray Dataset and masks as needed
+         try:
+             s3_path = f"s3://{self.bucket_name}/level_2/{self.ship_name}/{self.cruise_name}/{self.sensor_name}/{self.cruise_name}.zarr"
+
+             s3fs_manager = S3FSManager(endpoint_url=self.endpoint_url)
+             store_s3_map = s3fs_manager.s3_map(s3_zarr_store_path=s3_path)
+
+             ds = xr.open_dataset(
+                 filename_or_obj=store_s3_map,
+                 engine="zarr",
+                 # backend_kwargs={'storage_options': {'anon': True}},
+                 chunks={},
+                 cache=False,
+             )
+
+             # Mask all sub-bottom dataset
+             if mask:
+                 return ds.where(ds.depth < ds.bottom)
+
+             return ds
+         except Exception as err:
+             raise RuntimeError(f"Problem opening Zarr store from S3 with Xarray, {err}")
+
+     def vector_indices(
+         self,
+         first_index: int,
+         last_index: int,
+         step: int,
+     ):
+         starts = np.arange(first_index, last_index, step)
+         ends = np.arange(step, last_index + 1, step)
+         return list(zip(starts, ends))
+
+     def dataset_batcher(
+         self,
+     ):
+         """
+         Opens a dataset and creates a generator that returns different chunks of data for processing.
+         # TODO: get subset of cruise
+         # TODO: if beneath bottom skip
+         # TODO: preprocess? scale/normalize?
+         # TODO: add in features
+         # TODO: pass sv dataset
+         """
+         try:
+             # open zarr store
+             # sv_dataset = self.open_xarray_dataset(mask=True)
+
+             # patch_input_dims = {"depth": 1, "time": 2, "frequency": 3}
+
+             # define bounds
+             outline_dims = {"depth": 7, "time": 4, "frequency": 2}
+
+             bottom = np.array([5, np.nan, 3, 2])  # for nan should sample all depths
+
+             for f in self.vector_indices(0, outline_dims["frequency"] + 1, 2):
+                 for t in self.vector_indices(0, outline_dims["time"] + 1, 2):
+                     for d in self.vector_indices(0, outline_dims["depth"] + 1, 2):
+                         indices = f"[d: {d}, t: {t}, f: {f}]"
+
+                         if np.isnan(bottom[t]) or d > bottom[t]:
+                             print("_+_+_+subbottom_+_+_+")
+                             continue
+
+                         yield indices
+             # # generate
+             # for f in np.arange(0, outline_dims['frequency'] + 1, 2):
+             #     for t in np.arange(0, outline_dims['time'] + 1, 2):
+             #         for d in np.arange(0, outline_dims['depth'] + 1, 2):
+             #             indices = f"[d: {d}, t: {t}, f: {f}]"
+             #             # TODO: get subset of cruise
+             #             # TODO: if beneath bottom skip
+             #             if np.isnan(bottom[t]) or d > bottom[t]:
+             #                 print('_+_+_+subbottom_+_+_+')
+             #                 continue
+             #             # TODO: preprocess? scale/normalize?
+             #             # TODO: add in features
+             #             # TODO: pass sv dataset
+             #             yield indices
+
+         except Exception as err:
+             raise RuntimeError(f"Problem defining dataset_batcher, {err}")
+
+     # @deprecated("We cannot use xbatcher")
+     def setup_xbatcher(
+         self,
+         bucket_name: str,
+         ship_name: str,
+         cruise_name: str,
+         sensor_name: str,
+         endpoint_url: str = None,
+     ):
+         # -> xbatcher.generators.BatchGenerator:
+         try:
+             sv_dataset = self.open_xarray_dataset(
+                 bucket_name=bucket_name,
+                 ship_name=ship_name,
+                 cruise_name=cruise_name,
+                 sensor_name=sensor_name,
+                 endpoint_url=endpoint_url,
+             )
+             patch_input_dims = dict(
+                 depth=BatchShape.DEPTH.value,
+                 time=BatchShape.TIME.value,
+                 frequency=BatchShape.FREQUENCY.value,
+             )
+             patch_input_overlap = dict(depth=0, time=0, frequency=0)
+             batch_generator = xbatcher.generators.BatchGenerator(
+                 ds=sv_dataset.Sv,  # TODO: need to get the depth out of this somehow?
+                 input_dims=patch_input_dims,
+                 input_overlap=patch_input_overlap,
+                 # batch_dims={ "depth": 8, "time": 8, "frequency": 4 },  # no idea what this is doing
+                 concat_input_dims=False,
+                 preload_batch=False,  # Load each batch dynamically
+                 cache=None,  # TODO: figure this out
+                 # cache_preprocess=preprocess_batch,  # https://xbatcher.readthedocs.io/en/latest/user-guide/caching.html
+             )
+             return batch_generator
+         except Exception as err:
+             raise RuntimeError(f"Problem setting up xbatcher, {err}")
+
+     # @deprecated("We cannot use xbatcher")
+     # def create_keras_dataloader(
+     #     self,
+     #     bucket_name: str,
+     #     ship_name: str,
+     #     cruise_name: str,
+     #     sensor_name: str,
+     #     endpoint_url: str = None,
+     #     batch_size: int = 3,
+     # ):
+     #     pass
+     #     x_batch_generator = self.setup_xbatcher(
+     #         bucket_name=bucket_name,
+     #         ship_name=ship_name,
+     #         cruise_name=cruise_name,  # TODO: move all these to constructor
+     #         sensor_name=sensor_name,
+     #         endpoint_url=endpoint_url,
+     #     )
+     #
+     #     def transform(
+     #         x,
+     #     ):  # TODO: do clip and normalize here... [-100, 0] w mean at -65, clip?
+     #         # return x + 1e-6  # (x + 50.) / 100.
+     #         # return np.clip(x, -60, -50)
+     #         return (x + 50.) / 100.
+     #
+     #     keras_dataset = xbatcher.loaders.keras.CustomTFDataset(
+     #         X_generator=x_batch_generator,
+     #         y_generator=x_batch_generator,
+     #         transform=transform,
+     #         target_transform=transform,
+     #     )
+     #
+     #     output_signature = tensorflow.TensorSpec(
+     #         shape=(
+     #             BatchShape.DEPTH.value,  # 2
+     #             BatchShape.TIME.value,  # 3
+     #             BatchShape.FREQUENCY.value,  # 4
+     #         ),
+     #         dtype=tensorflow.float32,
+     #     )
+     #     train_dataloader = tensorflow.data.Dataset.from_generator(
+     #         generator=lambda: iter(keras_dataset),
+     #         output_signature=(output_signature, output_signature),
+     #     )
+     #
+     #     return train_dataloader.batch(batch_size=BatchShape.BATCH_SIZE.value)  # 5
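dataset_batcher tiles the (depth, time, frequency) volume using window pairs produced by vector_indices. A standalone sketch of what that helper returns, with its logic copied out verbatim; note the pairing only tiles cleanly when last_index is a multiple of step:

    import numpy as np

    def vector_indices(first_index, last_index, step):
        starts = np.arange(first_index, last_index, step)
        ends = np.arange(step, last_index + 1, step)
        return list(zip(starts, ends))

    print(vector_indices(0, 8, 2))  # [(0, 2), (2, 4), (4, 6), (6, 8)]
    print(vector_indices(0, 3, 2))  # [(0, 2)] (trailing partial window is dropped)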
@@ -0,0 +1,32 @@
+ from typing import Optional
+
+ import xarray as xr
+
+
+ class DatasetManager:
+     """
+     Enrich the dataset with features
+     """
+
+     def __init__(
+         self,
+         bucket_name: str,
+         ship_name: str,
+         cruise_name: str,
+         sensor_name: str,
+         endpoint_url: Optional[str] = None,
+     ):
+         self.bucket_name = bucket_name
+         self.ship_name = ship_name
+         self.cruise_name = cruise_name
+         self.sensor_name = sensor_name
+         self.endpoint_url = endpoint_url
+
+     def add_features(
+         self,
+     ) -> xr.Dataset:
+         # Opens Zarr store in s3 bucket as Xarray Dataset and masks as needed
+         try:
+             pass
+         except Exception as err:
+             raise RuntimeError(f"Problem opening Zarr store from S3 with Xarray, {err}")