water-column-sonar-processing 25.3.1__py3-none-any.whl → 25.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (32)
  1. water_column_sonar_processing/aws/dynamodb_manager.py +6 -6
  2. water_column_sonar_processing/aws/s3_manager.py +95 -90
  3. water_column_sonar_processing/aws/s3fs_manager.py +5 -3
  4. water_column_sonar_processing/aws/sqs_manager.py +1 -1
  5. water_column_sonar_processing/cruise/__init__.py +2 -1
  6. water_column_sonar_processing/cruise/create_empty_zarr_store.py +49 -43
  7. water_column_sonar_processing/cruise/create_empty_zarr_store_level_3.py +161 -0
  8. water_column_sonar_processing/cruise/datatree_manager.py +21 -21
  9. water_column_sonar_processing/cruise/resample_regrid.py +57 -47
  10. water_column_sonar_processing/dataset/__init__.py +3 -0
  11. water_column_sonar_processing/dataset/dataset_manager.py +205 -0
  12. water_column_sonar_processing/dataset/feature_manager.py +32 -0
  13. water_column_sonar_processing/geometry/geometry_manager.py +11 -12
  14. water_column_sonar_processing/geometry/line_simplification.py +26 -1
  15. water_column_sonar_processing/geometry/pmtile_generation.py +211 -247
  16. water_column_sonar_processing/index/index_manager.py +18 -17
  17. water_column_sonar_processing/model/zarr_manager.py +504 -256
  18. water_column_sonar_processing/processing/__init__.py +3 -2
  19. water_column_sonar_processing/processing/batch_downloader.py +11 -11
  20. water_column_sonar_processing/processing/raw_to_netcdf.py +319 -0
  21. water_column_sonar_processing/processing/raw_to_zarr.py +41 -31
  22. water_column_sonar_processing/utility/__init__.py +9 -2
  23. water_column_sonar_processing/utility/cleaner.py +1 -2
  24. water_column_sonar_processing/utility/constants.py +26 -7
  25. water_column_sonar_processing/utility/timestamp.py +1 -0
  26. water_column_sonar_processing-25.8.0.dist-info/METADATA +162 -0
  27. water_column_sonar_processing-25.8.0.dist-info/RECORD +39 -0
  28. {water_column_sonar_processing-25.3.1.dist-info → water_column_sonar_processing-25.8.0.dist-info}/WHEEL +1 -1
  29. water_column_sonar_processing-25.3.1.dist-info/licenses/LICENSE → water_column_sonar_processing-25.8.0.dist-info/licenses/LICENSE-MIT +1 -1
  30. water_column_sonar_processing-25.3.1.dist-info/METADATA +0 -170
  31. water_column_sonar_processing-25.3.1.dist-info/RECORD +0 -34
  32. {water_column_sonar_processing-25.3.1.dist-info → water_column_sonar_processing-25.8.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,161 @@
+ import os
+ import tempfile
+
+ import numpy as np
+
+ from water_column_sonar_processing.aws import DynamoDBManager, S3Manager
+ from water_column_sonar_processing.model import ZarrManager
+ from water_column_sonar_processing.utility import Cleaner
+
+
+ class CreateEmptyZarrStoreLevel3:
+     #######################################################
+     def __init__(
+         self,
+     ):
+         self.__overwrite = True
+
+     #######################################################
+     # TODO: move this to the s3_manager
+     def upload_zarr_store_to_s3(
+         self,
+         output_bucket_name: str,
+         local_directory: str,
+         object_prefix: str,  # TODO: add level
+         cruise_name: str,
+     ) -> None:
+         print("uploading model store to s3")
+         s3_manager = S3Manager()
+         #
+         print("Starting upload with thread pool executor.")
+         # # 'all_files' is passed a list of lists: [[local_path, s3_key], [...], ...]
+         all_files = []
+         for subdir, dirs, files in os.walk(f"{local_directory}/{cruise_name}.zarr"):
+             for file in files:
+                 local_path = os.path.join(subdir, file)
+                 # TODO: find a better method for splitting strings here:
+                 # 'level_2/Henry_B._Bigelow/HB0806/EK60/HB0806.zarr/.zattrs'
+                 s3_key = f"{object_prefix}/{cruise_name}.zarr{local_path.split(f'{cruise_name}.zarr')[-1]}"
+                 all_files.append([local_path, s3_key])
+         #
+         # print(all_files)
+         s3_manager.upload_files_with_thread_pool_executor(
+             output_bucket_name=output_bucket_name,
+             all_files=all_files,
+         )
+         print("Done uploading with thread pool executor.")
+         # TODO: move to common place
+
+     #######################################################
+     def create_cruise_level_zarr_store_level_3(
+         self,
+         output_bucket_name: str,
+         ship_name: str,
+         cruise_name: str,
+         sensor_name: str,
+         table_name: str,
+     ) -> None:
+         tempdir = tempfile.TemporaryDirectory()
+         try:
+             dynamo_db_manager = DynamoDBManager()
+             s3_manager = S3Manager()
+             df = dynamo_db_manager.get_table_as_df(
+                 table_name=table_name,
+                 cruise_name=cruise_name,
+             )
+
+             # TODO: filter the dataframe just for enums >= LEVEL_1_PROCESSING
+
+             print(f"DataFrame shape: {df.shape}")
+             cruise_channels = list(
+                 set([i for sublist in df["CHANNELS"].dropna() for i in sublist])
+             )
+             cruise_channels.sort()
+
+             consolidated_zarr_width = np.sum(
+                 df["NUM_PING_TIME_DROPNA"].dropna().astype(int)
+             )
+
+             # [3] calculate the max/min measurement resolutions for the whole cruise
+             cruise_min_echo_range = np.min(
+                 (df["MIN_ECHO_RANGE"] + df["WATER_LEVEL"]).dropna().astype(float)
+             )
+
+             # [4] calculate the maximum of the max depth values
+             cruise_max_echo_range = np.max(
+                 (df["MAX_ECHO_RANGE"] + df["WATER_LEVEL"]).dropna().astype(float)
+             )
+             cruise_max_echo_range = np.ceil(cruise_max_echo_range)
+             cruise_min_epsilon = 1.0  # np.min(df["MIN_ECHO_RANGE"].dropna().astype(float))  # TODO: set to 1m
+
+             print(
+                 f"cruise_min_echo_range: {cruise_min_echo_range}, cruise_max_echo_range: {cruise_max_echo_range}"
+             )
+
+             # [5] get number of channels
+             cruise_frequencies = [
+                 float(i) for i in df["FREQUENCIES"].dropna().values.flatten()[0]
+             ]
+             print(cruise_frequencies)
+
+             new_width = int(consolidated_zarr_width)
+             print(f"new_width: {new_width}")
+             #################################################################
+             store_name = f"{cruise_name}.zarr"
+             print(store_name)
+             ################################################################
+             # Delete existing model store if it exists
+             zarr_prefix = os.path.join("level_3", ship_name, cruise_name, sensor_name)
+             child_objects = s3_manager.get_child_objects(
+                 bucket_name=output_bucket_name,
+                 sub_prefix=zarr_prefix,
+             )
+             if len(child_objects) > 0:
+                 s3_manager.delete_nodd_objects(
+                     bucket_name=output_bucket_name,
+                     objects=child_objects,
+                 )
+             ################################################################
+             # Create new model store
+             zarr_manager = ZarrManager()
+             new_height = len(
+                 zarr_manager.get_depth_values(
+                     # min_echo_range=cruise_min_echo_range,
+                     max_echo_range=cruise_max_echo_range,
+                     cruise_min_epsilon=cruise_min_epsilon,
+                 )
+             )
+             print(f"new_height: {new_height}")
+
+             zarr_manager.create_zarr_store_level_3(
+                 path=tempdir.name,  # TODO: need to use .name or problem
+                 ship_name=ship_name,
+                 cruise_name=cruise_name,
+                 sensor_name=sensor_name,
+                 frequencies=cruise_frequencies,
+                 width=new_width,
+                 min_echo_range=cruise_min_echo_range,
+                 max_echo_range=cruise_max_echo_range,
+                 cruise_min_epsilon=cruise_min_epsilon,
+                 calibration_status=True,
+             )
+             #################################################################
+             self.upload_zarr_store_to_s3(
+                 output_bucket_name=output_bucket_name,
+                 local_directory=tempdir.name,  # TODO: need to use .name or problem
+                 object_prefix=zarr_prefix,
+                 cruise_name=cruise_name,
+             )
+             print("Done creating cruise level zarr store.")
+             #################################################################
+         except Exception as err:
+             raise RuntimeError(
+                 f"Problem trying to create new cruise model store, {err}"
+             )
+         finally:
+             cleaner = Cleaner()
+             cleaner.delete_local_files()
+             print("Done creating cruise level model store")
+
+
+ ###########################################################
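The new create_empty_zarr_store_level_3.py module reads a cruise's file-level metadata from DynamoDB, derives the consolidated store dimensions (width from the summed NUM_PING_TIME_DROPNA counts, height from the generated depth vector), initializes an empty level-3 Zarr store in a temporary directory, and uploads it to S3. A minimal driver sketch, assuming the class is exported from the cruise subpackage (consistent with the cruise/__init__.py change listed above) and using placeholder bucket and table names:

    from water_column_sonar_processing.cruise import CreateEmptyZarrStoreLevel3

    creator = CreateEmptyZarrStoreLevel3()
    creator.create_cruise_level_zarr_store_level_3(
        output_bucket_name="example-output-bucket",  # placeholder, not from this package
        ship_name="Henry_B._Bigelow",
        cruise_name="HB0806",
        sensor_name="EK60",
        table_name="example-cruise-table",  # placeholder DynamoDB table name
    )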
@@ -1,21 +1,21 @@
- ### https://xarray-datatree.readthedocs.io/en/latest/data-structures.html
- import xarray as xr
- from datatree import DataTree
-
-
- class DatatreeManager:
-     #######################################################
-     def __init__(
-         self,
-     ):
-         self.dtype = "float32"
-
-     #################################################################
-     def create_datatree(
-         self,
-         input_ds,
-     ) -> None:
-         ds1 = xr.Dataset({"foo": "orange"})
-         dt = DataTree(name="root", data=ds1)  # create root node
-         # ds2 = xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])})
-         return dt
+ # ### https://xarray-datatree.readthedocs.io/en/latest/data-structures.html
+ # import xarray as xr
+ # from datatree import DataTree
+ #
+ #
+ # class DatatreeManager:
+ #     #######################################################
+ #     def __init__(
+ #         self,
+ #     ):
+ #         self.dtype = "float32"
+ #
+ #     #################################################################
+ #     def create_datatree(
+ #         self,
+ #         input_ds,
+ #     ) -> None:
+ #         ds1 = xr.Dataset({"foo": "orange"})
+ #         dt = DataTree(name="root", dataset=ds1)  # create root node
+ #         # ds2 = xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])})
+ #         return dt
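DatatreeManager is now fully commented out, but the one functional edit inside the commented code is the keyword change from DataTree(name="root", data=ds1) to DataTree(name="root", dataset=ds1), which tracks the rename that came with DataTree moving from the standalone datatree package into xarray itself. A sketch of the equivalent modern call, assuming xarray >= 2024.10 where DataTree is built in (this is context, not code from the package):

    import xarray as xr

    ds1 = xr.Dataset({"foo": "orange"})
    dt = xr.DataTree(name="root", dataset=ds1)  # root node; keyword is `dataset`, not `data`
    print(dt)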
@@ -1,4 +1,5 @@
  import gc
+ import warnings
  from pathlib import Path

  import numcodecs
@@ -10,13 +11,15 @@ from water_column_sonar_processing.aws import DynamoDBManager
  from water_column_sonar_processing.geometry import GeometryManager
  from water_column_sonar_processing.model import ZarrManager

+ warnings.simplefilter("ignore", category=RuntimeWarning)
+
  numcodecs.blosc.use_threads = False
  numcodecs.blosc.set_nthreads(1)


  # TODO: when ready switch to version 3 of model spec
  # ZARR_V3_EXPERIMENTAL_API = 1
- # creates the latlon data: foo = ep.consolidate.add_location(ds_Sv, echodata)
+ # creates the latlon dataset: foo = ep.consolidate.add_location(ds_Sv, echodata)


  class ResampleRegrid:
@@ -34,10 +37,13 @@ class ResampleRegrid:
          self,
          input_xr,
          ping_times,
-         all_cruise_depth_values,
-         water_level,
+         all_cruise_depth_values,  # includes water_level offset
+         water_level,  # this is the offset that will be added to each respective file
      ) -> np.ndarray:
-         print("Interpolating data.")
+         """
+         What gets passed into interpolate data
+         """
+         print("Interpolating dataset.")
          try:
              data = np.empty(
                  (
@@ -50,31 +56,38 @@ class ResampleRegrid:

              data[:] = np.nan

-             regrid_resample = xr.DataArray(
+             regrid_resample = xr.DataArray(  # where data will be written to
                  data=data,
                  dims=("depth", "time", "frequency"),
                  coords={
-                     "depth": all_cruise_depth_values,  # TODO: these should be on interval from 7.7 meters to 507 meters
+                     "depth": all_cruise_depth_values,
                      "time": ping_times,
                      "frequency": input_xr.frequency_nominal.values,
                  },
              )

+             # shift the input data by water_level
+             input_xr.echo_range.values = (
+                 input_xr.echo_range.values + water_level
+             )  # water_level  # TODO: change
+
              channels = input_xr.channel.values
              for channel in range(
                  len(channels)
              ):  # ?TODO: leaving off here, need to subset for just indices in time axis
                  gc.collect()
                  max_depths = np.nanmax(
-                     a=input_xr.echo_range.sel(channel=input_xr.channel[channel]).values
-                     + water_level,
+                     a=input_xr.echo_range.sel(channel=input_xr.channel[channel]).values,
+                     # + water_level,
                      axis=1,
                  )
-                 superset_of_max_depths = set(max_depths)
+                 superset_of_max_depths = set(
+                     max_depths
+                 )  # HB1501, D20150503-T102035.raw, TypeError: unhashable type: 'numpy.ndarray'
                  set_of_max_depths = list(
                      {x for x in superset_of_max_depths if x == x}
                  )  # removes nan's
-                 # iterate through partitions of data with similar depths and resample
+                 # iterate through partitions of dataset with similar depths and resample
                  for select_max_depth in set_of_max_depths:
                      # TODO: for nan just skip and leave all nan's
                      select_indices = [
@@ -120,9 +133,8 @@ class ResampleRegrid:
                  print(f"updated {len(times_select)} ping times")
                  gc.collect()
          except Exception as err:
-             print(f"Problem finding the dynamodb table: {err}")
-             raise err
-         print("Done interpolating data.")
+             raise RuntimeError(f"Problem finding the dynamodb table, {err}")
+         print("Done interpolating dataset.")
          return regrid_resample.values.copy()

      #################################################################
@@ -132,18 +144,18 @@ class ResampleRegrid:
          cruise_name,
          sensor_name,
          table_name,
-         # TODO: file_name?,
-         bucket_name,  # TODO: this is the same bucket
+         bucket_name,
          override_select_files=None,
+         # override_cruise_min_epsilon=None,
          endpoint_url=None,
      ) -> None:
          """
-         The goal here is to interpolate the data against the depth values already populated
+         The goal here is to interpolate the dataset against the depth values already populated
          in the existing file level model stores. We open the cruise-level store with model for
          read/write operations. We open the file-level store with Xarray to leverage tools for
-         resampling and subsetting the data.
+         resampling and subsetting the dataset.
          """
-         print("Resample Regrid, Interpolating data.")
+         print("Resample Regrid, Interpolating dataset.")
          try:
              zarr_manager = ZarrManager()
              geo_manager = GeometryManager()
@@ -192,7 +204,7 @@ class ResampleRegrid:
                  ]
              )

-             # Get input store
+             # Get input store — this is unadjusted for water_level
              input_xr_zarr_store = zarr_manager.open_s3_zarr_store_with_xarray(
                  ship_name=ship_name,
                  cruise_name=cruise_name,
@@ -202,12 +214,15 @@ class ResampleRegrid:
                  endpoint_url=endpoint_url,
              )

-             # This is the horizontal offset of the measurement.
+             # This is the vertical offset of the sensor related to the ocean surface
              # See https://echopype.readthedocs.io/en/stable/data-proc-additional.html
-             water_level = input_xr_zarr_store.water_level.values
+             if "water_level" in input_xr_zarr_store.keys():
+                 water_level = input_xr_zarr_store.water_level.values
+             else:
+                 water_level = 0.0
              #########################################################################
-             # [3] Get needed indices
-             # Offset from start index to insert new data. Note that missing values are excluded.
+             # [3] Get needed time indices — along the x-axis
+             # Offset from start index to insert new dataset. Note that missing values are excluded.
              ping_time_cumsum = np.insert(
                  np.cumsum(
                      cruise_df["NUM_PING_TIME_DROPNA"].dropna().to_numpy(dtype=int)
@@ -218,11 +233,6 @@ class ResampleRegrid:
              start_ping_time_index = ping_time_cumsum[index]
              end_ping_time_index = ping_time_cumsum[index + 1]

-             min_echo_range = np.min(
-                 (cruise_df["MIN_ECHO_RANGE"] + cruise_df["WATER_LEVEL"])
-                 .dropna()
-                 .astype(float)
-             )
              max_echo_range = np.max(
                  (cruise_df["MAX_ECHO_RANGE"] + cruise_df["WATER_LEVEL"])
                  .dropna()
@@ -233,9 +243,9 @@ class ResampleRegrid:
              )

              # Note: cruise dims (depth, time, frequency)
-             all_cruise_depth_values = zarr_manager.get_depth_values(
-                 min_echo_range=min_echo_range,
-                 max_echo_range=max_echo_range,
+             all_cruise_depth_values = zarr_manager.get_depth_values(  # needs to integrate water_level
+                 # min_echo_range=min_echo_range,
+                 max_echo_range=max_echo_range,  # does it here
                  cruise_min_epsilon=cruise_min_epsilon,  # remove this & integrate into min_echo_range
              )  # with offset of 7.5 meters, 0 meter measurement should now start at 7.5 meters

@@ -257,7 +267,9 @@ class ResampleRegrid:
                  output_bucket_name=bucket_name,
              )

-             input_xr = input_xr_zarr_store.isel(ping_time=indices)
+             input_xr = input_xr_zarr_store.isel(
+                 ping_time=indices
+             )  # Problem with HB200802-D20080310-T174959.zarr/

              ping_times = input_xr.ping_time.values
              # Date format: numpy.datetime64('2007-07-20T02:10:25.845073920') converts to "1184897425.845074"
@@ -270,13 +282,11 @@ class ResampleRegrid:
              )

              # --- UPDATING --- #
-             regrid_resample = (
-                 self.interpolate_data(  # TODO: need to add water_level here
-                     input_xr=input_xr,
-                     ping_times=ping_times,
-                     all_cruise_depth_values=all_cruise_depth_values,
-                     water_level=water_level,
-                 )
+             regrid_resample = self.interpolate_data(
+                 input_xr=input_xr,
+                 ping_times=ping_times,
+                 all_cruise_depth_values=all_cruise_depth_values,  # should accommodate the water_level already
+                 water_level=water_level,  # not applied to anything yet
              )

              print(
@@ -296,15 +306,16 @@ class ResampleRegrid:
              # TODO: Only checking the first channel for now. Need to average across all channels
              # in the future. See https://github.com/CI-CMG/water-column-sonar-processing/issues/11
              if "detected_seafloor_depth" in input_xr.variables:
-                 print("Found detected_seafloor_depth, adding data to output store.")
+                 print(
+                     "Found detected_seafloor_depth, adding dataset to output store."
+                 )
                  detected_seafloor_depth = input_xr.detected_seafloor_depth.values
                  detected_seafloor_depth[detected_seafloor_depth == 0.0] = np.nan
                  # TODO: problem here: Processing file: D20070711-T210709.

-                 detected_seafloor_depths = np.nanmean(
-                     a=detected_seafloor_depth, axis=0
-                 )
-                 # RuntimeWarning: Mean of empty slice detected_seafloor_depths = np.nanmean(detected_seafloor_depth, 0)
+                 # Use the lowest frequencies to determine bottom
+                 detected_seafloor_depths = detected_seafloor_depth[0, :]
+
                  detected_seafloor_depths[detected_seafloor_depths == 0.0] = np.nan
                  print(f"min depth measured: {np.nanmin(detected_seafloor_depths)}")
                  print(f"max depth measured: {np.nanmax(detected_seafloor_depths)}")
@@ -326,11 +337,10 @@ class ResampleRegrid:
              #########################################################################
              #########################################################################
          except Exception as err:
-             print(f"Problem with resample_regrid: {err}")
-             raise err
+             raise RuntimeError(f"Problem with resample_regrid, {err}")
          finally:
              print("Exiting resample_regrid.")
-             # TODO: read across times and verify data was written?
+             # TODO: read across times and verify dataset was written?

      #######################################################

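Worth noting in resample_regrid: the per-file write offsets come from a cumulative sum over the per-file ping counts, with a leading zero presumably inserted by np.insert so that consecutive pairs bracket each file's slice of the cruise-wide time axis. A standalone illustration with made-up counts (the np.insert arguments are inferred; the hunk above truncates before them):

    import numpy as np

    num_ping_time_dropna = np.array([100, 250, 175])  # hypothetical per-file ping counts
    ping_time_cumsum = np.insert(np.cumsum(num_ping_time_dropna), 0, 0)
    # -> array([  0, 100, 350, 525])
    index = 1  # second file in the cruise
    start_ping_time_index = ping_time_cumsum[index]    # 100
    end_ping_time_index = ping_time_cumsum[index + 1]  # 350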
@@ -0,0 +1,3 @@
+ from .dataset_manager import DatasetManager
+
+ __all__ = ["DatasetManager"]
@@ -0,0 +1,205 @@
+ from typing import Optional
+
+ import numpy as np
+ import xarray as xr
+ import xbatcher
+
+ from water_column_sonar_processing.aws import S3FSManager
+ from water_column_sonar_processing.utility.constants import BatchShape
+
+
+ class DatasetManager:
+     """
+     Dataset manager does three things.
+     1) Opens zarr store in s3 bucket with xarray and returns masked dataset
+     2) Loads Xarray DataSet with Xbatcher
+     3) Loads Xbatcher batches into tensorflow dataset
+     """
+
+     def __init__(
+         self,
+         bucket_name: str,
+         ship_name: str,
+         cruise_name: str,
+         sensor_name: str,
+         endpoint_url: Optional[str] = None,
+     ):
+         self.bucket_name = bucket_name
+         self.ship_name = ship_name
+         self.cruise_name = cruise_name
+         self.sensor_name = sensor_name
+         self.endpoint_url = endpoint_url
+         self.dtype = "float32"
+
+     def open_xarray_dataset(
+         self,
+         mask: bool = True,
+     ) -> xr.Dataset:
+         # Opens Zarr store in s3 bucket as Xarray Dataset and masks as needed
+         try:
+             s3_path = f"s3://{self.bucket_name}/level_2/{self.ship_name}/{self.cruise_name}/{self.sensor_name}/{self.cruise_name}.zarr"
+
+             s3fs_manager = S3FSManager(endpoint_url=self.endpoint_url)
+             store_s3_map = s3fs_manager.s3_map(s3_zarr_store_path=s3_path)
+
+             ds = xr.open_dataset(
+                 filename_or_obj=store_s3_map,
+                 engine="zarr",
+                 # backend_kwargs={'storage_options': {'anon': True}},
+                 chunks={},
+                 cache=False,
+             )
+
+             # Mask all sub-bottom dataset
+             if mask:
+                 return ds.where(ds.depth < ds.bottom)
+
+             return ds
+         except Exception as err:
+             raise RuntimeError(f"Problem opening Zarr store from S3 with Xarray, {err}")
+
+     def vector_indices(
+         self,
+         first_index: int,
+         last_index: int,
+         step: int,
+     ):
+         starts = np.arange(first_index, last_index, step)
+         ends = np.arange(step, last_index + 1, step)
+         return list(zip(starts, ends))
+
+     def dataset_batcher(
+         self,
+     ):
+         """
+         Opens a dataset and creates a generator that returns different chunks of data for processing.
+         # TODO: get subset of cruise
+         # TODO: if beneath bottom skip
+         # TODO: preprocess? scale/normalize?
+         # TODO: add in features
+         # TODO: pass sv dataset
+         """
+         try:
+             # open zarr store
+             # sv_dataset = self.open_xarray_dataset(mask=True)
+
+             # patch_input_dims = {"depth": 1, "time": 2, "frequency": 3}
+
+             # define bounds
+             outline_dims = {"depth": 7, "time": 4, "frequency": 2}
+
+             bottom = np.array([5, np.nan, 3, 2])  # for nan should sample all depths
+
+             for f in self.vector_indices(0, outline_dims["frequency"] + 1, 2):
+                 for t in self.vector_indices(0, outline_dims["time"] + 1, 2):
+                     for d in self.vector_indices(0, outline_dims["depth"] + 1, 2):
+                         indices = f"[d: {d}, t: {t}, f: {f}]"
+
+                         if np.isnan(bottom[t]) or d > bottom[t]:
+                             print("_+_+_+subbottom_+_+_+")
+                             continue
+
+                         yield indices
+             # # generate
+             # for f in np.arange(0, outline_dims['frequency'] + 1, 2):
+             #     for t in np.arange(0, outline_dims['time'] + 1, 2):
+             #         for d in np.arange(0, outline_dims['depth'] + 1, 2):
+             #             indices = f"[d: {d}, t: {t}, f: {f}]"
+             #             # TODO: get subset of cruise
+             #             # TODO: if beneath bottom skip
+             #             if np.isnan(bottom[t]) or d > bottom[t]:
+             #                 print('_+_+_+subbottom_+_+_+')
+             #                 continue
+             #             # TODO: preprocess? scale/normalize?
+             #             # TODO: add in features
+             #             # TODO: pass sv dataset
+             #             yield indices
+
+         except Exception as err:
+             raise RuntimeError(f"Problem defining dataset_batcher, {err}")
+
+     # @deprecated("We cannot use xbatcher")
+     def setup_xbatcher(
+         self,
+         bucket_name: str,
+         ship_name: str,
+         cruise_name: str,
+         sensor_name: str,
+         endpoint_url: str = None,
+     ):
+         # -> xbatcher.generators.BatchGenerator:
+         try:
+             sv_dataset = self.open_xarray_dataset(
+                 bucket_name=bucket_name,
+                 ship_name=ship_name,
+                 cruise_name=cruise_name,
+                 sensor_name=sensor_name,
+                 endpoint_url=endpoint_url,
+             )
+             patch_input_dims = dict(
+                 depth=BatchShape.DEPTH.value,
+                 time=BatchShape.TIME.value,
+                 frequency=BatchShape.FREQUENCY.value,
+             )
+             patch_input_overlap = dict(depth=0, time=0, frequency=0)
+             batch_generator = xbatcher.generators.BatchGenerator(
+                 ds=sv_dataset.Sv,  # TODO: need to get the depth out of this somehow?
+                 input_dims=patch_input_dims,
+                 input_overlap=patch_input_overlap,
+                 # batch_dims={ "depth": 8, "time": 8, "frequency": 4 },  # no idea what this is doing
+                 concat_input_dims=False,
+                 preload_batch=False,  # Load each batch dynamically
+                 cache=None,  # TODO: figure this out
+                 # cache_preprocess=preprocess_batch,  # https://xbatcher.readthedocs.io/en/latest/user-guide/caching.html
+             )
+             return batch_generator
+         except Exception as err:
+             raise RuntimeError(f"Problem setting up xbatcher, {err}")
+
+     # @deprecated("We cannot use xbatcher")
+     # def create_keras_dataloader(
+     #     self,
+     #     bucket_name: str,
+     #     ship_name: str,
+     #     cruise_name: str,
+     #     sensor_name: str,
+     #     endpoint_url: str = None,
+     #     batch_size: int = 3,
+     # ):
+     #     pass
+     #     x_batch_generator = self.setup_xbatcher(
+     #         bucket_name=bucket_name,
+     #         ship_name=ship_name,
+     #         cruise_name=cruise_name,  # TODO: move all these to constructor
+     #         sensor_name=sensor_name,
+     #         endpoint_url=endpoint_url,
+     #     )
+     #
+     #     def transform(
+     #         x,
+     #     ):  # TODO: do clip and normalize here... [-100, 0] w mean at -65, clip?
+     #         # return x + 1e-6  # (x + 50.) / 100.
+     #         # return np.clip(x, -60, -50)
+     #         return (x + 50.) / 100.
+     #
+     #     keras_dataset = xbatcher.loaders.keras.CustomTFDataset(
+     #         X_generator=x_batch_generator,
+     #         y_generator=x_batch_generator,
+     #         transform=transform,
+     #         target_transform=transform,
+     #     )
+     #
+     #     output_signature = tensorflow.TensorSpec(
+     #         shape=(
+     #             BatchShape.DEPTH.value,  # 2
+     #             BatchShape.TIME.value,  # 3
+     #             BatchShape.FREQUENCY.value,  # 4
+     #         ),
+     #         dtype=tensorflow.float32,
+     #     )
+     #     train_dataloader = tensorflow.data.Dataset.from_generator(
+     #         generator=lambda: iter(keras_dataset),
+     #         output_signature=(output_signature, output_signature),
+     #     )
+     #
+     #     return train_dataloader.batch(batch_size=BatchShape.BATCH_SIZE.value)  # 5
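dataset_batcher tiles the (depth, time, frequency) volume using window pairs produced by vector_indices. A standalone sketch of what that helper returns, with its logic copied out verbatim; note the pairing only tiles cleanly when last_index is a multiple of step:

    import numpy as np

    def vector_indices(first_index, last_index, step):
        starts = np.arange(first_index, last_index, step)
        ends = np.arange(step, last_index + 1, step)
        return list(zip(starts, ends))

    print(vector_indices(0, 8, 2))  # [(0, 2), (2, 4), (4, 6), (6, 8)]
    print(vector_indices(0, 3, 2))  # [(0, 2)] (trailing partial window is dropped)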
@@ -0,0 +1,32 @@
+ from typing import Optional
+
+ import xarray as xr
+
+
+ class DatasetManager:
+     """
+     Enrich the dataset with features
+     """
+
+     def __init__(
+         self,
+         bucket_name: str,
+         ship_name: str,
+         cruise_name: str,
+         sensor_name: str,
+         endpoint_url: Optional[str] = None,
+     ):
+         self.bucket_name = bucket_name
+         self.ship_name = ship_name
+         self.cruise_name = cruise_name
+         self.sensor_name = sensor_name
+         self.endpoint_url = endpoint_url
+
+     def add_features(
+         self,
+     ) -> xr.Dataset:
+         # Opens Zarr store in s3 bucket as Xarray Dataset and masks as needed
+         try:
+             pass
+         except Exception as err:
+             raise RuntimeError(f"Problem opening Zarr store from S3 with Xarray, {err}")