water-column-sonar-processing 0.0.4-py3-none-any.whl → 0.0.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- water_column_sonar_processing/__init__.py +16 -0
- water_column_sonar_processing/aws/__init__.py +7 -0
- {aws_manager → water_column_sonar_processing/aws}/dynamodb_manager.py +71 -50
- {aws_manager → water_column_sonar_processing/aws}/s3_manager.py +120 -130
- {aws_manager → water_column_sonar_processing/aws}/s3fs_manager.py +13 -19
- {aws_manager → water_column_sonar_processing/aws}/sns_manager.py +10 -21
- {aws_manager → water_column_sonar_processing/aws}/sqs_manager.py +10 -18
- water_column_sonar_processing/cruise/__init__.py +4 -0
- {cruise → water_column_sonar_processing/cruise}/create_empty_zarr_store.py +62 -44
- {cruise → water_column_sonar_processing/cruise}/resample_regrid.py +117 -66
- water_column_sonar_processing/geometry/__init__.py +5 -0
- {geometry_manager → water_column_sonar_processing/geometry}/geometry_manager.py +80 -49
- {geometry_manager → water_column_sonar_processing/geometry}/geometry_simplification.py +13 -12
- {geometry_manager → water_column_sonar_processing/geometry}/pmtile_generation.py +25 -24
- water_column_sonar_processing/index/__init__.py +3 -0
- {index_manager → water_column_sonar_processing/index}/index_manager.py +106 -82
- water_column_sonar_processing/model/__init__.py +3 -0
- {zarr_manager → water_column_sonar_processing/model}/zarr_manager.py +119 -83
- water_column_sonar_processing/process.py +147 -0
- water_column_sonar_processing/utility/__init__.py +6 -0
- {utility → water_column_sonar_processing/utility}/cleaner.py +6 -7
- water_column_sonar_processing/utility/constants.py +63 -0
- {utility → water_column_sonar_processing/utility}/pipeline_status.py +37 -10
- {utility → water_column_sonar_processing/utility}/timestamp.py +3 -2
- {water_column_sonar_processing-0.0.4.dist-info → water_column_sonar_processing-0.0.6.dist-info}/METADATA +31 -1
- water_column_sonar_processing-0.0.6.dist-info/RECORD +29 -0
- water_column_sonar_processing-0.0.6.dist-info/top_level.txt +1 -0
- __init__.py +0 -0
- aws_manager/__init__.py +0 -4
- cruise/__init__.py +0 -0
- geometry_manager/__init__.py +0 -0
- index_manager/__init__.py +0 -0
- model.py +0 -140
- utility/__init__.py +0 -0
- utility/constants.py +0 -56
- water_column_sonar_processing-0.0.4.dist-info/RECORD +0 -29
- water_column_sonar_processing-0.0.4.dist-info/top_level.txt +0 -8
- zarr_manager/__init__.py +0 -0
- {water_column_sonar_processing-0.0.4.dist-info → water_column_sonar_processing-0.0.6.dist-info}/LICENSE +0 -0
- {water_column_sonar_processing-0.0.4.dist-info → water_column_sonar_processing-0.0.6.dist-info}/WHEEL +0 -0
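The substantive change in this release is the package layout: the flat top-level modules of 0.0.4 (aws_manager, cruise, geometry_manager, index_manager, zarr_manager, utility) now live under a single water_column_sonar_processing namespace, with zarr_manager.py moving into the new model subpackage. A minimal sketch of the import migration for downstream callers; the 0.0.4 paths are inferred from the deleted top-level modules listed above, while the 0.0.6 paths appear verbatim in the diffs below.

# 0.0.4 (flat top-level modules; these paths are inferred from the deleted files)
# from aws_manager.s3_manager import S3Manager
# from zarr_manager.zarr_manager import ZarrManager

# 0.0.6 (single namespaced package)
from water_column_sonar_processing.aws.s3_manager import S3Manager
from water_column_sonar_processing.aws.dynamodb_manager import DynamoDBManager
from water_column_sonar_processing.model.zarr_manager import ZarrManager
from water_column_sonar_processing.utility.cleaner import Cleaner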
{cruise → water_column_sonar_processing/cruise}/create_empty_zarr_store.py

@@ -1,108 +1,123 @@
 import os
+
 import numcodecs
 import numpy as np
 
-from …
-from …
-from …
-from …
+from water_column_sonar_processing.aws.dynamodb_manager import DynamoDBManager
+from water_column_sonar_processing.aws.s3_manager import S3Manager
+from water_column_sonar_processing.model.zarr_manager import ZarrManager
+from water_column_sonar_processing.utility.cleaner import Cleaner
 
 numcodecs.blosc.use_threads = False
 numcodecs.blosc.set_nthreads(1)
 
-TEMPDIR = "/tmp"
-
-# TODO: when ready switch to version 3 of zarr spec
+# TEMPDIR = "/tmp"
+# TODO: when ready switch to version 3 of zarr spec
 # ZARR_V3_EXPERIMENTAL_API = 1
 # creates the latlon data: foo = ep.consolidate.add_location(ds_Sv, echodata)
 
+
 class CreateEmptyZarrStore:
     #######################################################
     def __init__(
-        …
+        self,
     ):
         self.__overwrite = True
-        # TODO: create output_bucket and input_bucket variables here?
         self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
         self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
 
     #######################################################
 
     def upload_zarr_store_to_s3(
-        …
-        …
-        …
-        …
+        self,
+        local_directory: str,
+        object_prefix: str,
+        cruise_name: str,
     ) -> None:
-        print(…
+        print("uploading zarr store to s3")
        s3_manager = S3Manager()
         #
-        print(…
+        print("Starting upload with thread pool executor.")
         # # 'all_files' is passed a list of lists: [[local_path, s3_key], [...], ...]
         all_files = []
-        for subdir, dirs, files in os.walk(…
+        for subdir, dirs, files in os.walk(
+            f"{local_directory}/{cruise_name}.zarr"
+        ):
             for file in files:
                 local_path = os.path.join(subdir, file)
-                # 'level_2/Henry_B._Bigelow/HB0806/EK60/HB0806.…
-                s3_key = f'{object_prefix}/{cruise_name}.…
+                # 'level_2/Henry_B._Bigelow/HB0806/EK60/HB0806.zarr/.zattrs'
+                s3_key = f'{object_prefix}/{cruise_name}.zarr{local_path.split(f"{cruise_name}.zarr")[-1]}'
                 all_files.append([local_path, s3_key])
         #
         # print(all_files)
         s3_manager.upload_files_with_thread_pool_executor(
             all_files=all_files,
         )
-        print(…
+        print("Done uploading with thread pool executor.")
         # TODO: move to common place
 
     #######################################################
     def create_cruise_level_zarr_store(
-        …
-        …
-        …
-        …
-        …
+        self,
+        ship_name: str,
+        cruise_name: str,
+        sensor_name: str,
+        table_name: str,
+        tempdir: str,
     ) -> None:
         try:
             # HB0806 - 123, HB0903 - 220
             dynamo_db_manager = DynamoDBManager()
+            s3_manager = S3Manager()
 
             df = dynamo_db_manager.get_table_as_df(
                 table_name=table_name,
                 ship_name=ship_name,
                 cruise_name=cruise_name,
-                sensor_name=sensor_name
+                sensor_name=sensor_name,
             )
 
-            # filter the dataframe just for enums >= LEVEL_1_PROCESSING
+            # TODO: filter the dataframe just for enums >= LEVEL_1_PROCESSING
             # df[df['PIPELINE_STATUS'] < PipelineStatus.LEVEL_1_PROCESSING] = np.nan
 
             # TODO: VERIFY GEOJSON EXISTS as prerequisite!!!
 
             print(f"DataFrame shape: {df.shape}")
-            cruise_channels = list(…
+            cruise_channels = list(
+                set([i for sublist in df["CHANNELS"].dropna() for i in sublist])
+            )
             cruise_channels.sort()
 
-            consolidated_zarr_width = np.sum(…
+            consolidated_zarr_width = np.sum(
+                df["NUM_PING_TIME_DROPNA"].dropna().astype(int)
+            )
 
             # [3] calculate the max/min measurement resolutions for the whole cruise
-            cruise_min_echo_range = float(…
+            cruise_min_echo_range = float(
+                np.min(df["MIN_ECHO_RANGE"].dropna().astype(float))
+            )
 
             # [4] calculate the maximum of the max depth values
-            cruise_max_echo_range = float(
-                …
+            cruise_max_echo_range = float(
+                np.max(df["MAX_ECHO_RANGE"].dropna().astype(float))
+            )
+            print(
+                f"cruise_min_echo_range: {cruise_min_echo_range}, cruise_max_echo_range: {cruise_max_echo_range}"
+            )
 
             # [5] get number of channels
-            cruise_frequencies = […
+            cruise_frequencies = [
+                float(i) for i in df["FREQUENCIES"].dropna().values.flatten()[0]
+            ]
             print(cruise_frequencies)
 
             new_width = int(consolidated_zarr_width)
             print(f"new_width: {new_width}")
             #################################################################
-            store_name = f"{cruise_name}.…
+            store_name = f"{cruise_name}.zarr"
             print(store_name)
             ################################################################
-            # Delete existing
-            s3_manager = S3Manager()
+            # Delete existing zarr store if it exists
             zarr_prefix = os.path.join("level_2", ship_name, cruise_name, sensor_name)
             child_objects = s3_manager.get_child_objects(
                 bucket_name=self.output_bucket_name,
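In the hunk above, create_cruise_level_zarr_store sizes the cruise-level store from the per-file DynamoDB rows: the time-axis width is the sum of NUM_PING_TIME_DROPNA across files, and the depth axis is later derived from the cruise-wide min/max echo range. A runnable sketch of that sizing arithmetic, using a hypothetical three-file cruise table (column names from the diff; the numbers are invented):

import numpy as np
import pandas as pd

# Hypothetical per-file cruise table with the column names used above.
df = pd.DataFrame(
    {
        "NUM_PING_TIME_DROPNA": [9778, 9778, 4000],
        "MIN_ECHO_RANGE": [0.25, 0.25, 0.5],
        "MAX_ECHO_RANGE": [250.0, 500.0, 100.0],
    }
)

# Width of the consolidated store is the total ping count: 23556.
new_width = int(np.sum(df["NUM_PING_TIME_DROPNA"].dropna().astype(int)))

# Depth extremes over the whole cruise: 0.25 and 500.0.
cruise_min_echo_range = float(np.min(df["MIN_ECHO_RANGE"].dropna().astype(float)))
cruise_max_echo_range = float(np.max(df["MAX_ECHO_RANGE"].dropna().astype(float)))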
@@ -113,16 +128,18 @@ class CreateEmptyZarrStore:
                 objects=child_objects,
             )
             ################################################################
-            # Create new
+            # Create new zarr store
             zarr_manager = ZarrManager()
-            new_height = len(
-                …
-                …
-                …
+            new_height = len(
+                zarr_manager.get_depth_values(
+                    min_echo_range=cruise_min_echo_range,
+                    max_echo_range=cruise_max_echo_range,
+                )
+            )
             print(f"new_height: {new_height}")
 
             zarr_manager.create_zarr_store(
-                path=…
+                path=tempdir,
                 ship_name=ship_name,
                 cruise_name=cruise_name,
                 sensor_name=sensor_name,
@@ -134,7 +151,7 @@ class CreateEmptyZarrStore:
             )
             #################################################################
             self.upload_zarr_store_to_s3(
-                local_directory=…
+                local_directory=tempdir,
                 object_prefix=zarr_prefix,
                 cruise_name=cruise_name,
             )
@@ -157,11 +174,12 @@ class CreateEmptyZarrStore:
             # TODO: update enum in dynamodb
             #################################################################
         except Exception as err:
-            print(f"Problem trying to create new cruise…
+            print(f"Problem trying to create new cruise zarr store: {err}")
         finally:
             cleaner = Cleaner()
             cleaner.delete_local_files()
-            …
+            # TODO: should delete zarr store in temp directory too?
+            print("Done creating cruise level zarr store")
 
 
     ###########################################################
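upload_zarr_store_to_s3 (first hunk of the diff above) mirrors a local Zarr store into S3 by walking the store directory and splitting each local path on the store name, so every file keeps its position relative to the store root. A standalone sketch of the key construction; the paths here are hypothetical, and the prefix mirrors the 'level_2/...' comment in the diff:

cruise_name = "HB0806"
object_prefix = "level_2/Henry_B._Bigelow/HB0806/EK60"

# One file inside the local store, as os.walk would yield it.
local_path = "/tmp/HB0806.zarr/Sv/0.0.0"

# Everything after the store name becomes the key suffix.
s3_key = f'{object_prefix}/{cruise_name}.zarr{local_path.split(f"{cruise_name}.zarr")[-1]}'
# -> "level_2/Henry_B._Bigelow/HB0806/EK60/HB0806.zarr/Sv/0.0.0"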
{cruise → water_column_sonar_processing/cruise}/resample_regrid.py

@@ -1,48 +1,52 @@
 import gc
 import os
 from pathlib import Path
+
 import numcodecs
 import numpy as np
-import xarray as xr
 import pandas as pd
+import xarray as xr
 
-from …
-from …
-from …
-
+from water_column_sonar_processing.aws.dynamodb_manager import DynamoDBManager
+from water_column_sonar_processing.geometry.geometry_manager import GeometryManager
+from water_column_sonar_processing.model.zarr_manager import ZarrManager
 
 numcodecs.blosc.use_threads = False
 numcodecs.blosc.set_nthreads(1)
 
 
-# TODO: when ready switch to version 3 of…
+# TODO: when ready switch to version 3 of zarr spec
 # ZARR_V3_EXPERIMENTAL_API = 1
 # creates the latlon data: foo = ep.consolidate.add_location(ds_Sv, echodata)
 
+
 class ResampleRegrid:
     #######################################################
     def __init__(
-        …
+        self,
     ):
         self.__overwrite = True
         self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
         self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
-        self.dtype = …
+        self.dtype = "float32"
 
     #################################################################
     def interpolate_data(
-        …
-        …
-        …
-        …
+        self,
+        input_xr,
+        ping_times,
+        all_cruise_depth_values,
     ) -> np.ndarray:
         print("Interpolating data.")
         try:
-            data = np.empty(
-                …
-                …
-                …
-                …
+            data = np.empty(
+                (
+                    len(all_cruise_depth_values),
+                    len(ping_times),
+                    len(input_xr.frequency_nominal),
+                ),
+                dtype=self.dtype,
+            )
 
             data[:] = np.nan
 
@@ -53,37 +57,60 @@ class ResampleRegrid:
                     "depth": all_cruise_depth_values,
                     "time": ping_times,
                     "frequency": input_xr.frequency_nominal.values,
-                }
+                },
             )
 
             channels = input_xr.channel.values
-            for channel in range(…
-                …
+            for channel in range(
+                len(channels)
+            ):  # TODO: leaving off here, need to subset for just indices in time axis
+                print(
+                    np.nanmax(
+                        input_xr.echo_range.sel(
+                            channel=input_xr.channel[channel]
+                        ).values
+                    )
+                )
             #
             max_depths = np.nanmax(
                 a=input_xr.echo_range.sel(channel=input_xr.channel[channel]).values,
-                axis=1
+                axis=1,
             )
             superset_of_max_depths = set(
-                np.nanmax(…
+                np.nanmax(
+                    input_xr.echo_range.sel(
+                        channel=input_xr.channel[channel]
+                    ).values,
+                    1,
+                )
             )
-            set_of_max_depths = list(…
+            set_of_max_depths = list(
+                {x for x in superset_of_max_depths if x == x}
+            )  # removes nan's
             # iterate through partitions of data with similar depths and resample
             for select_max_depth in set_of_max_depths:
                 # TODO: for nan just skip and leave all nan's
-                select_indices = […
+                select_indices = [
+                    i
+                    for i in range(0, len(max_depths))
+                    if max_depths[i] == select_max_depth
+                ]
 
                 # now create new DataArray with proper dimension and indices
                 # data_select = input_xr.Sv.sel(
                 #     channel=input_xr.channel[channel]
                 # ).values[select_indices, :].T  # TODO: dont like this transpose
-                data_select = input_xr.Sv.sel(channel=input_xr.channel[channel])[…
+                data_select = input_xr.Sv.sel(channel=input_xr.channel[channel])[
+                    select_indices, :
+                ].T.values
                 # change from ".values[select_indices, :].T" to "[select_indices, :].values.T"
 
                 times_select = input_xr.ping_time.values[select_indices]
                 depths_select = input_xr.echo_range.sel(
                     channel=input_xr.channel[channel]
-                ).values[…
+                ).values[
+                    select_indices[0], :
+                ]  # '0' because all others in group should be same
 
                 da_select = xr.DataArray(
                     data=data_select,
@@ -91,31 +118,36 @@ class ResampleRegrid:
                     coords={
                         "depth": depths_select,
                         "time": times_select,
-                    }
-                ).dropna(dim=…
-                resampled = da_select.interp(…
+                    },
+                ).dropna(dim="depth")
+                resampled = da_select.interp(
+                    depth=all_cruise_depth_values, method="nearest"
+                )
                 # write to the resample array
                 regrid_resample.loc[
-                    dict(…
+                    dict(
+                        time=times_select,
+                        frequency=input_xr.frequency_nominal.values[channel],
+                    )
                 ] = resampled
                 print(f"updated {len(times_select)} ping times")
         except Exception as err:
-            print(f…
+            print(f"Problem finding the dynamodb table: {err}")
             raise err
         print("Done interpolating data.")
         return regrid_resample
 
     #################################################################
     def resample_regrid(
-        …
-        …
-        …
-        …
-        …
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+        table_name,
     ) -> None:
         """
         The goal here is to interpolate the data against the depth values already populated
-        in the existing file level…
+        in the existing file level zarr stores. We open the cruise-level store with zarr for
         read/write operations. We open the file-level store with Xarray to leverage tools for
         resampling and subsetting the data.
         """
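interpolate_data (hunks above) partitions pings by their maximum echo range, builds a (depth, time) DataArray per partition, and snaps each partition onto the shared cruise depth grid with nearest-neighbor interpolation, so a partition shallower than the grid stays NaN below its own maximum depth. A toy sketch of that interp step, with invented values (xarray's method="nearest" path requires scipy):

import numpy as np
import xarray as xr

# Toy (depth, time) partition, shaped like da_select in the hunk above.
da_select = xr.DataArray(
    data=np.arange(8.0).reshape(4, 2),
    dims=("depth", "time"),
    coords={"depth": [0.5, 1.0, 1.5, 2.0], "time": [0, 1]},
)

# Cruise-wide depth grid reaches deeper than this partition.
all_cruise_depth_values = np.arange(0.5, 3.0, 0.5)  # [0.5, 1.0, 1.5, 2.0, 2.5]
resampled = da_select.interp(depth=all_cruise_depth_values, method="nearest")
# Depths covered by the partition are copied through; depth 2.5 comes back
# NaN, matching the NaN-prefilled cruise array that regrid_resample writes into.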
@@ -124,7 +156,7 @@ class ResampleRegrid:
             zarr_manager = ZarrManager()
             # s3_manager = S3Manager()
             geo_manager = GeometryManager()
-            # get…
+            # get zarr store
             output_zarr_store = zarr_manager.open_s3_zarr_store_with_zarr(
                 ship_name=ship_name,
                 cruise_name=cruise_name,
@@ -144,7 +176,7 @@ class ResampleRegrid:
             #########################################################
             #########################################################
             # TODO: iterate files here
-            all_file_names = cruise_df[…
+            all_file_names = cruise_df["FILE_NAME"]
             for file_name in all_file_names:
                 gc.collect()
                 file_name_stem = Path(file_name).stem
@@ -157,8 +189,10 @@ class ResampleRegrid:
                 # TODO: filter rows by enum success, filter the dataframe just for enums >= LEVEL_1_PROCESSING
                 # df[df['PIPELINE_STATUS'] < PipelineStatus.LEVEL_1_PROCESSING] = np.nan
 
-                # Get…
-                index = cruise_df.index[…
+                # Get index from all cruise files. Note: should be based on which are included in cruise.
+                index = cruise_df.index[
+                    cruise_df["FILE_NAME"] == f"{file_name_stem}.raw"
+                ][0]
 
                 # get input store
                 input_xr_zarr_store = zarr_manager.open_s3_zarr_store_with_xarray(
@@ -169,31 +203,40 @@ class ResampleRegrid:
                 )
                 #########################################################################
                 # [3] Get needed indices
-                # Offset from start…
+                # Offset from start index to insert new data. Note that missing values are excluded.
                 ping_time_cumsum = np.insert(
-                    np.cumsum(…
+                    np.cumsum(
+                        cruise_df["NUM_PING_TIME_DROPNA"].dropna().to_numpy(dtype=int)
+                    ),
                     obj=0,
-                    values=0
+                    values=0,
                 )
                 start_ping_time_index = ping_time_cumsum[index]
                 end_ping_time_index = ping_time_cumsum[index + 1]
 
-                min_echo_range = np.nanmin(np.float32(cruise_df[…
-                max_echo_range = np.nanmax(np.float32(cruise_df[…
+                min_echo_range = np.nanmin(np.float32(cruise_df["MIN_ECHO_RANGE"]))
+                max_echo_range = np.nanmax(np.float32(cruise_df["MAX_ECHO_RANGE"]))
 
-                print(…
+                print(
+                    "Creating empty ndarray for Sv data."
+                )  # Note: cruise_zarr dimensions are (depth, time, frequency)
                 cruise_sv_subset = np.empty(
-                    shape=output_zarr_store.Sv[…
+                    shape=output_zarr_store.Sv[
+                        :, start_ping_time_index:end_ping_time_index, :
+                    ].shape
                 )
                 cruise_sv_subset[:, :, :] = np.nan  # (5208, 9778, 4)
 
                 all_cruise_depth_values = zarr_manager.get_depth_values(
-                    min_echo_range=min_echo_range,
-                    max_echo_range=max_echo_range
+                    min_echo_range=min_echo_range, max_echo_range=max_echo_range
                 )
 
                 print(" ".join(list(input_xr_zarr_store.Sv.dims)))
-                if set(input_xr_zarr_store.Sv.dims) != {…
+                if set(input_xr_zarr_store.Sv.dims) != {
+                    "channel",
+                    "ping_time",
+                    "range_sample",
+                }:
                     raise Exception("Xarray dimensions are not as expected.")
 
                 # get geojson
@@ -209,8 +252,13 @@ class ResampleRegrid:
 
                 ping_times = input_xr.ping_time.values
                 # Date format: numpy.datetime64('2007-07-20T02:10:25.845073920') converts to "1184897425.845074"
-                epoch_seconds = […
-                …
+                epoch_seconds = [
+                    (pd.Timestamp(i) - pd.Timestamp("1970-01-01")) / pd.Timedelta("1s")
+                    for i in ping_times
+                ]
+                output_zarr_store.time[start_ping_time_index:end_ping_time_index] = (
+                    epoch_seconds
+                )
 
                 # --- UPDATING --- #
@@ -220,30 +268,33 @@ class ResampleRegrid:
                     all_cruise_depth_values=all_cruise_depth_values,
                 )
 
-                print(…
+                print(
+                    f"start_ping_time_index: {start_ping_time_index}, end_ping_time_index: {end_ping_time_index}"
+                )
 
                 #########################################################################
-                # write Sv values to cruise-level-…
-                for channel in range(…
+                # write Sv values to cruise-level-zarr-store
+                for channel in range(
+                    len(input_xr.channel.values)
+                ):  # doesn't like being written in one fell swoop :(
                     output_zarr_store.Sv[
-                        :,
-                        start_ping_time_index:end_ping_time_index,
-                        channel
+                        :, start_ping_time_index:end_ping_time_index, channel
                     ] = regrid_resample[:, :, channel]
 
                 #########################################################################
                 # [5] write subset of latitude/longitude
-                output_zarr_store.latitude[…
-                    …
-                ].values
-                output_zarr_store.longitude[…
-                    …
-                ].values
+                output_zarr_store.latitude[
+                    start_ping_time_index:end_ping_time_index
+                ] = geospatial.dropna()["latitude"].values
+                output_zarr_store.longitude[
+                    start_ping_time_index:end_ping_time_index
+                ] = geospatial.dropna()["longitude"].values
             except Exception as err:
-                print(f…
+                print(f"Problem interpolating the data: {err}")
                 raise err
             print("Done interpolating data.")
 
     #######################################################
 
+
 ###########################################################
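resample_regrid places each file's pings into the cruise store at a column offset taken from a zero-prefixed cumulative sum of per-file ping counts, and stores ping times as float seconds since the Unix epoch. A sketch of both calculations; the counts are invented, and the timestamp reproduces the conversion noted in the diff's own comment:

import numpy as np
import pandas as pd

# Per-file ping counts -> start/end column offsets in the cruise store.
num_ping_time_dropna = np.array([100, 250, 75])
ping_time_cumsum = np.insert(np.cumsum(num_ping_time_dropna), obj=0, values=0)
# ping_time_cumsum == [0, 100, 350, 425]; file index 1 occupies columns 100:350.
start, end = ping_time_cumsum[1], ping_time_cumsum[2]

# numpy.datetime64 ping time -> float seconds since the Unix epoch.
t = np.datetime64("2007-07-20T02:10:25.845073920")
epoch_seconds = (pd.Timestamp(t) - pd.Timestamp("1970-01-01")) / pd.Timedelta("1s")
# -> 1184897425.845074, matching the comment in the diff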