water-column-sonar-processing 25.11.1__py3-none-any.whl → 26.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of water-column-sonar-processing might be problematic.

Files changed (22)
  1. water_column_sonar_processing/aws/s3_manager.py +2 -4
  2. water_column_sonar_processing/aws/s3fs_manager.py +1 -9
  3. water_column_sonar_processing/cruise/create_empty_zarr_store.py +19 -81
  4. water_column_sonar_processing/cruise/resample_regrid.py +88 -104
  5. water_column_sonar_processing/geometry/__init__.py +2 -0
  6. water_column_sonar_processing/geometry/elevation_manager.py +2 -2
  7. water_column_sonar_processing/geometry/geometry_manager.py +11 -13
  8. water_column_sonar_processing/geometry/line_simplification.py +10 -10
  9. water_column_sonar_processing/geometry/pmtile_generation.py +8 -3
  10. water_column_sonar_processing/geometry/spatiotemporal.py +106 -0
  11. water_column_sonar_processing/index/index_manager.py +43 -46
  12. water_column_sonar_processing/model/zarr_manager.py +533 -514
  13. water_column_sonar_processing/processing/raw_to_zarr.py +45 -139
  14. water_column_sonar_processing/utility/cleaner.py +2 -1
  15. water_column_sonar_processing/utility/constants.py +29 -29
  16. water_column_sonar_processing-26.1.14.dist-info/METADATA +240 -0
  17. {water_column_sonar_processing-25.11.1.dist-info → water_column_sonar_processing-26.1.14.dist-info}/RECORD +20 -20
  18. water_column_sonar_processing/process.py +0 -149
  19. water_column_sonar_processing-25.11.1.dist-info/METADATA +0 -182
  20. {water_column_sonar_processing-25.11.1.dist-info → water_column_sonar_processing-26.1.14.dist-info}/WHEEL +0 -0
  21. {water_column_sonar_processing-25.11.1.dist-info → water_column_sonar_processing-26.1.14.dist-info}/licenses/LICENSE +0 -0
  22. {water_column_sonar_processing-25.11.1.dist-info → water_column_sonar_processing-26.1.14.dist-info}/top_level.txt +0 -0
water_column_sonar_processing/processing/raw_to_zarr.py
@@ -2,22 +2,23 @@ import gc
 import os
 from datetime import datetime
 from pathlib import Path
+from typing import Optional
 
 import echopype as ep
 import numpy as np
 from zarr.codecs import Blosc
 
 from water_column_sonar_processing.aws import DynamoDBManager, S3Manager
-from water_column_sonar_processing.geometry import GeometryManager
 from water_column_sonar_processing.utility import Cleaner
-
+from water_column_sonar_processing.utility import Constants
 
 # from numcodecs import Blosc
+level_1 = str(Constants.LEVEL_1.value)
 
 
 def get_water_level(ds):
     """
-    needs to be mocked up so thats why this is broken out
+    needs to be mocked up so that's why this is broken out
     """
     if "water_level" in ds.keys():
         return ds.water_level.values
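The import block swaps the GeometryManager dependency for the Constants enum, which now supplies the bucket-path prefix. A minimal sketch of what the new module-level level_1 binding evaluates to, assuming only the Constants enum shown in the constants.py hunks below (the ship/cruise/sensor segment is the example path quoted in the diff itself):

    from enum import Enum, unique

    @unique
    class Constants(Enum):
        LEVEL_1 = "level_1"  # from bucket path

    level_1 = str(Constants.LEVEL_1.value)
    # Prefixes are then built exactly as in the diff:
    output_zarr_prefix = f"{level_1}/Henry_B._Bigelow/HB0706/EK60/"
    print(output_zarr_prefix)  # level_1/Henry_B._Bigelow/HB0706/EK60/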
@@ -46,8 +47,8 @@ class RawToZarr:
 
     ############################################################################
     ############################################################################
+    @staticmethod
     def __zarr_info_to_table(
-        self,
         table_name,
         ship_name,
         cruise_name,
@@ -118,10 +119,11 @@ class RawToZarr:
     ############################################################################
     ############################################################################
     ############################################################################
+    @staticmethod
     def __upload_files_to_output_bucket(
-        self,
         output_bucket_name: str,
-        local_directory: str,  # e.g. 'D20070724-T042400.zarr'  # TODO: problem: if this is not in the current directory
+        local_directory: str,
+        # e.g. 'D20070724-T042400.zarr'  # TODO: problem: if this is not in the current directory
         object_prefix: str,  # e.g. "level_1/Henry_B._Bigelow/HB0706/EK60/"
         endpoint_url,
     ):
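Both private helpers drop an unused self parameter and become static methods. A tiny sketch of the pattern (the helper name and body here are hypothetical; the real signatures are in the two hunks above):

    class RawToZarr:
        @staticmethod
        def __helper(table_name: str) -> str:
            # No instance state is touched, so no `self` is needed.
            return table_name.upper()

        def run(self) -> str:
            # Name-mangled private statics remain callable through the instance.
            return self.__helper("echofish")

    print(RawToZarr().run())  # ECHOFISH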
@@ -157,7 +159,7 @@ class RawToZarr:
         cruise_name,
         sensor_name,
         raw_file_name,
-        endpoint_url=None,
+        endpoint_url: Optional[str] = None,
         include_bot=True,
     ):
         """
@@ -165,7 +167,7 @@ class RawToZarr:
         to the nodd bucket.
         """
         print(f"Opening raw: {raw_file_name} and creating zarr store.")
-        geometry_manager = GeometryManager()
+        # geometry_manager = GeometryManager()
         cleaner = Cleaner()
         cleaner.delete_local_files(
             file_types=["*.zarr", "*.json"]
@@ -193,70 +195,61 @@ class RawToZarr:
         try:
             gc.collect()
             print("Opening raw file with echopype.")
-            # s3_file_path = f"s3://{bucket_name}/data/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}"
-            # s3_file_path = Path(f"s3://noaa-wcsd-pds/data/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}")
             echodata = ep.open_raw(
                 raw_file=raw_file_name,
                 sonar_model=sensor_name,
                 include_bot=include_bot,
-                # include_idx=?
-                # use_swap=True,
-                # max_chunk_size=300,
-                # storage_options={'anon': True }  # 'endpoint_url': self.endpoint_url}  # this was creating problems
             )
             print("Compute volume backscattering strength (Sv) from raw dataset.")
             ds_sv = ep.calibrate.compute_Sv(echodata)
-            ds_sv = ep.consolidate.add_depth(
-                ds_sv, echodata
-            )  # TODO: consolidate with other depth values
-
+            ds_sv = ep.consolidate.add_depth(ds_sv, echodata)
             water_level = get_water_level(ds_sv)
 
             gc.collect()
             print("Done computing volume backscatter strength (Sv) from raw dataset.")
             # Note: detected_seafloor_depth is located at echodata.vendor.detected_seafloor_depth
-            # but is not written out with ds_sv
+            # but is not written out with ds_sv --> add to ds_sv
             if "detected_seafloor_depth" in list(echodata.vendor.variables):
                 ds_sv["detected_seafloor_depth"] = (
                     echodata.vendor.detected_seafloor_depth
                 )
             #
             frequencies = echodata.environment.frequency_nominal.values
+            if len(frequencies) != len(set(frequencies)):
+                raise Exception("Problem number of frequencies does not match channels")
             #################################################################
-            # Get GPS coordinates
-            gps_data, lat, lon = geometry_manager.read_echodata_gps_data(
-                echodata=echodata,
-                output_bucket_name=output_bucket_name,
-                ship_name=ship_name,
-                cruise_name=cruise_name,
-                sensor_name=sensor_name,
-                file_name=raw_file_name,
-                endpoint_url=endpoint_url,
-                write_geojson=True,
-            )
+            # add gps data
             ds_sv = ep.consolidate.add_location(ds_sv, echodata)
-            ds_sv.latitude.values = (
-                lat  # overwriting echopype gps values to include missing values
-            )
-            ds_sv.longitude.values = lon
-            # gps_data, lat, lon = self.__get_gps_data(echodata=echodata)
+
+            if np.any(ds_sv.latitude.values > 90.0) or np.any(
+                ds_sv.latitude.values < -90.0
+            ):
+                ds_sv.latitude.values[np.where(ds_sv.latitude.values > 90.0)] = np.nan
+                ds_sv.latitude.values[np.where(ds_sv.latitude.values < -90.0)] = np.nan
+
+            if np.any(ds_sv.longitude.values > 180.0) or np.any(
+                ds_sv.longitude.values < -180.0
+            ):
+                ds_sv.longitude.values[np.where(ds_sv.longitude.values > 180.0)] = (
+                    np.nan
+                )
+                ds_sv.longitude.values[np.where(ds_sv.longitude.values < -180.0)] = (
+                    np.nan
+                )
+
             #################################################################
-            # Technically the min_echo_range would be 0 m.
-            # TODO: this var name is supposed to represent minimum resolution of depth measurements
-            # TODO revert this so that smaller diffs can be used
-            # The most minimum the resolution can be is as small as 0.25 meters
             min_echo_range = np.round(np.nanmin(np.diff(ds_sv.echo_range.values)), 2)
-            # For the HB0710 cruise the depths vary from 499.7215 @19cm to 2999.4805 @ 1cm. Moving that back
-            # inline with the
-            min_echo_range = np.max(
-                [0.20, min_echo_range]
-            )  # TODO: experiment with 0.25 and 0.50
-
             max_echo_range = float(np.nanmax(ds_sv.echo_range))
 
             # This is the number of missing values found throughout the lat/lon
-            num_ping_time_dropna = lat[~np.isnan(lat)].shape[0]  # symmetric to lon
-            #
+            lat = ds_sv.latitude.values
+            lon = ds_sv.longitude.values
+            num_ping_time_drop_na = np.min(
+                [  # Isn't always symmetric
+                    lat[~np.isnan(lat)].shape[0],
+                    lon[~np.isnan(lon)].shape[0],
+                ]
+            )
             start_time = (
                 np.datetime_as_string(ds_sv.ping_time.values[0], unit="ms") + "Z"
             )
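The rewritten block stops overwriting echopype's GPS arrays with GeometryManager output; instead it masks impossible fixes to NaN in place and counts valid pings from both coordinates. A self-contained sketch of the same logic on a toy dataset (the values are made up; variable names follow the diff):

    import numpy as np
    import xarray as xr

    ds_sv = xr.Dataset(
        {
            "latitude": ("ping_time", np.array([41.2, 99.0, np.nan, 40.9])),
            "longitude": ("ping_time", np.array([-71.0, -71.1, -200.0, -71.2])),
        }
    )

    # Out-of-range fixes become NaN rather than being dropped.
    lat = ds_sv.latitude.values
    lon = ds_sv.longitude.values
    lat[np.where((lat > 90.0) | (lat < -90.0))] = np.nan
    lon[np.where((lon > 180.0) | (lon < -180.0))] = np.nan

    # NaNs "aren't always symmetric" between the two arrays, hence the min().
    num_ping_time_drop_na = np.min(
        [
            lat[~np.isnan(lat)].shape[0],  # 2 valid latitudes
            lon[~np.isnan(lon)].shape[0],  # 3 valid longitudes
        ]
    )
    print(num_ping_time_drop_na)  # 2

    # The new duplicate-frequency guard from the same hunk (toy frequencies):
    frequencies = np.array([18000.0, 38000.0, 120000.0])
    assert len(frequencies) == len(set(frequencies))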
@@ -268,23 +261,21 @@ class RawToZarr:
             #################################################################
             # Create the zarr store
             store_name = f"{Path(raw_file_name).stem}.zarr"
-            # Sv = ds_sv.Sv
-            # ds_sv['Sv'] = Sv.astype('int32', copy=False)
             ds_sv.to_zarr(
                 store=store_name,
                 zarr_format=3,
                 consolidated=False,
                 write_empty_chunks=False,
-            )  # ds_sv.Sv.sel(channel=ds_sv.channel.values[0]).shape
+            )
             gc.collect()
             #################################################################
-            output_zarr_prefix = f"level_1/{ship_name}/{cruise_name}/{sensor_name}/"
+            output_zarr_prefix = f"{level_1}/{ship_name}/{cruise_name}/{sensor_name}/"
             #################################################################
             # If zarr store already exists then delete
             s3_manager = S3Manager(endpoint_url=endpoint_url)
             child_objects = s3_manager.get_child_objects(
                 bucket_name=output_bucket_name,
-                sub_prefix=f"level_1/{ship_name}/{cruise_name}/{sensor_name}/{Path(raw_file_name).stem}.zarr",
+                sub_prefix=f"{level_1}/{ship_name}/{cruise_name}/{sensor_name}/{Path(raw_file_name).stem}.zarr",
             )
             if len(child_objects) > 0:
                 print(
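The write itself only loses stale comments; the S3 prefixes now interpolate the level_1 constant instead of a hardcoded string. A sketch of the same to_zarr call on a toy dataset (the kwargs mirror the diff; this assumes an xarray/zarr-python combination that supports Zarr format 3, and the toy Sv variable is an assumption):

    import numpy as np
    import xarray as xr

    ds_sv = xr.Dataset(
        {"Sv": (("ping_time", "range_sample"), np.zeros((4, 8), dtype=np.float32))}
    )
    ds_sv.to_zarr(
        store="D20070724-T042400.zarr",  # f"{Path(raw_file_name).stem}.zarr" in the diff
        zarr_format=3,
        consolidated=False,
        write_empty_chunks=False,
    )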
@@ -310,7 +301,7 @@ class RawToZarr:
                 file_name=raw_file_name,
                 min_echo_range=min_echo_range,
                 max_echo_range=max_echo_range,
-                num_ping_time_dropna=num_ping_time_dropna,
+                num_ping_time_dropna=num_ping_time_drop_na,
                 start_time=start_time,
                 end_time=end_time,
                 frequencies=frequencies,
@@ -320,7 +311,6 @@ class RawToZarr:
             #######################################################################
             # TODO: verify count of objects matches, publish message, update status
             #######################################################################
-            print("Finished raw-to-zarr conversion.")
         except Exception as err:
             print(
                 f"Exception encountered creating local Zarr store with echopype: {err}"
@@ -328,96 +318,12 @@ class RawToZarr:
             raise RuntimeError(f"Problem creating local Zarr store, {err}")
         finally:
             gc.collect()
-            print("Finally.")
             cleaner.delete_local_files(
                 file_types=["*.raw", "*.bot", "*.zarr", "*.json"]
             )
-            print("Done creating local zarr store.")
+            print("Finished raw-to-zarr conversion.")
 
     ############################################################################
-    # TODO: does this get called?
-    # def execute(self, input_message):
-    #     ship_name = input_message['shipName']
-    #     cruise_name = input_message['cruiseName']
-    #     sensor_name = input_message['sensorName']
-    #     input_file_name = input_message['fileName']
-    #     #
-    #     try:
-    #         self.__update_processing_status(
-    #             file_name=input_file_name,
-    #             cruise_name=cruise_name,
-    #             pipeline_status="PROCESSING_RAW_TO_ZARR"
-    #         )
-    #         #######################################################################
-    #         store_name = f"{os.path.splitext(input_file_name)[0]}.zarr"
-    #         output_zarr_prefix = f"level_1/{ship_name}/{cruise_name}/{sensor_name}"
-    #         bucket_key = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/{input_file_name}"
-    #         zarr_prefix = os.path.join("level_1", ship_name, cruise_name, sensor_name)
-    #         #
-    #         os.chdir(TEMPDIR)  # Lambdas require use of temp directory
-    #         #######################################################################
-    #         #######################################################################
-    #         # Check if zarr store already exists
-    #         s3_objects = self.__s3.list_objects(
-    #             bucket_name=self.__output_bucket,
-    #             prefix=f"{zarr_prefix}/{os.path.splitext(input_file_name)[0]}.zarr/",
-    #             access_key_id=self.__output_bucket_access_key,
-    #             secret_access_key=self.__output_bucket_secret_access_key
-    #         )
-    #         if len(s3_objects) > 0:
-    #             print('Zarr store dataset already exists in s3, deleting existing and continuing.')
-    #             self.__s3.delete_objects(
-    #                 bucket_name=self.__output_bucket,
-    #                 objects=s3_objects,
-    #                 access_key_id=self.__output_bucket_access_key,
-    #                 secret_access_key=self.__output_bucket_secret_access_key
-    #             )
-    #         #######################################################################
-    #         # self.__delete_all_local_raw_and_zarr_files()
-    #         Cleaner.delete_local_files(file_types=["*.raw*", "*.zarr"])
-    #         self.__s3.download_file(
-    #             bucket_name=self.__input_bucket,
-    #             key=bucket_key,
-    #             file_name=input_file_name
-    #         )
-    #         self.__create_local_zarr_store(
-    #             raw_file_name=input_file_name,
-    #             cruise_name=cruise_name,
-    #             sensor_name=sensor_name,
-    #             output_zarr_prefix=output_zarr_prefix,
-    #             store_name=store_name
-    #         )
-    #         #######################################################################
-    #         self.__upload_files_to_output_bucket(store_name, output_zarr_prefix)
-    #         #######################################################################
-    #         # # TODO: verify count of objects matches
-    #         # s3_objects = self.__s3.list_objects(
-    #         #     bucket_name=self.__output_bucket,
-    #         #     prefix=f"{zarr_prefix}/{os.path.splitext(input_file_name)[0]}.zarr/",
-    #         #     access_key_id=self.__output_bucket_access_key,
-    #         #     secret_access_key=self.__output_bucket_secret_access_key
-    #         # )
-    #         #######################################################################
-    #         self.__update_processing_status(
-    #             file_name=input_file_name,
-    #             cruise_name=cruise_name,
-    #             pipeline_status='SUCCESS_RAW_TO_ZARR'
-    #         )
-    #         #######################################################################
-    #         self.__publish_done_message(input_message)
-    #         #######################################################################
-    #         # except Exception as err:
-    #         #     print(f'Exception encountered: {err}')
-    #         #     self.__update_processing_status(
-    #         #         file_name=input_file_name,
-    #         #         cruise_name=cruise_name,
-    #         #         pipeline_status='FAILURE_RAW_TO_ZARR',
-    #         #         error_message=str(err),
-    #         #     )
-    #     finally:
-    #         self.__delete_all_local_raw_and_zarr_files()
-    #######################################################################
-
     ############################################################################
 
 
water_column_sonar_processing/utility/cleaner.py
@@ -5,7 +5,8 @@ import shutil
 
 ###########################################################
 class Cleaner:
-    def delete_local_files(self, file_types=["*.raw*", "*.model"]):  # '*.json'
+    @staticmethod
+    def delete_local_files(file_types=["*.raw*", "*.model"]):  # '*.json'
         # TODO: add .zarr to this
         print("Deleting all local raw and model files")
         for i in file_types:
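Here too the method body is unchanged; only the decorator and signature change. For orientation, a sketch of what a glob-based cleaner like this typically does (an assumed implementation; only the signature comes from the diff, and shutil is imported per the hunk header):

    import glob
    import os
    import shutil

    class Cleaner:
        @staticmethod
        def delete_local_files(file_types=("*.raw*", "*.model")):
            for pattern in file_types:
                for match in glob.glob(pattern):
                    if os.path.isdir(match):
                        shutil.rmtree(match)  # e.g. .zarr stores are directories
                    else:
                        os.remove(match)

    Cleaner.delete_local_files(file_types=("*.zarr", "*.json"))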
water_column_sonar_processing/utility/constants.py
@@ -1,5 +1,7 @@
 from enum import Enum, unique
 
+import numpy as np
+
 
 @unique
 class Instruments(Enum):
@@ -21,15 +23,12 @@ class Constants(Enum):
     # NOTE: larger value here will speed up the TurfJS download of dataset in the UI
     # Problem interpolating the dataset: cannot reshape array of size 65536 into shape...
     # TODO: needs to be enum
-    SPATIOTEMPORAL_CHUNK_SIZE = int(2**16) - 1024
-    # int(2**16) - 1024,
-    # int(2**16) - 1024,
-    # e.g. int(2**14)
+    SPATIOTEMPORAL_CHUNK_SIZE = int(1e6)  # int(2 ** 16) - 1024
     # TODO: create test for SPATIOTEMPORAL_CHUNK_SIZE with requirement!
 
     LEVEL_0 = "raw"
     LEVEL_1 = "level_1"  # from bucket path
-    LEVEL_2 = "level_2"
+    LEVEL_2 = "level_2a"  # updating zarr store path for zarr v3
     LEVEL_3 = "level_3"
 
     EK60 = "EK60"  # TODO: use for "instrument"
@@ -39,11 +38,10 @@ class Constants(Enum):
 
 class Coordinates(Enum):
     """
-    Should try to specify
-    dtype
-    units
-    long_name most readable description of variable
-    standard_name — name in lowercase and snake_case
+    dtype: data type
+    units: netcdf defined units
+    long_name: most readable description of variable
+    standard_name: name in lowercase and snake_case
     """
 
     PROJECT_NAME = "echofish"
@@ -54,65 +52,67 @@ class Coordinates(Enum):
     DEPTH_LONG_NAME = "Depth below surface"
     DEPTH_STANDARD_NAME = "depth"
 
+    # https://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#table-supported-units
     TIME = "time"
-    TIME_DTYPE = "float64"
+    TIME_DTYPE = "datetime64[ns]"
     # Note: units and calendar are used downstream by Xarray
-    TIME_UNITS = "seconds since 1970-01-01 00:00:00"
+    TIME_UNITS = "nanoseconds since 1970-01-01"
     TIME_LONG_NAME = "Timestamp of each ping"
     TIME_STANDARD_NAME = "time"
     TIME_CALENDAR = "proleptic_gregorian"
     # TODO: create test for reading out timestamps in Xarray
 
     FREQUENCY = "frequency"
-    FREQUENCY_DTYPE = "uint64"
+    FREQUENCY_DTYPE = np.uint64
     FREQUENCY_UNITS = "Hz"
     FREQUENCY_LONG_NAME = "Transducer frequency"
     FREQUENCY_STANDARD_NAME = "sound_frequency"
 
     LATITUDE = "latitude"
-    LATITUDE_DTYPE = "float32"
+    LATITUDE_DTYPE = np.float32
     LATITUDE_UNITS = "degrees_north"
     LATITUDE_LONG_NAME = "Latitude"
     LATITUDE_STANDARD_NAME = "latitude"
 
     LONGITUDE = "longitude"
-    LONGITUDE_DTYPE = "float32"
+    LONGITUDE_DTYPE = np.float32
     LONGITUDE_UNITS = "degrees_east"
     LONGITUDE_LONG_NAME = "Longitude"
     LONGITUDE_STANDARD_NAME = "longitude"
 
     BOTTOM = "bottom"
-    BOTTOM_DTYPE = "float32"
+    BOTTOM_DTYPE = np.float32
     BOTTOM_UNITS = "m"
     BOTTOM_LONG_NAME = "Detected sea floor depth"
     BOTTOM_STANDARD_NAME = "bottom"
 
     SPEED = "speed"
-    SPEED_DTYPE = "float32"
+    SPEED_DTYPE = np.float32
     SPEED_UNITS = "Knots"
     SPEED_LONG_NAME = "Nautical miles per hour"
     SPEED_STANDARD_NAME = "speed"
 
-    # This is the width of each slice of the water columns
+    # This is the width of each 'pixel' of the water columns
     DISTANCE = "distance"
-    DISTANCE_DTYPE = "float32"
+    DISTANCE_DTYPE = np.float32
     DISTANCE_UNITS = "m"
     DISTANCE_LONG_NAME = "GPS distance"
     DISTANCE_STANDARD_NAME = "distance"
 
     SV = "Sv"
-    SV_DTYPE = "float32"  # int64
+    SV_DTYPE = np.float32
     SV_UNITS = "dB"
     SV_LONG_NAME = "Volume backscattering strength (Sv re 1 m-1)"
     SV_STANDARD_NAME = "volume_backscattering_strength"
 
 
-class BatchShape(Enum):
-    """
-    The tensor shape of a machine learning sample.
-    """
-
-    DEPTH = 2
-    TIME = 3
-    FREQUENCY = 4
-    BATCH_SIZE = 5
+# TODO: delete this
+# class BatchShape(Enum):
+#     """
+#     The tensor shape of a machine learning sample.
+#     """
+#
+#     DEPTH = 2
+#     TIME = 3
+#     FREQUENCY = 4
+#     BATCH_SIZE = 5
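The dtype entries move from strings to NumPy types, and time switches from float seconds to nanosecond datetime64, which is what Xarray's units/calendar decoding expects. A sketch of how these enum fields could map onto CF-style coordinate attributes (the wiring is assumed; only the attribute values come from the enum above):

    import numpy as np
    import xarray as xr

    latitude = xr.DataArray(
        np.array([41.2, 40.9], dtype=np.float32),  # LATITUDE_DTYPE
        dims="ping_time",
        attrs={
            "units": "degrees_north",     # LATITUDE_UNITS
            "long_name": "Latitude",      # LATITUDE_LONG_NAME
            "standard_name": "latitude",  # LATITUDE_STANDARD_NAME
        },
    )

    ping_time = xr.DataArray(
        np.array(["2007-07-24T04:24:00"], dtype="datetime64[ns]"),  # TIME_DTYPE
        dims="ping_time",
    )
    # On write, Xarray encodes datetime64[ns] values with units and calendar,
    # e.g. "nanoseconds since 1970-01-01" and "proleptic_gregorian" per the enum.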