water-column-sonar-processing 0.0.13__py3-none-any.whl → 24.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of water-column-sonar-processing has been flagged as potentially problematic.

@@ -16,6 +16,7 @@ class S3FSManager:
         # self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
         self.s3_region = os.environ.get("AWS_REGION", default="us-east-1")
         self.s3fs = s3fs.S3FileSystem(
+            asynchronous=False,
             endpoint_url=endpoint_url,
             key=os.environ.get("OUTPUT_BUCKET_ACCESS_KEY"),
             secret=os.environ.get("OUTPUT_BUCKET_SECRET_ACCESS_KEY"),
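For context, a minimal sketch of the synchronous filesystem this constructor now configures explicitly (`asynchronous=False` is already the `s3fs` default, so the change only makes the blocking mode explicit; the bucket and prefix below are placeholders, not values from the package):

```python
import os

import s3fs

# Blocking filesystem handle, mirroring the constructor above.
fs = s3fs.S3FileSystem(
    asynchronous=False,  # use the blocking API (fs.ls, fs.get, ...)
    key=os.environ.get("OUTPUT_BUCKET_ACCESS_KEY"),
    secret=os.environ.get("OUTPUT_BUCKET_SECRET_ACCESS_KEY"),
)

# List objects under a placeholder bucket/prefix.
print(fs.ls("my-example-bucket/some/prefix"))
```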
@@ -1,4 +1,5 @@
 import os
+import tempfile
 
 import numcodecs
 import numpy as np
@@ -11,7 +12,6 @@ from water_column_sonar_processing.utility import Cleaner
 numcodecs.blosc.use_threads = False
 numcodecs.blosc.set_nthreads(1)
 
-# TEMPDIR = "/tmp"
 # TODO: when ready switch to version 3 of model spec
 # ZARR_V3_EXPERIMENTAL_API = 1
 # creates the latlon data: foo = ep.consolidate.add_location(ds_Sv, echodata)
@@ -61,7 +61,6 @@ class CreateEmptyZarrStore:
     # TODO: move to common place
 
     #######################################################
-    # @classmethod
     def create_cruise_level_zarr_store(
         self,
         output_bucket_name: str,
@@ -69,8 +68,8 @@ class CreateEmptyZarrStore:
         cruise_name: str,
         sensor_name: str,
         table_name: str,
-        tempdir: str,
     ) -> None:
+        tempdir = tempfile.TemporaryDirectory()
         try:
             # HB0806 - 123, HB0903 - 220
             dynamo_db_manager = DynamoDBManager()
@@ -146,7 +145,7 @@ class CreateEmptyZarrStore:
             print(f"new_height: {new_height}")
 
             zarr_manager.create_zarr_store(
-                path=tempdir,
+                path=tempdir.name,  # TODO: need to use .name or problem
                 ship_name=ship_name,
                 cruise_name=cruise_name,
                 sensor_name=sensor_name,
@@ -159,7 +158,7 @@ class CreateEmptyZarrStore:
             #################################################################
             self.upload_zarr_store_to_s3(
                 output_bucket_name=output_bucket_name,
-                local_directory=tempdir,
+                local_directory=tempdir.name,  # TODO: need to use .name or problem
                 object_prefix=zarr_prefix,
                 cruise_name=cruise_name,
            )
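The `tempdir.name` TODOs above flag a real gotcha: a `TemporaryDirectory` object is not a path string, and the directory is deleted as soon as the object is garbage collected. A minimal sketch of the safer context-manager form from the standard library (the function name and body here are illustrative, not from the package):

```python
import tempfile


def create_store_safely() -> None:
    # The context manager yields the path string directly and
    # guarantees the directory survives for the whole block.
    with tempfile.TemporaryDirectory() as tempdir_path:
        print(f"writing zarr store under {tempdir_path}")
        # ... create the store and upload it to S3 here, while the
        # directory is still guaranteed to exist ...
    # The directory and its contents are removed at this point.
```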
@@ -0,0 +1,24 @@
+### https://xarray-datatree.readthedocs.io/en/latest/data-structures.html
+import numpy as np
+from datatree import DataTree
+import xarray as xr
+
+class DatatreeManager:
+    #######################################################
+    def __init__(
+        self,
+    ):
+        self.dtype = "float32"
+
+    #################################################################
+    def create_datatree(
+        self,
+        input_ds,
+    ) -> None:
+        ds1 = xr.Dataset({"foo": "orange"})
+        dt = DataTree(name="root", data=ds1)  # create root node
+        ds2 = xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])})
+        return dt
+
+
+
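Note that in `create_datatree` above, `ds2` is built but never attached, and the `-> None` annotation disagrees with the `return dt`. A small self-contained sketch, assuming the archived `xarray-datatree` API that the module imports (where `DataTree` accepts `name`, `parent`, and `data`):

```python
import xarray as xr
from datatree import DataTree  # archived xarray-datatree; newer xarray ships xr.DataTree

ds1 = xr.Dataset({"foo": "orange"})
ds2 = xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])})

root = DataTree(name="root", data=ds1)
# Attach ds2 as a child node so it is actually part of the tree.
DataTree(name="child", parent=root, data=ds2)

print(root)  # prints the root node and its "child" subtree
```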
@@ -281,12 +281,7 @@ class ResampleRegrid:
             print(f"start_ping_time_index: {start_ping_time_index}, end_ping_time_index: {end_ping_time_index}")
             #########################################################################
             # write Sv values to cruise-level-model-store
-            for channel in range(
-                len(input_xr.channel.values)
-            ):  # does not like being written in one fell swoop :(
-                output_zarr_store.Sv[
-                    :, start_ping_time_index:end_ping_time_index, channel
-                ] = regrid_resample[:, :, channel]
+            output_zarr_store.Sv[:, start_ping_time_index:end_ping_time_index, :] = regrid_resample.values
 
             #########################################################################
             # [5] write subset of latitude/longitude
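A minimal sketch of the difference, using plain `zarr` and `numpy` with made-up shapes: the old loop issued one region write per channel, while the new form assigns the whole slab in a single call and lets zarr update each affected chunk once.

```python
import numpy as np
import zarr

n_depth, n_time, n_freq = 100, 50, 4
sv = zarr.zeros(shape=(n_depth, 1000, n_freq), chunks=(100, 100, 1), dtype="float32")
block = np.random.rand(n_depth, n_time, n_freq).astype("float32")
start, end = 200, 250

# Old pattern: one region write per channel.
for channel in range(n_freq):
    sv[:, start:end, channel] = block[:, :, channel]

# New pattern: a single region write covering all channels.
sv[:, start:end, :] = block
```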
@@ -300,27 +295,27 @@ class ResampleRegrid:
             #########################################################################
             # TODO: add the "detected_seafloor_depth/" to the
             # L2 cruise dataarrays
-            # TODO: make bottom optional if 'detected_seafloor_depth' in input_xr.variables:
+            # TODO: make bottom optional
             # TODO: Only checking the first channel for now. Need to average across all channels
             # in the future. See https://github.com/CI-CMG/water-column-sonar-processing/issues/11
-            # detected_seafloor_depths = input_xr.detected_seafloor_depth.values[0, :]  # note can include nans?
-            detected_seafloor_depth = input_xr.detected_seafloor_depth.values
-            detected_seafloor_depth[detected_seafloor_depth == 0.] = np.nan
-            detected_seafloor_depths = np.nanmean(detected_seafloor_depth, 0)
-            detected_seafloor_depths[detected_seafloor_depths == 0.] = np.nan
-            print(f"min depth measured: {np.nanmin(detected_seafloor_depths)}")
-            print(f"max depth measured: {np.nanmax(detected_seafloor_depths)}")
-            # available_indices = np.argwhere(np.isnan(geospatial['latitude'].values))
-            output_zarr_store.bottom[
-                start_ping_time_index:end_ping_time_index
-            ] = detected_seafloor_depths
+            if 'detected_seafloor_depth' in input_xr.variables:
+                print('Found detected_seafloor_depth, adding data to output store.')
+                detected_seafloor_depth = input_xr.detected_seafloor_depth.values
+                detected_seafloor_depth[detected_seafloor_depth == 0.] = np.nan
+                # TODO: problem here: Processing file: D20070711-T210709.
+                detected_seafloor_depths = np.nanmean(detected_seafloor_depth, 0)  # RuntimeWarning: Mean of empty slice
+                detected_seafloor_depths[detected_seafloor_depths == 0.] = np.nan
+                print(f"min depth measured: {np.nanmin(detected_seafloor_depths)}")
+                print(f"max depth measured: {np.nanmax(detected_seafloor_depths)}")
+                # available_indices = np.argwhere(np.isnan(geospatial['latitude'].values))
+                output_zarr_store.bottom[
+                    start_ping_time_index:end_ping_time_index
+                ] = detected_seafloor_depths
             #########################################################################
             #########################################################################
         except Exception as err:
             print(f"Problem interpolating the data: {err}")
             raise err
-        # else:
-        #     pass
         finally:
             print("Done interpolating data.")
             # TODO: read across times and verify data was written?
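The `Mean of empty slice` warning noted in the hunk above fires whenever an entire column is NaN; `np.nanmean` still returns NaN for that column, which is the desired value here. A small sketch of making that case explicit and silencing the warning (plain numpy, illustrative data):

```python
import warnings

import numpy as np

depths = np.array([[np.nan, 3.0], [np.nan, 5.0]])  # first column is all NaN

with warnings.catch_warnings():
    # All-NaN columns trigger "RuntimeWarning: Mean of empty slice"
    # and yield NaN, which is what we want to store.
    warnings.simplefilter("ignore", category=RuntimeWarning)
    column_means = np.nanmean(depths, axis=0)

print(column_means)  # [nan  4.]
```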
@@ -1,5 +1,6 @@
+from .elevation_manager import ElevationManager
 from .geometry_manager import GeometryManager
 from .geometry_simplification import GeometrySimplification
 from .pmtile_generation import PMTileGeneration
 
-__all__ = ["GeometryManager", "GeometrySimplification", "PMTileGeneration"]
+__all__ = ["ElevationManager", "GeometryManager", "GeometrySimplification", "PMTileGeneration"]
@@ -0,0 +1,112 @@
+"""
+https://gis.ngdc.noaa.gov/arcgis/rest/services/DEM_mosaics/DEM_global_mosaic/ImageServer/identify?geometry=-31.70235%2C13.03332&geometryType=esriGeometryPoint&returnGeometry=false&returnCatalogItems=false&f=json
+
+https://gis.ngdc.noaa.gov/arcgis/rest/services/DEM_mosaics/DEM_global_mosaic/ImageServer/
+identify?
+geometry=-31.70235%2C13.03332
+&geometryType=esriGeometryPoint
+&returnGeometry=false
+&returnCatalogItems=false
+&f=json
+{"objectId":0,"name":"Pixel","value":"-5733","location":{"x":-31.702349999999999,"y":13.03332,"spatialReference":{"wkid":4326,"latestWkid":4326}},"properties":null,"catalogItems":null,"catalogItemVisibilities":[]}
+-5733
+
+(base) rudy:deleteME rudy$ curl https://api.opentopodata.org/v1/gebco2020?locations=13.03332,-31.70235
+{
+  "results": [
+    {
+      "dataset": "gebco2020",
+      "elevation": -5729.0,
+      "location": {
+        "lat": 13.03332,
+        "lng": -31.70235
+      }
+    }
+  ],
+  "status": "OK"
+}
+"""
+import json
+import time
+
+import requests
+from collections.abc import Generator
+
+def chunked(
+    ll: list,
+    n: int
+) -> Generator:
+    # Yields successively n-sized chunks from ll.
+    for i in range(0, len(ll), n):
+        yield ll[i : i + n]
+
+
+class ElevationManager:
+    #######################################################
+    def __init__(
+        self,
+    ):
+        self.DECIMAL_PRECISION = 5  # precision for GPS coordinates
+        self.TIMOUT_SECONDS = 10
+
+    #######################################################
+    def get_arcgis_elevation(
+        self,
+        lngs: list,
+        lats: list,
+        chunk_size: int = 500,  # I think this is the api limit
+    ) -> int:
+        # Reference: https://developers.arcgis.com/rest/services-reference/enterprise/map-to-image/
+        # Info: https://www.arcgis.com/home/item.html?id=c876e3c96a8642ab8557646a3b4fa0ff
+        ### 'https://gis.ngdc.noaa.gov/arcgis/rest/services/DEM_mosaics/DEM_global_mosaic/ImageServer/identify?geometry={"points":[[-31.70235,13.03332],[-32.70235,14.03332]]}&geometryType=esriGeometryMultipoint&returnGeometry=false&returnCatalogItems=false&f=json'
+        if len(lngs) != len(lats):
+            raise ValueError("lngs and lats must have same length")
+
+        geometryType = "esriGeometryMultipoint"  # TODO: allow single point?
+
+        depths = []
+
+        list_of_points = [list(elem) for elem in list(zip(lngs, lats))]
+        for chunk in chunked(list_of_points, chunk_size):
+            time.sleep(0.1)
+            # order: (lng, lat)
+            geometry = f'{{"points":{str(chunk)}}}'
+            url = f'https://gis.ngdc.noaa.gov/arcgis/rest/services/DEM_mosaics/DEM_global_mosaic/ImageServer/identify?geometry={geometry}&geometryType={geometryType}&returnGeometry=false&returnCatalogItems=false&f=json'
+            result = requests.get(url, timeout=self.TIMOUT_SECONDS)
+            res = json.loads(result.content.decode('utf8'))
+            if 'results' in res:
+                for element in res['results']:
+                    depths.append(float(element['value']))
+            elif 'value' in res:
+                depths.append(float(res['value']))
+
+        return depths
+
+    # def get_gebco_bathymetry_elevation(self) -> int:
+    #     # Documentation: https://www.opentopodata.org/datasets/gebco2020/
+    #     latitude = 13.03332
+    #     longitude = -31.70235
+    #     dataset = "gebco2020"
+    #     url = f"https://api.opentopodata.org/v1/{dataset}?locations={latitude},{longitude}"
+    #     pass
+
+    # def get_elevation(
+    #     self,
+    #     df,
+    #     lat_column,
+    #     lon_column,
+    # ) -> int:
+    #     """Query service using lat, lon. add the elevation values as a new column."""
+    #     url = r'https://epqs.nationalmap.gov/v1/json?'
+    #     elevations = []
+    #     for lat, lon in zip(df[lat_column], df[lon_column]):
+    #         # define rest query params
+    #         params = {
+    #             'output': 'json',
+    #             'x': lon,
+    #             'y': lat,
+    #             'units': 'Meters'
+    #         }
+    #         result = requests.get((url + urllib.parse.urlencode(params)))
+    #         elevations.append(result.json()['value'])
+    #     return elevations
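A hedged usage sketch for the new manager; the import path follows the `geometry/__init__.py` change above, the coordinates come from the module docstring, and actual output depends on the live NOAA service:

```python
from water_column_sonar_processing.geometry import ElevationManager

elevation_manager = ElevationManager()
depths = elevation_manager.get_arcgis_elevation(
    lngs=[-31.70235, -32.70235],
    lats=[13.03332, 14.03332],
    chunk_size=500,  # requests are batched to stay under the API limit
)
print(depths)  # e.g. [-5733.0, ...], meters relative to sea level
```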
@@ -7,13 +7,20 @@ from concurrent.futures import as_completed
 from water_column_sonar_processing.aws import S3Manager
 
 
+MAX_POOL_CONNECTIONS = 64
+MAX_CONCURRENCY = 64
+MAX_WORKERS = 64
+GB = 1024**3
+
+
 class IndexManager:
+    # TODO: index into dynamodb instead of csv files
 
     def __init__(self, input_bucket_name, calibration_bucket, calibration_key):
         self.input_bucket_name = input_bucket_name
         self.calibration_bucket = calibration_bucket
         self.calibration_key = calibration_key
-        self.s3_manager = S3Manager()
+        self.s3_manager = S3Manager()  # TODO: make anonymous?
 
     #################################################################
     def list_ships(
@@ -50,6 +57,9 @@ class IndexManager:
         self,
         cruise_prefixes,
     ):
+        """
+        This returns a list of ek60 prefixed cruises.
+        """
         cruise_sensors = []  # includes all sensor types
         for cruise_prefix in cruise_prefixes:
             page_iterator = self.s3_manager.paginator.paginate(
@@ -67,9 +77,12 @@ class IndexManager:
         cruise_name,
         sensor_name,
     ):
+        # Gets all raw files for a cruise under the given prefix
         prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"  # Note no forward slash at beginning
         page_iterator = self.s3_manager.paginator.paginate(
-            Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
+            Bucket=self.input_bucket_name,
+            Prefix=prefix,
+            Delimiter="/"
         )
         all_files = []
         for page in page_iterator:
@@ -77,6 +90,57 @@ class IndexManager:
                 all_files.extend([i["Key"] for i in page["Contents"]])
         return [i for i in all_files if i.endswith(".raw")]
 
+    def get_first_raw_file(
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+    ):
+        # Same as above but only needs to get the first raw file
+        # because we are only interested in the first datagram of one file
+        prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"  # Note no forward slash at beginning
+        # page_iterator = self.s3_manager.paginator.paginate(
+        #     Bucket=self.input_bucket_name,
+        #     Prefix=prefix,
+        #     Delimiter="/",
+        #     PaginationConfig={'MaxItems': 5}
+        # )  # TODO: this can create a problem if there is a non raw file returned first
+        ### filter with JMESPath expressions ###
+        page_iterator = self.s3_manager.paginator.paginate(
+            Bucket=self.input_bucket_name,
+            Prefix=prefix,
+            Delimiter="/",
+        )
+        # page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
+        page_iterator = page_iterator.search(expression="Contents[?contains(Key, '.raw')] ")
+        for res in page_iterator:
+            if "Key" in res:
+                return res["Key"]
+        # else raise exception?
+
+    # DSJ0604-D20060406-T050022.bot 2kB == 2152 'Size'
+
+    def get_files_under_size(
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+    ):
+        # THIS isn't used, just playing with JMES paths spec
+        prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"
+        ### filter with JMESPath expressions ###
+        page_iterator = self.s3_manager.paginator.paginate(
+            Bucket=self.input_bucket_name,
+            Prefix=prefix,
+            Delimiter="/",
+        )
+        page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
+        all_files = []
+        for page in page_iterator:
+            if "Contents" in page.keys():
+                all_files.extend([i["Key"] for i in page["Contents"]])
+        return [i for i in all_files if i.endswith(".raw")]
+
     #################################################################
     def get_raw_files_csv(
         self,
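The JMESPath filtering used above is a documented boto3 paginator feature: `paginate(...).search(expression)` yields individual matching elements across all pages instead of whole pages. A minimal standalone sketch (bucket and prefix are placeholders):

```python
import boto3

s3_client = boto3.client("s3")
paginator = s3_client.get_paginator("list_objects_v2")
page_iterator = paginator.paginate(Bucket="my-example-bucket", Prefix="data/raw/")

# Yields one S3 object record per match, across all pages.
for obj in page_iterator.search("Contents[?contains(Key, '.raw')]"):
    print(obj["Key"], obj["Size"])
```

Note that after calling `search()` the iterator yields object records, not pages, which is why `get_first_raw_file` above checks `"Key" in res` directly.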
@@ -102,6 +166,29 @@ class IndexManager:
         df.to_csv(f"{ship_name}_{cruise_name}.csv", index=False, header=False, sep=" ")
         print("done")
 
+    def get_raw_files_list(
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+    ):
+        # gets all raw files in cruise and returns a list of dicts
+        raw_files = self.get_raw_files(
+            ship_name=ship_name,
+            cruise_name=cruise_name,
+            sensor_name=sensor_name
+        )
+        files_list = [
+            {
+                "ship_name": ship_name,
+                "cruise_name": cruise_name,
+                "sensor_name": sensor_name,
+                "file_name": os.path.basename(raw_file),
+            }
+            for raw_file in raw_files
+        ]
+        return files_list
+
     #################################################################
     def get_subset_ek60_prefix(  # TODO: is this used?
         self,
@@ -169,16 +256,14 @@ class IndexManager:
         return first_datagram
 
     #################################################################
-    def get_subset_datagrams(
+    def get_subset_datagrams(  # TODO: is this getting used
         self,
         df: pd.DataFrame
     ) -> list:
         print("getting subset of datagrams")
-        select_keys = list(
-            df[["KEY", "CRUISE"]].drop_duplicates(subset="CRUISE")["KEY"].values
-        )
+        select_keys = df[["KEY", "CRUISE"]].drop_duplicates(subset="CRUISE")["KEY"].values.tolist()
         all_datagrams = []
-        with ThreadPoolExecutor(max_workers=self.max_pool_connections) as executor:
+        with ThreadPoolExecutor(max_workers=MAX_POOL_CONNECTIONS) as executor:
             futures = [
                 executor.submit(self.scan_datagram, select_key)
                 for select_key in select_keys
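A minimal sketch of the fan-out pattern used here, with the module-level worker cap introduced above and `as_completed` from the file's imports; the worker function below is an illustrative stand-in for the real datagram scan:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

MAX_POOL_CONNECTIONS = 64  # module-level cap, as in the diff above


def scan_datagram(key: str) -> dict:
    # Placeholder for the real scan of one S3 object's first datagram.
    return {"key": key}


select_keys = ["a.raw", "b.raw", "c.raw"]
all_datagrams = []
with ThreadPoolExecutor(max_workers=MAX_POOL_CONNECTIONS) as executor:
    futures = [executor.submit(scan_datagram, key) for key in select_keys]
    for future in as_completed(futures):
        all_datagrams.append(future.result())

print(all_datagrams)
```

Switching from `self.max_pool_connections` to the module constant also removes a dependency on instance state that the earlier code never initialized in this class.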
@@ -1,4 +1,3 @@
-import os
 import numcodecs
 import numpy as np
 import xarray as xr
@@ -48,6 +47,9 @@ class ZarrManager:
             endpoint=True,
         )
 
+        if np.any(np.isnan(all_cruise_depth_values)):
+            raise Exception('Problem depth values returned were NaN.')
+
         print("Done getting depth values.")
         return all_cruise_depth_values.round(decimals=2)
 
@@ -67,10 +69,10 @@ class ZarrManager:
         print(
             f"Creating local zarr_manager store at {cruise_name}.zarr for ship {ship_name}"
         )
-
-        # There should be no repeated frequencies
-        assert len(frequencies) == len(set(frequencies))
-        # TODO: eventually switch coordinate to "channel"
+        # There can not currently be repeated frequencies
+        # TODO: eventually switch coordinate to "channel" because frequencies can repeat
+        if len(frequencies) != len(set(frequencies)):
+            raise Exception("Number of frequencies does not match number of channels")
 
         print(f"Debugging number of threads: {self.__num_threads}")
 
@@ -118,8 +120,9 @@ class ZarrManager:
             fill_value=np.nan,
             overwrite=self.__overwrite,
         )
-        # TODO: change to exception
-        assert not np.any(np.isnan(depth_values))
+
+        if np.any(np.isnan(depth_values)):
+            raise Exception('Some depth values returned were NaN.')
 
         root.depth.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.DEPTH.value]
 
@@ -219,7 +222,8 @@ class ZarrManager:
         root.create_dataset(
             name=Coordinates.SV.value,
             shape=(len(depth_values), width, len(frequencies)),
-            chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, len(frequencies)),
+            # chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, len(frequencies)),
+            chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, 1),  # 256x256x1 <- speed up for alex
             dtype=np.dtype(
                 Coordinates.SV_DTYPE.value
             ),  # TODO: try to experiment with 'float16'
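For intuition on this chunking change, a small sketch with plain `zarr` and illustrative shapes: with `(256, 256, 1)` chunks, reading a single-frequency tile touches exactly one chunk, instead of pulling every frequency along with it as the old `(TILE, TILE, n_freq)` layout did.

```python
import zarr

n_depth, n_time, n_freq = 1024, 4096, 4
sv = zarr.zeros(
    shape=(n_depth, n_time, n_freq),
    chunks=(256, 256, 1),  # one frequency per chunk, as in the diff
    dtype="float32",
)

# A 256x256 tile of one frequency maps to a single chunk on disk.
tile = sv[0:256, 0:256, 2]
print(tile.shape, sv.nchunks)  # (256, 256) and the total chunk count
```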
@@ -246,11 +250,12 @@ class ZarrManager:
         #
         root.attrs["processing_software_name"] = Coordinates.PROJECT_NAME.value
         root.attrs["processing_software_version"] = (
-            "0.0.13"  # TODO: get programmatically, echopype>utils>prov.py
+            "24.01.01"  # TODO: get programmatically, echopype>utils>prov.py
         )
         root.attrs["processing_software_time"] = Timestamp.get_timestamp()
         #
         root.attrs["calibration_status"] = calibration_status
+        root.attrs["tile_size"] = Constants.TILE_SIZE.value
 
         zarr.consolidate_metadata(store)
         #####################################################################
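Because the metadata is consolidated above, the new `tile_size` attribute can be read back without touching individual chunk files. A hedged sketch (the store path is a placeholder):

```python
import zarr

# Open the consolidated store read-only; a single .zmetadata read
# exposes all group and array attributes.
root = zarr.open_consolidated("example_cruise.zarr", mode="r")
print(root.attrs["tile_size"])                    # e.g. 256
print(root.attrs["processing_software_version"])  # e.g. "24.01.01"
```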
@@ -1,4 +1,5 @@
-from .cruise_sampler import CruiseSampler
+# from .cruise_sampler import CruiseSampler
 from .raw_to_zarr import RawToZarr
+from .batch_downloader import BatchDownloader
 
-__all__ = ["CruiseSampler", "RawToZarr"]
+__all__ = ["RawToZarr", "BatchDownloader"]
@@ -0,0 +1,132 @@
+import xarray as xr
+import numpy as np
+import pandas as pd
+import xbatcher
+from typing import Optional
+# s3fs.core.setup_logging("DEBUG")
+
+class BatchDownloader:
+    """
+    Uses the xbatcher XbatchDownloader to download data from an xarray dataset. Connection
+    is established
+    """
+
+    def __init__(
+        self,
+        bucket_name: Optional[str] = "noaa-wcsd-zarr-pds",
+        ship_name: Optional[str] = "Henry_B._Bigelow",
+        cruise_name: Optional[str] = "HB0707",
+        sensor_name: Optional[str] = "EK60",
+        patch_dims: Optional[int] = 64,  # TODO: change to 64
+        # input_steps: Optional[int] = 3,
+    ):
+        self.bucket_name = bucket_name
+        self.ship_name = ship_name
+        self.cruise_name = cruise_name
+        self.sensor_name = sensor_name
+        self.patch_dims = patch_dims
+
+    # TODO: move this to the s3fs module
+    def get_s3_zarr_store(self) -> xr.Dataset:
+        """ Returns an Xarray Dataset """
+        s3_zarr_store_path = f"{self.bucket_name}/level_2/{self.ship_name}/{self.cruise_name}/{self.sensor_name}/{self.cruise_name}.zarr"
+        # Info about the HB0707 cruise:
+        # Time: ["2007-07-11T18:20:33.657573888", "2007-07-11T18:20:53.657573888", "2007-07-13T00:55:17.454448896"]
+        # Frequency: [ 18000.  38000. 120000. 200000.]
+        # Depth: [0.19, 999.74]
+
+        # Needed to override credentials for github actions
+        # s3_file_system = s3fs.S3FileSystem(anon=True)
+        # store = s3fs.S3Map(root=s3_zarr_store_path, s3=s3_file_system, check=False)
+
+        # return xr.open_zarr(store=f"s3://{s3_zarr_store_path}", consolidated=True, storage_options={'anon': True})
+        return xr.open_dataset(f"s3://{s3_zarr_store_path}", engine="zarr", storage_options={'anon': True})
+        # return xr.open_zarr(store, consolidated=True)
+
+    def get_toy_batch_generator(self) -> xbatcher.BatchGenerator:
+        """
+        Returns a BatchGenerator with subsets of Sv data
+        Note: this is synthetic data, for a smaller toy example
+        """
+        depth = np.arange(1, 21)  # N meters
+        time = pd.date_range(start="2025-01-01", end="2025-01-31", freq='D')  # N days
+        frequency = [1_000, 2_000, 3_000]  # N frequencies
+        Sv = np.random.rand(len(depth), len(time), len(frequency))  # synthetic data
+        cruise = xr.Dataset(
+            data_vars={
+                "Sv": (["depth", "time", "frequency"], Sv)
+            },
+            coords={
+                "depth": depth,
+                "time": time,
+                "frequency": frequency,
+            },
+            attrs=dict(description="Toy Example"),
+        )
+        batch_generator = xbatcher.BatchGenerator(
+            ds=cruise,
+            # get samples that are shaped 10x10x3
+            input_dims={'depth': 10, 'time': 10, 'frequency': cruise.frequency.shape[0]},  # A dictionary specifying the size of the inputs in each dimension, e.g. ``{'lat': 30, 'lon': 30}`` These are the dimensions the ML library will see. All other dimensions will be stacked into one dimension called ``sample``.
+            # no overlap between samples
+            input_overlap={'depth': 0, 'time': 0, 'frequency': 0},  # Zero means no overlap. A dictionary specifying the overlap along each dimension
+        )
+        return batch_generator
+
+    def get_s3_batch_generator(self) -> xbatcher.BatchGenerator:
+        """ Returns a BatchGenerator with subsets of Sv data from s3 Zarr store """
+        cruise = self.get_s3_zarr_store()
+
+        # TODO: temporarily limits to a smaller slice of the data
+        cruise_select = (cruise
+            .where(cruise.depth < 100., drop=True)
+            .sel(time=slice("2007-07-11T18:20:33", "2007-07-11T18:20:53"))
+            # .sel(time=slice("2007-07-11T18:20:00", "2007-07-11T19:20:00"))
+        )
+        print(cruise_select.Sv.shape)  # (526 depth, 21 time, 4 freq)
+
+        batch_generator = xbatcher.BatchGenerator(
+            ds=cruise_select,
+            input_dims={'depth': 10, 'time': 10, 'frequency': cruise.frequency.shape[0]},  # A dictionary specifying the size of the inputs in each dimension, e.g. ``{'lat': 30, 'lon': 30}`` These are the dimensions the ML library will see. All other dimensions will be stacked into one dimension called ``sample``.
+            input_overlap={'depth': 0, 'time': 0, 'frequency': 0},  # Zero means no overlap. A dictionary specifying the overlap along each dimension
+            preload_batch=False,
+        )
+
+        # TODO: need to raise exception if all the data is nan
+
+        return batch_generator
+        # https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_generator
+
+    def get_s3_manual_batch_generator(self):
+        """
+        Using just xarray (no xbatcher), iterate through the data and generate batches.
+        Returns a BatchGenerator with subsets of Sv data from s3 Zarr store.
+        """
+        cruise = self.get_s3_zarr_store()
+
+        # TODO: temporarily limits to a smaller slice of the data
+        cruise_select = (cruise
+            .where(cruise.depth < 100., drop=True)
+            .sel(time=slice("2007-07-11T18:20:33", "2007-07-11T18:20:53"))
+        )
+        print(cruise_select.Sv.shape)  # (526 depth, 21 time, 4 freq)
+        batch_generator = xbatcher.BatchGenerator(
+            ds=cruise_select,
+            input_dims={'depth': 10, 'time': 10, 'frequency': cruise.frequency.shape[0]},  # A dictionary specifying the size of the inputs in each dimension, e.g. ``{'lat': 30, 'lon': 30}`` These are the dimensions the ML library will see. All other dimensions will be stacked into one dimension called ``sample``.
+            input_overlap={'depth': 0, 'time': 0, 'frequency': 0},  # Zero means no overlap. A dictionary specifying the overlap along each dimension
+            preload_batch=True,
+        )
+
+        # TODO: need to raise exception if all the data is nan
+
+        return batch_generator
+        # https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_generator
+
+    """
+    (105, 21, 4)
+
+    depth-start: 0.1899999976158142, depth-end: 1.899999976158142
+    time-start: 2007-07-11T18:20:33.657573888, time-end: 2007-07-11T18:20:42.657573888
+    frequency-start: 18000.0, frequency-end: 200000.0
+    (10, 10, 4)
+    np.nanmean: -53.70000076293945
+    """
@@ -11,8 +11,6 @@ from water_column_sonar_processing.aws import DynamoDBManager, S3Manager
 from water_column_sonar_processing.geometry import GeometryManager
 from water_column_sonar_processing.utility import Cleaner, PipelineStatus
 
-TEMPDIR = "/tmp"
-
 
 # This code is getting copied from echofish-aws-raw-to-zarr-lambda
 class RawToZarr:
@@ -3,11 +3,12 @@ from enum import Enum, Flag, unique
 
 @unique
 class Constants(Flag):
-    TILE_SIZE = 2048
+    TILE_SIZE = 256  # TODO: add tile size to metadata?
+
     # Average https://noaa-wcsd-zarr-pds.s3.us-east-1.amazonaws.com/level_2/Henry_B._Bigelow/HB0902/EK60/HB0902.zarr/time/927
     # chunk size is ~1.3 kB, HB0902 cruise takes ~30 seconds to load all time/lat/lon data
     # NOTE: larger value here will speed up the TurfJS download of data in the UI
-    SPATIOTEMPORAL_CHUNK_SIZE = int(1e5)  # 2**17
+    SPATIOTEMPORAL_CHUNK_SIZE = int(2**16)  # 2**17
 
 
 class Coordinates(Enum):
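Back-of-envelope arithmetic on the new value (mine, not from the package, and assuming float64 coordinate values):

```python
# Chunk byte sizes for the spatiotemporal coordinates, assuming 8-byte floats.
OLD_CHUNK = int(1e5)  # previous setting
NEW_CHUNK = 2**16     # 65536, the new setting

for n in (OLD_CHUNK, NEW_CHUNK):
    print(n, n * 8 / 1024, "KiB per chunk")  # 100000 -> 781.25 KiB, 65536 -> 512.0 KiB
```

The power-of-two size aligns chunk boundaries more predictably, while staying in the same order of magnitude as before.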
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: water_column_sonar_processing
-Version: 0.0.13
+Version: 24.1.1
 Summary: A processing tool for water column sonar data.
 Author-email: Rudy Klucik <rudy.klucik@noaa.gov>
 Project-URL: Homepage, https://github.com/CI-CMG/water-column-sonar-processing
@@ -8,7 +8,7 @@ Project-URL: Issues, https://github.com/CI-CMG/water-column-sonar-processing/iss
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
-Requires-Python: >=3.10
+Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: aiobotocore==2.15.2
@@ -26,26 +26,19 @@ Requires-Dist: pandas==2.2.3
 Requires-Dist: pyarrow==18.1.0
 Requires-Dist: python-dotenv==1.0.1
 Requires-Dist: requests==2.32.3
-Requires-Dist: s3fs==2023.12.1
+Requires-Dist: s3fs==2024.2.0
 Requires-Dist: scipy==1.14.1
 Requires-Dist: setuptools
 Requires-Dist: shapely==2.0.3
 Requires-Dist: typing-extensions==4.10.0
 Requires-Dist: xarray==2024.10.0
+Requires-Dist: xbatcher==0.4.0
 Requires-Dist: zarr==2.18.3
 
 # Water Column Sonar Processing
 Processing tool for converting L0 data to L1 and L2 as well as generating geospatial information
 
-![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/CI-CMG/water-column-sonar-processing/test_action.yaml)
-
-![GitHub License](https://img.shields.io/github/license/CI-CMG/water-column-sonar-processing)
-
-![PyPI - Implementation](https://img.shields.io/pypi/v/water-column-sonar-processing?color=black)
-
-![PyPI - Downloads](https://img.shields.io/pypi/dd/water-column-sonar-processing)
-
-![GitHub code size in bytes](https://img.shields.io/github/languages/code-size/CI-CMG/water-column-sonar-processing) ![GitHub repo size](https://img.shields.io/github/repo-size/CI-CMG/water-column-sonar-processing)
+![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/CI-CMG/water-column-sonar-processing/test_action.yaml?color=black) ![PyPI - Implementation](https://img.shields.io/pypi/v/water-column-sonar-processing?color=black) ![GitHub License](https://img.shields.io/github/license/CI-CMG/water-column-sonar-processing?color=black) ![PyPI - Downloads](https://img.shields.io/pypi/dd/water-column-sonar-processing?color=black) ![GitHub code size in bytes](https://img.shields.io/github/languages/code-size/CI-CMG/water-column-sonar-processing?color=black) ![GitHub repo size](https://img.shields.io/github/repo-size/CI-CMG/water-column-sonar-processing?color=black)
 
 # Setting up the Python Environment
 > Python 3.10.12
@@ -103,12 +96,6 @@ or
 Following this tutorial:
 https://packaging.python.org/en/latest/tutorials/packaging-projects/
 
-# To Publish To PROD
-```commandline
-python -m build
-python -m twine upload --repository pypi dist/*
-```
-
 # Pre Commit Hook
 see here for installation: https://pre-commit.com/
 https://dev.to/rafaelherik/using-trufflehog-and-pre-commit-hook-to-prevent-secret-exposure-edo
@@ -133,13 +120,29 @@ https://colab.research.google.com/drive/1KiLMueXiz9WVB9o4RuzYeGjNZ6PsZU7a#scroll
 # Tag a Release
 Step 1 --> increment the semantic version in the zarr_manager.py "metadata" & the "pyproject.toml"
 ```commandline
-git tag "v0.0.13" -a
+git tag -a v24.01.01 -m "Releasing version v24.01.01"
 ```
-Step 3 --> enter description
+
 ```commandline
 git push origin --tags
 ```
 
+# To Publish To PROD
+```commandline
+python -m build
+python -m twine upload --repository pypi dist/*
+```
+
 # TODO:
 add https://pypi.org/project/setuptools-scm/
 for extracting the version
+
+# Security scanning
+> bandit -r water_column_sonar_processing/
+
+# Data Debugging
+Experimental Plotting in Xarray (hvPlot):
+https://colab.research.google.com/drive/18vrI9LAip4xRGEX6EvnuVFp35RAiVYwU#scrollTo=q9_j9p2yXsLV
+
+HB0707 Cruise zoomable:
+https://hb0707.s3.us-east-1.amazonaws.com/index.html
@@ -3,31 +3,32 @@ water_column_sonar_processing/process.py,sha256=-yQtK3rnZq6lGAr3q02zLDe1NuMH9c0P
 water_column_sonar_processing/aws/__init__.py,sha256=KJqK8oYMn-u8n8i-Jp_lG5BvCOTjwWSjWP8yAyDlWVo,297
 water_column_sonar_processing/aws/dynamodb_manager.py,sha256=LQ3eh7Zf1fBLG-RKovod9KbQwhE-0Qdq1JPk4Ro5bdo,10252
 water_column_sonar_processing/aws/s3_manager.py,sha256=-PCiW7YF31nGIPa1oVOVTzjTSExAAkT_IyNNnvWv2HU,16214
-water_column_sonar_processing/aws/s3fs_manager.py,sha256=d7p9Sx-ocooKzHjVJVCawnXSGv6BpmKvvN9uhzilglw,2529
+water_column_sonar_processing/aws/s3fs_manager.py,sha256=Vo-DXj6vgb8t1l4LdtNu7JCtq_RfFsnl33RuGeBUXhk,2561
 water_column_sonar_processing/aws/sns_manager.py,sha256=Dp9avG5VSugSWPR1dZ-askuAw1fCZkNUHbOUP65iR-k,1867
 water_column_sonar_processing/aws/sqs_manager.py,sha256=NSUrWmnSC8h8Gf7gT0U8zFaQQ-yX89h0Q0mDLKGqp2Y,1597
 water_column_sonar_processing/cruise/__init__.py,sha256=H5hW0JMORuaFvQk_R31B4VL8RnRyKeanOOiWmqEMZJk,156
-water_column_sonar_processing/cruise/create_empty_zarr_store.py,sha256=1IehrlhMAS5XAl7DLdQI4jIMSY9ZNLiW4YdcBEwYkbc,7679
-water_column_sonar_processing/cruise/experiment_datatree.py,sha256=K6Uq_36Rygw5oFF8zWavEwb1x8D27lJv5G3j0B59agE,243
-water_column_sonar_processing/cruise/resample_regrid.py,sha256=XpGRs8nWspWuVoXBEV6VNVJSMlr3_IjnKlN1dK6dEA4,14292
-water_column_sonar_processing/geometry/__init__.py,sha256=_ol5nI8AL30pYXeAh5rtP7YmQggitPC6LA_kuTfPJ0Q,231
+water_column_sonar_processing/cruise/create_empty_zarr_store.py,sha256=ZsFQTDA0gXfQHlxDsXBGD1qQ0ipmx4kS81DcY6ml5Ew,7767
+water_column_sonar_processing/cruise/datatree_manager.py,sha256=Qy4dZCW8_q31lbjxbMsx3JtBS4BvQT17_2P0QD1RQcY,639
+water_column_sonar_processing/cruise/resample_regrid.py,sha256=gz_uP-mBD4JSBRBr69ZvsfmXX4yyBdRG9-P1z3If43E,14246
+water_column_sonar_processing/geometry/__init__.py,sha256=GIzzc-_7pwEwbOkGpc4i_fmjWI5ymllXqzdHq_d3Rio,299
+water_column_sonar_processing/geometry/elevation_manager.py,sha256=eq9w691WJknPwWYkvO3giKTPleIxCVc2tMGR0e8ZRxQ,4267
 water_column_sonar_processing/geometry/geometry_manager.py,sha256=nz5T1vCDWHYIfQ853EqKYHDetTul7jRWS3y8Evep8QU,10855
 water_column_sonar_processing/geometry/geometry_simplification.py,sha256=im1HG9nfYIerQv3w-PUHzphw2B7aGgnsA3Zcdy2oTmA,3016
 water_column_sonar_processing/geometry/pmtile_generation.py,sha256=7Lm08Jr6YaM4nYmexClxbIMOqSV1teo9wMm6dfjFuNA,12384
 water_column_sonar_processing/index/__init__.py,sha256=izEObsKiOoIJ0kZCFhvaYsBd6Ga71XJxnogjrNInw68,68
-water_column_sonar_processing/index/index_manager.py,sha256=YS6y_THfGAZpjfBZOj5n8O1aY_BnBYS781eNHfhpip0,11239
+water_column_sonar_processing/index/index_manager.py,sha256=qsS6rKObJlFXKyzRuT1bk2_qW1YagW-Fg_AkQ1U_KRs,14213
 water_column_sonar_processing/model/__init__.py,sha256=FXaCdbPqxp0ogmZm9NplRirqpgMiYs1iRYgJbFbbX2Y,65
-water_column_sonar_processing/model/zarr_manager.py,sha256=LoL8vOnEl2r_Jhu4l30p6AgfUZg1tW5aBydHx_BZAZg,15068
-water_column_sonar_processing/processing/__init__.py,sha256=UwdB3BnoUxy4q3k9-ZjBF6KzmCWVDcqbcArTeHgmvGA,118
-water_column_sonar_processing/processing/cruise_sampler.py,sha256=hadPrnH5nz7_oG_4pND7YbMFH6NMR9d6p3xAXedtKU8,15927
-water_column_sonar_processing/processing/raw_to_zarr.py,sha256=agbb2A0BWf7D4b5u-mYOBN_VyjRVjOdQM2aeRGBweWw,17617
+water_column_sonar_processing/model/zarr_manager.py,sha256=Sgh8wXhjTgvQ_UlHGALIbUQA9d7ESdpAT2hJIavpXwM,15507
+water_column_sonar_processing/processing/__init__.py,sha256=tdpSfwnY6lbAS_yBTu4aG0SjPgCKqh6LAFvIj_t3j3U,168
+water_column_sonar_processing/processing/batch_downloader.py,sha256=qXoruHdbgzAolmroK6eRn9bWgeHFgaVQLwhJ6X5oHRE,6299
+water_column_sonar_processing/processing/raw_to_zarr.py,sha256=Sn0_zBT7yYP6abbSTlQBPA6iZSBxeVqPYYSgoroiBEU,17599
 water_column_sonar_processing/utility/__init__.py,sha256=yDObMOL0_OxKWet5wffK2-XVJgoE9iwiY2q04GZrtBQ,234
 water_column_sonar_processing/utility/cleaner.py,sha256=bNbs-hopWxtKAFBK0Eu18xdRErZCGZvtla3j-1bTwQw,619
-water_column_sonar_processing/utility/constants.py,sha256=EbzsorvYKadsPjuutRjQKKByGibhFm0Gw6D-Sp2ZD3I,2143
+water_column_sonar_processing/utility/constants.py,sha256=AD6RlDrJRVN1GYwRvo7cunLhrdC0F8CyOlbkB_GxL-s,2180
 water_column_sonar_processing/utility/pipeline_status.py,sha256=O-0SySqdRGJ6bs3zQe1NV9vkOpmsRM7zj5QoHgzYioY,4395
 water_column_sonar_processing/utility/timestamp.py,sha256=bO0oir7KxxoEHPGRkz9FCBfOligkocUyRiWRzAq8fnU,361
-water_column_sonar_processing-0.0.13.dist-info/LICENSE,sha256=lz4IpJ5_adG3S0ali-WaIpQFVTnEAOucMDQPECUVEYw,1110
-water_column_sonar_processing-0.0.13.dist-info/METADATA,sha256=MUkVn5e1wkAFUAYpk25V02yNCeYNmwBsyib788i2ibg,5087
-water_column_sonar_processing-0.0.13.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-water_column_sonar_processing-0.0.13.dist-info/top_level.txt,sha256=aRYU4A7RNBlNrL4vzjytFAir3BNnmOgsvIGKKA36tg4,30
-water_column_sonar_processing-0.0.13.dist-info/RECORD,,
+water_column_sonar_processing-24.1.1.dist-info/LICENSE,sha256=lz4IpJ5_adG3S0ali-WaIpQFVTnEAOucMDQPECUVEYw,1110
+water_column_sonar_processing-24.1.1.dist-info/METADATA,sha256=Bym-EHrC46s9vFs9eN-nqZisesp5r5AFOwCckUVULS8,5474
+water_column_sonar_processing-24.1.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+water_column_sonar_processing-24.1.1.dist-info/top_level.txt,sha256=aRYU4A7RNBlNrL4vzjytFAir3BNnmOgsvIGKKA36tg4,30
+water_column_sonar_processing-24.1.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.6.0)
+Generator: setuptools (75.8.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
@@ -1,13 +0,0 @@
-from datatree import DataTree
-
-ds1 = xr.Dataset({"foo": "orange"})
-
-dt = DataTree(name="root", data=ds1)  # create root node
-
-dt
-Out[4]:
-DataTree('root', parent=None)
-Dimensions:  ()
-Data variables:
-    foo      <U6 24B 'orange'
-
@@ -1,342 +0,0 @@
-import gc
-import os
-import echopype as ep
-import numpy as np
-from numcodecs import Blosc
-
-from water_column_sonar_processing.utility import Cleaner
-
-TEMPDIR = "/tmp"
-
-
-# This code is getting copied from echofish-aws-raw-to-zarr-lambda
-class CruiseSampler:
-    #######################################################
-    def __init__(
-        self,
-    ):
-        # TODO: revert to Blosc.BITSHUFFLE, troubleshooting misc error
-        self.__compressor = Blosc(cname="zstd", clevel=2)  # shuffle=Blosc.NOSHUFFLE
-        self.bucket_name = os.environ.get("INPUT_BUCKET_NAME")
-        # self.__s3 = s3_operations
-
-    ############################################################################
-    ############################################################################
-    def __zarr_info_to_table(
-        self,
-        file_name,
-        cruise_name,
-        zarr_path,
-        min_echo_range,
-        max_echo_range,
-        num_ping_time_dropna,
-        start_time,
-        end_time,
-        frequencies,
-        channels
-    ):
-        print('Writing Zarr information to DynamoDB table.')
-        self.__dynamo.update_item(
-            table_name=self.__table_name,
-            key={
-                'FILE_NAME': {'S': file_name},  # Partition Key
-                'CRUISE_NAME': {'S': cruise_name},  # Sort Key
-                # TODO: should be FILE_NAME & SENSOR_NAME so they are truely unique for when two sensors are processed within one cruise
-            },
-            expression='SET #ZB = :zb, #ZP = :zp, #MINER = :miner, #MAXER = :maxer, #P = :p, #ST = :st, #ET = :et, #F = :f, #C = :c',
-            attribute_names={
-                '#ZB': 'ZARR_BUCKET',
-                '#ZP': 'ZARR_PATH',
-                '#MINER': 'MIN_ECHO_RANGE',
-                '#MAXER': 'MAX_ECHO_RANGE',
-                '#P': 'NUM_PING_TIME_DROPNA',
-                '#ST': 'START_TIME',
-                '#ET': 'END_TIME',
-                '#F': 'FREQUENCIES',
-                '#C': 'CHANNELS',
-            },
-            attribute_values={
-                ':zb': {
-                    'S': self.__output_bucket
-                },
-                ':zp': {
-                    'S': zarr_path
-                },
-                ':miner': {
-                    'N': str(np.round(min_echo_range, 4))
-                },
-                ':maxer': {
-                    'N': str(np.round(max_echo_range, 4))
-                },
-                ':p': {
-                    'N': str(num_ping_time_dropna)
-                },
-                ':st': {
-                    'S': start_time
-                },
-                ':et': {
-                    'S': end_time
-                },
-                ':f': {
-                    'L': [{'N': str(i)} for i in frequencies]
-                },
-                ':c': {
-                    'L': [{'S': i} for i in channels]
-                }
-            }
-        )
-
-    ############################################################################
-    ############################################################################
-    ############################################################################
-    def raw_to_zarr(
-        self,
-        ship_name,
-        cruise_name,
-        sensor_name,
-        file_name,
-    ):
-        print(f'Opening raw: {file_name} and creating zarr store.')
-        geometry_manager = GeometryManager()
-        try:
-            gc.collect()
-            print('Opening raw file with echopype.')
-            bucket_name = "test_input_bucket"  # noaa-wcsd-pds
-            s3_file_path = f"s3://{bucket_name}/data/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}"
-            # s3_file_path = Path(f"s3://noaa-wcsd-pds/data/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}")
-            # TODO: add the bottom file here
-            echodata = ep.open_raw(
-                raw_file=s3_file_path,
-                sonar_model=sensor_name,
-                # include_bot=True,
-                use_swap=True,
-                # max_chunk_size=100,
-                # storage_options={'anon': True}  # this was creating problems
-            )
-            print('Compute volume backscattering strength (Sv) from raw data.')
-            ds_sv = ep.calibrate.compute_Sv(echodata)
-            print('Done computing volume backscattering strength (Sv) from raw data.')
-            frequencies = echodata.environment.frequency_nominal.values
-            #################################################################
-            # Get GPS coordinates
-            gps_data, lat, lon = geometry_manager.read_echodata_gps_data(
-                echodata=echodata,
-                ship_name=ship_name,
-                cruise_name=cruise_name,
-                sensor_name=sensor_name,
-                file_name=file_name,
-                write_geojson=True
-            )
-            # gps_data, lat, lon = self.__get_gps_data(echodata=echodata)
-            #################################################################
-            # Technically the min_echo_range would be 0 m.
-            # TODO: this var name is supposed to represent minimum resolution of depth measurements
-            # The most minimum the resolution can be is as small as 0.25 meters
-            min_echo_range = np.maximum(0.25, np.nanmin(np.diff(ds_sv.echo_range.values)))
-            max_echo_range = float(np.nanmax(ds_sv.echo_range))
-            #
-            num_ping_time_dropna = lat[~np.isnan(lat)].shape[0]  # symmetric to lon
-            #
-            start_time = np.datetime_as_string(ds_sv.ping_time.values[0], unit='ms') + "Z"
-            end_time = np.datetime_as_string(ds_sv.ping_time.values[-1], unit='ms') + "Z"
-            channels = list(ds_sv.channel.values)
-            #
-            #################################################################
-            # Create the zarr store
-            ds_sv.to_zarr(store=store_name)
-            #################################################################
-            print('Note: Adding GeoJSON inside Zarr store')
-            self.__write_geojson_to_file(store_name=store_name, data=gps_data)
-            #################################################################
-            self.__zarr_info_to_table(
-                file_name=raw_file_name,
-                cruise_name=cruise_name,
-                zarr_path=os.path.join(output_zarr_prefix, store_name),
-                min_echo_range=min_echo_range,
-                max_echo_range=max_echo_range,
-                num_ping_time_dropna=num_ping_time_dropna,
-                start_time=start_time,
-                end_time=end_time,
-                frequencies=frequencies,
-                channels=channels
-            )
-        except Exception as err:
-            print(f'Exception encountered creating local Zarr store with echopype: {err}')
-            raise RuntimeError(f"Problem creating local Zarr store, {err}")
-        print('Done creating local zarr store.')
-
-    ############################################################################
-    def __upload_files_to_output_bucket(
-        self,
-        local_directory,
-        object_prefix,
-    ):
-        # Note: this will be passed credentials if using NODD
-        print('Uploading files using thread pool executor.')
-        all_files = []
-        for subdir, dirs, files in os.walk(local_directory):
-            for file in files:
-                local_path = os.path.join(subdir, file)
-                s3_key = os.path.join(object_prefix, local_path)
-                all_files.append([local_path, s3_key])
-        # all_files
-        all_uploads = self.__s3.upload_files_with_thread_pool_executor(
-            bucket_name=self.__output_bucket,
-            all_files=all_files,
-            access_key_id=self.__output_bucket_access_key,
-            secret_access_key=self.__output_bucket_secret_access_key
-        )
-        return all_uploads
-
-    ############################################################################
-    def execute(self, input_message):
-        ship_name = input_message['shipName']
-        cruise_name = input_message['cruiseName']
-        sensor_name = input_message['sensorName']
-        input_file_name = input_message['fileName']
-        #
-        try:
-            self.__update_processing_status(
-                file_name=input_file_name,
-                cruise_name=cruise_name,
-                pipeline_status="PROCESSING_RAW_TO_ZARR"
-            )
-            #######################################################################
-            store_name = f"{os.path.splitext(input_file_name)[0]}.zarr"
-            output_zarr_prefix = f"level_1/{ship_name}/{cruise_name}/{sensor_name}"
-            bucket_key = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/{input_file_name}"
-            zarr_prefix = os.path.join("level_1", ship_name, cruise_name, sensor_name)
-            #
-            os.chdir(TEMPDIR)  # Lambdas require use of temp directory
-            #######################################################################
-            #######################################################################
-            # Check if zarr store already exists
-            s3_objects = self.__s3.list_objects(
-                bucket_name=self.__output_bucket,
-                prefix=f"{zarr_prefix}/{os.path.splitext(input_file_name)[0]}.zarr/",
-                access_key_id=self.__output_bucket_access_key,
-                secret_access_key=self.__output_bucket_secret_access_key
-            )
-            if len(s3_objects) > 0:
-                print('Zarr store data already exists in s3, deleting existing and continuing.')
-                self.__s3.delete_objects(
-                    bucket_name=self.__output_bucket,
-                    objects=s3_objects,
-                    access_key_id=self.__output_bucket_access_key,
-                    secret_access_key=self.__output_bucket_secret_access_key
-                )
-            #######################################################################
-            # self.__delete_all_local_raw_and_zarr_files()
-            Cleaner.delete_local_files(file_types=["*.raw*", "*.zarr"])
-            self.__s3.download_file(
-                bucket_name=self.__input_bucket,
-                key=bucket_key,
-                file_name=input_file_name
-            )
-            self.__create_local_zarr_store(
-                raw_file_name=input_file_name,
-                cruise_name=cruise_name,
-                sensor_name=sensor_name,
-                output_zarr_prefix=output_zarr_prefix,
-                store_name=store_name
-            )
-            #######################################################################
-            self.__upload_files_to_output_bucket(store_name, output_zarr_prefix)
-            #######################################################################
-            # # TODO: verify count of objects matches
-            # s3_objects = self.__s3.list_objects(
-            #     bucket_name=self.__output_bucket,
-            #     prefix=f"{zarr_prefix}/{os.path.splitext(input_file_name)[0]}.zarr/",
-            #     access_key_id=self.__output_bucket_access_key,
-            #     secret_access_key=self.__output_bucket_secret_access_key
-            # )
-            #######################################################################
-            self.__update_processing_status(
-                file_name=input_file_name,
-                cruise_name=cruise_name,
-                pipeline_status='SUCCESS_RAW_TO_ZARR'
-            )
-            #######################################################################
-            self.__publish_done_message(input_message)
-            #######################################################################
-        # except Exception as err:
-        #     print(f'Exception encountered: {err}')
-        #     self.__update_processing_status(
-        #         file_name=input_file_name,
-        #         cruise_name=cruise_name,
-        #         pipeline_status='FAILURE_RAW_TO_ZARR',
-        #         error_message=str(err),
-        #     )
-        finally:
-            self.__delete_all_local_raw_and_zarr_files()
-        #######################################################################
-
-    ############################################################################
-
-    ################################################################################
-    ############################################################################
-    # TODO: DELETE
-    # def __get_gps_data(
-    #     self,
-    #     echodata: ep.echodata.echodata.EchoData
-    # ) -> tuple:
-    #     print('Getting GPS data.')
-    #     try:
-    #         # if 'latitude' not in echodata.platform.variables and 'longitude' not in echodata.platform.variables:
-    #         #     raise KeyError;
-    #         assert(  # TODO: raise error, e.g. KeyError
-    #             'latitude' in echodata.platform.variables and 'longitude' in echodata.platform.variables
-    #         ), "Problem: GPS coordinates not found in echodata."
-    #         latitude = echodata.platform.latitude.values
-    #         longitude = echodata.platform.longitude.values  # len(longitude) == 14691
-    #         # RE: time coordinates: https://github.com/OSOceanAcoustics/echopype/issues/656#issue-1219104771
-    #         assert(
-    #             'time1' in echodata.platform.variables and 'time1' in echodata.environment.variables
-    #         ), "Problem: Time coordinate not found in echodata."
-    #         # 'nmea_times' are times from the nmea datalogger associated with GPS
-    #         # nmea times, unlike env times, can be sorted
-    #         nmea_times = np.sort(echodata.platform.time1.values)
-    #         # 'time1' are times from the echosounder associated with transducer measurement
-    #         time1 = echodata.environment.time1.values
-    #         # Align 'sv_times' to 'nmea_times'
-    #         assert(
-    #             np.all(time1[:-1] <= time1[1:]) and np.all(nmea_times[:-1] <= nmea_times[1:])
-    #         ), "Problem: NMEA time stamps are not sorted."
-    #         # Finds the indices where 'v' can be inserted just to the right of 'a'
-    #         indices = np.searchsorted(a=nmea_times, v=time1, side="right") - 1
-    #         #
-    #         lat = latitude[indices]
-    #         lat[indices < 0] = np.nan  # values recorded before indexing are set to nan
-    #         lon = longitude[indices]
-    #         lon[indices < 0] = np.nan
-    #         if len(lat) < 2 or len(lon) < 2:
-    #             raise Exception("There was not enough data in lat or lon to create geojson.")
-    #         assert(  # TODO: raise ValueError
-    #             np.all(lat[~np.isnan(lat)] >= -90.) and np.all(lat[~np.isnan(lat)] <= 90.) and np.all(lon[~np.isnan(lon)] >= -180.) and np.all(lon[~np.isnan(lon)] <= 180.)
-    #         ), "Problem: Data falls outside GPS bounds!"
-    #         # TODO: check for visits to null island
-    #         # https://osoceanacoustics.github.io/echopype-examples/echopype_tour.html
-    #         print(np.count_nonzero(np.isnan(lat)))
-    #         print(np.count_nonzero(np.isnan(lon)))
-    #         if len(lat[~np.isnan(lat)]) < 1:
-    #             raise RuntimeError(f"Problem all data is NaN.")
-    #         time1 = time1[~np.isnan(lat)]
-    #         lat = lat[~np.isnan(lat)]
-    #         lon = lon[~np.isnan(lon)]
-    #         #
-    #         gps_df = pd.DataFrame({
-    #             'latitude': lat,
-    #             'longitude': lon,
-    #             'time1': time1
-    #         }).set_index(['time1'])
-    #         gps_gdf = geopandas.GeoDataFrame(
-    #             gps_df,
-    #             geometry=geopandas.points_from_xy(gps_df['longitude'], gps_df['latitude']),
-    #             crs="epsg:4326"  # TODO: does this sound right?
-    #         )
-    #         # GeoJSON FeatureCollection with IDs as "time1"
-    #         geo_json = gps_gdf.to_json()
-    #     except Exception as err:
-    #         print(f'Exception encountered creating local Zarr store with echopype: {err}')
-    #         raise
-    #     return geo_json, lat, lon