water-column-sonar-processing 25.1.7__py3-none-any.whl → 25.3.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of water-column-sonar-processing might be problematic.

Files changed (26)
  1. water_column_sonar_processing/aws/dynamodb_manager.py +27 -32
  2. water_column_sonar_processing/aws/s3_manager.py +52 -64
  3. water_column_sonar_processing/aws/s3fs_manager.py +3 -9
  4. water_column_sonar_processing/cruise/create_empty_zarr_store.py +14 -14
  5. water_column_sonar_processing/cruise/datatree_manager.py +3 -6
  6. water_column_sonar_processing/cruise/resample_regrid.py +67 -49
  7. water_column_sonar_processing/geometry/__init__.py +7 -2
  8. water_column_sonar_processing/geometry/elevation_manager.py +16 -17
  9. water_column_sonar_processing/geometry/geometry_manager.py +25 -25
  10. water_column_sonar_processing/geometry/line_simplification.py +150 -0
  11. water_column_sonar_processing/geometry/pmtile_generation.py +99 -64
  12. water_column_sonar_processing/index/index_manager.py +67 -32
  13. water_column_sonar_processing/model/zarr_manager.py +32 -21
  14. water_column_sonar_processing/process.py +15 -13
  15. water_column_sonar_processing/processing/__init__.py +2 -2
  16. water_column_sonar_processing/processing/batch_downloader.py +66 -41
  17. water_column_sonar_processing/processing/raw_to_zarr.py +121 -82
  18. water_column_sonar_processing/utility/constants.py +10 -1
  19. water_column_sonar_processing/utility/pipeline_status.py +11 -15
  20. {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.0.dist-info}/METADATA +21 -12
  21. water_column_sonar_processing-25.3.0.dist-info/RECORD +34 -0
  22. {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.0.dist-info}/WHEEL +1 -1
  23. water_column_sonar_processing/geometry/geometry_simplification.py +0 -82
  24. water_column_sonar_processing-25.1.7.dist-info/RECORD +0 -34
  25. {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.0.dist-info/licenses}/LICENSE +0 -0
  26. {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.0.dist-info}/top_level.txt +0 -0
@@ -1,15 +1,14 @@
  import glob
  import os
+ from concurrent.futures import ThreadPoolExecutor, as_completed
  from pathlib import Path
+
  import fiona
- import s3fs
+ import geopandas
+ import geopandas as gpd
  import numpy as np
  import pandas as pd
  import xarray as xr
- import geopandas
- import geopandas as gpd
- import pyogrio
- from concurrent.futures import ThreadPoolExecutor, as_completed
  from shapely.geometry import LineString

  MAX_POOL_CONNECTIONS = 64
@@ -19,6 +18,16 @@ GB = 1024**3


  class PMTileGeneration(object):
+ """
+ TODO: need to
+ - iterate through the zarr stores for all cruises
+ - generate geojson in geopandas df
+ - consolidate into singular df, one cruise per row
+ - export as _shape?_ file
+ - document next steps creating pmtiles with linux commands
+ - upload to s3
+ """
+
  #######################################################
  def __init__(
  self,
@@ -85,13 +94,20 @@ class PMTileGeneration(object):
  ship_name,
  cruise_names,
  ):
+ # TODO: NOT USED ANYWHERE
  total_size = 0
- s3_fs = s3fs.S3FileSystem(anon=True)
+ # s3_fs = s3fs.S3FileSystem(anon=True)
  for cruise_name in cruise_names:
- path_to_zarr_store = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
- zarr_store = s3fs.S3Map(root=path_to_zarr_store, s3=s3_fs)
- xr_store = xr.open_zarr(store=zarr_store, consolidated=None)
- print(f'Cruise: {cruise_name}, shape: {xr_store.time.shape[0]}')
+ s3_path = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
+ # zarr_store = s3fs.S3Map(root=s3_path, s3=s3_fs)
+ xr_store = xr.open_dataset(
+ filename_or_obj=s3_path,
+ engine="zarr",
+ storage_options={"anon": True},
+ chunks={}, # this allows the engine to define the chunk scheme
+ cache=True,
+ )
+ print(f"Cruise: {cruise_name}, shape: {xr_store.time.shape[0]}")
  total_size = total_size + xr_store.time.shape[0]

  def get_geospatial_info_from_zarr_store(
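The hunk above replaces the s3fs.S3Map read path with xarray's zarr engine. A minimal, standalone sketch of the new access pattern, assuming only the public noaa-wcsd-zarr-pds layout shown in the diff (the HB0707 cruise name is used purely as an illustration):

    import xarray as xr

    s3_path = "s3://noaa-wcsd-zarr-pds/level_2/Henry_B._Bigelow/HB0707/EK60/HB0707.zarr"
    ds = xr.open_dataset(
        filename_or_obj=s3_path,
        engine="zarr",
        storage_options={"anon": True},  # public bucket, no credentials required
        chunks={},  # let the engine use the on-disk chunking
    )
    print(ds.time.shape[0])  # number of pings recorded for the cruise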
@@ -102,40 +118,51 @@ class PMTileGeneration(object):
  """
  Open Zarr store, create geometry, write to geojson, return name
  """
- s3_fs = s3fs.S3FileSystem(anon=True)
+ # s3_fs = s3fs.S3FileSystem(anon=True)
  gps_gdf = geopandas.GeoDataFrame(
  columns=["id", "ship", "cruise", "sensor", "geometry"],
  geometry="geometry",
- crs="EPSG:4326"
+ crs="EPSG:4326",
  )
- path_to_zarr_store = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
- # file_name = os.path.normpath(path_to_zarr_store).split(os.sep)[-1]
- # file_stem = os.path.splitext(os.path.basename(file_name))[0]
- zarr_store = s3fs.S3Map(root=path_to_zarr_store, s3=s3_fs)
- # ---Open Zarr Store--- #
+ s3_path = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
  # TODO: try-except to allow failures
- print('opening store')
- # xr_store = xr.open_zarr(store=zarr_store, consolidated=False)
- xr_store = xr.open_zarr(store=zarr_store, consolidated=None)
+ print("opening store")
+ xr_store = xr.open_dataset(
+ filename_or_obj=s3_path,
+ engine="zarr",
+ storage_options={"anon": True},
+ chunks={}, # this allows the engine to define the chunk scheme
+ cache=True,
+ )
  print(xr_store.Sv.shape)
  # ---Read Zarr Store Time/Latitude/Longitude--- #
  latitude = xr_store.latitude.values
  longitude = xr_store.longitude.values
  if np.isnan(latitude).any() or np.isnan(longitude).any():
- print(f'there was missing lat-lon data for {cruise_name}')
+ print(f"there was missing lat-lon data for {cruise_name}")
  return None
  # ---Add To GeoPandas Dataframe--- #
  # TODO: experiment with tolerance "0.001"
- geom = LineString(list(zip(longitude, latitude))).simplify(tolerance=0.001, preserve_topology=True)
- gps_gdf.loc[0] = (0, "Henry_B._Bigelow", cruise_name, "EK60", geom) # (ship, cruise, sensor, geometry)
- gps_gdf.set_index('id', inplace=True)
- gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON") #, engine="pyogrio")
+ geom = LineString(list(zip(longitude, latitude))).simplify(
+ tolerance=0.001, preserve_topology=True
+ )
+ gps_gdf.loc[0] = (
+ 0,
+ "Henry_B._Bigelow",
+ cruise_name,
+ "EK60",
+ geom,
+ ) # (ship, cruise, sensor, geometry)
+ gps_gdf.set_index("id", inplace=True)
+ gps_gdf.to_file(
+ f"dataframe_{cruise_name}.geojson", driver="GeoJSON"
+ ) # , engine="pyogrio")
  return cruise_name

  #######################################################
  def open_zarr_stores_with_thread_pool_executor(
- self,
- cruises: list,
+ self,
+ cruises: list,
  ):
  # 'cruises' is a list of cruises to process
  completed_cruises = []
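The per-cruise GeoJSON step in the hunk above can be exercised on its own. This sketch uses a synthetic track in place of the latitude/longitude arrays read from the store and keeps the same simplify tolerance; the cruise metadata values are illustrative only:

    import geopandas as gpd
    import numpy as np
    from shapely.geometry import LineString

    # Synthetic track standing in for the arrays read from the Zarr store.
    longitude = np.linspace(-72.49, -72.40, 500)
    latitude = np.linspace(40.33, 40.40, 500)

    geom = LineString(list(zip(longitude, latitude))).simplify(
        tolerance=0.001, preserve_topology=True
    )
    gdf = gpd.GeoDataFrame(
        {"id": [0], "ship": ["Henry_B._Bigelow"], "cruise": ["HB0707"], "sensor": ["EK60"]},
        geometry=[geom],
        crs="EPSG:4326",
    ).set_index("id")
    gdf.to_file("dataframe_HB0707.geojson", driver="GeoJSON")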
@@ -156,37 +183,46 @@ class PMTileGeneration(object):
  except Exception as err:
  print(err)
  print("Done opening zarr stores using thread pool.")
- return completed_cruises # Took ~12 minutes
+ return completed_cruises # Took ~12 minutes

  #######################################################
  # https://docs.protomaps.com/pmtiles/create
- def aggregate_geojson_into_dataframe(
- self
- ):
+ def aggregate_geojson_into_dataframe(self):
  """
  iterate through cruises, threadpoolexecute geojson creation, aggregate geojson files into df,
  """
  gps_gdf = geopandas.GeoDataFrame(
  columns=["id", "ship", "cruise", "sensor", "geometry"],
  geometry="geometry",
- crs="EPSG:4326"
+ crs="EPSG:4326",
  )

- file_type = 'dataframe_*.geojson'
+ file_type = "dataframe_*.geojson"
  geojson_files = glob.glob(file_type)
  for jjj in range(len(geojson_files)):
  print(jjj)
  geom = geopandas.read_file(geojson_files[jjj])
- gps_gdf.loc[jjj] = (jjj, geom.ship[0], geom.cruise[0], geom.sensor[0], geom.geometry[0])
- #gps_gdf.loc[0] = (0, "Henry_B._Bigelow", cruise_name, "EK60", geom) # (ship, cruise, sensor, geometry)
+ gps_gdf.loc[jjj] = (
+ jjj,
+ geom.ship[0],
+ geom.cruise[0],
+ geom.sensor[0],
+ geom.geometry[0],
+ )
+ # gps_gdf.loc[0] = (0, "Henry_B._Bigelow", cruise_name, "EK60", geom) # (ship, cruise, sensor, geometry)
  print(gps_gdf)
- gps_gdf.set_index('id', inplace=True)
- gps_gdf.to_file(f"data.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
+ gps_gdf.set_index("id", inplace=True)
+ gps_gdf.to_file(
+ "data.geojson",
+ driver="GeoJSON",
+ engine="pyogrio",
+ layer_options={"ID_GENERATE": "YES"},
+ )
  return list(gps_gdf.cruise)

  # gps_gdf.loc[iii] = (iii, "Henry_B._Bigelow", cruise_name, "EK60", geom) # (ship, cruise, sensor, geometry)
- #print('writing to file')
- #print(gps_gdf)
+ # print('writing to file')
+ # print(gps_gdf)
  # gps_gdf.set_index('id', inplace=True)
  # gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
  # https://gdal.org/en/latest/drivers/vector/jsonfg.html
@@ -198,25 +234,25 @@ class PMTileGeneration(object):
  # )
  # gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON", engine="pyogrio", id_generate=True)

+
  # print(fiona.supported_drivers) # {'DXF': 'rw', 'CSV': 'raw', 'OpenFileGDB': 'raw', 'ESRIJSON': 'r', 'ESRI Shapefile': 'raw', 'FlatGeobuf': 'raw', 'GeoJSON': 'raw', 'GeoJSONSeq': 'raw', 'GPKG': 'raw', 'GML': 'rw', 'OGR_GMT': 'rw', 'GPX': 'rw', 'MapInfo File': 'raw', 'DGN': 'raw', 'S57': 'r', 'SQLite': 'raw', 'TopoJSON': 'r'}
- #gps_gdf.to_file('dataframe.shp', crs="EPSG:4326", engine="fiona")
+ # gps_gdf.to_file('dataframe.shp', crs="EPSG:4326", engine="fiona")
  # Convert geojson feature collection to pmtiles
- #gps_gdf.to_file("dataframe.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona")
- #print("done")
+ # gps_gdf.to_file("dataframe.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona")
+ # print("done")
  # ---Export Shapefile--- #


-
- #gps_gdf.set_geometry(col='geometry', inplace=True)
- #gps_gdf.__geo_interface__
- #gps_gdf.set_index('id', inplace=True)
- #gps_gdf.to_file(f"dataframe3.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona", index=True)
+ # gps_gdf.set_geometry(col='geometry', inplace=True)
+ # gps_gdf.__geo_interface__
+ # gps_gdf.set_index('id', inplace=True)
+ # gps_gdf.to_file(f"dataframe3.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona", index=True)

  ### this gives the right layer id values
- #gps_gdf.to_file(f"dataframe6.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
+ # gps_gdf.to_file(f"dataframe6.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
  # jq '{"type": "FeatureCollection", "features": [.[] | .features[]]}' --slurp input*.geojson > output.geojson
- #tippecanoe -zg --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises output.geojson
- #tippecanoe -zg --convert-stringified-ids-to-numbers --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises dataframe*.geojson
+ # tippecanoe -zg --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises output.geojson
+ # tippecanoe -zg --convert-stringified-ids-to-numbers --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises dataframe*.geojson
  # {
  # "type": "FeatureCollection",
  # "name": "dataframe5",
@@ -226,19 +262,19 @@ class PMTileGeneration(object):
  # { "type": "Feature", "id": 2, "properties": { "id": 2, "ship": "Henry_B._Bigelow", "cruise": "HB0710", "sensor": "EK60" }, "geometry": { "type": "LineString", "coordinates": [ [ -72.489486694335938, 40.331901550292969 ], [ -72.490760803222656, 40.33099365234375 ] ] } }
  # ]
  # }
- """
- # https://docs.protomaps.com/pmtiles/create
- #ogr2ogr -t_srs EPSG:4326 data.geojson dataframe.shp
- # Only need to do the second one here...
- tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises dataframe.geojson
- tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson
- # used this to combine all the geojson files into single pmtile file (2024-12-03):
- tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson

- TODO:
- run each one of the cruises in a separate ospool workflow.
- each process gets own store
- """
+ # # https://docs.protomaps.com/pmtiles/create
+ # #ogr2ogr -t_srs EPSG:4326 data.geojson dataframe.shp
+ # # Only need to do the second one here...
+ # tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises dataframe.geojson
+ # tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson
+ # # used this to combine all the geojson files into single pmtile file (2024-12-03):
+ # tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson
+ #
+ # TODO:
+ # run each one of the cruises in a separate ospool workflow.
+ # each process gets own store
+
  ###########################################################

  # s3_manager = S3Manager() # endpoint_url=endpoint_url)
@@ -258,5 +294,4 @@ TODO:
  # print(ds_zarr.Sv.shape)


-
- total = [246847, 89911, 169763, 658047, 887640, 708771, 187099, 3672813, 4095002, 763268, 162727, 189454, 1925270, 3575857, 1031920, 1167590, 3737415, 4099957, 3990725, 3619996, 3573052, 2973090, 55851, 143192, 1550164, 3692819, 668400, 489735, 393260, 1311234, 242989, 4515760, 1303091, 704663, 270645, 3886437, 4204381, 1062090, 428639, 541455, 4206506, 298561, 1279329, 137416, 139836, 228947, 517949]
+ # total = [246847, 89911, 169763, 658047, 887640, 708771, 187099, 3672813, 4095002, 763268, 162727, 189454, 1925270, 3575857, 1031920, 1167590, 3737415, 4099957, 3990725, 3619996, 3573052, 2973090, 55851, 143192, 1550164, 3692819, 668400, 489735, 393260, 1311234, 242989, 4515760, 1303091, 704663, 270645, 3886437, 4204381, 1062090, 428639, 541455, 4206506, 298561, 1279329, 137416, 139836, 228947, 517949]
@@ -1,11 +1,14 @@
  import os
  import re
- import pandas as pd
+ from concurrent.futures import ThreadPoolExecutor, as_completed
  from datetime import datetime
- from concurrent.futures import ThreadPoolExecutor
- from concurrent.futures import as_completed
- from water_column_sonar_processing.aws import S3Manager
+ from hashlib import sha256
+
+ import networkx as nx
+ import numpy as np
+ import pandas as pd

+ from water_column_sonar_processing.aws import S3Manager

  MAX_POOL_CONNECTIONS = 64
  MAX_CONCURRENCY = 64
@@ -19,8 +22,8 @@ class IndexManager:
  def __init__(self, input_bucket_name, calibration_bucket, calibration_key):
  self.input_bucket_name = input_bucket_name
  self.calibration_bucket = calibration_bucket
- self.calibration_key = calibration_key
- self.s3_manager = S3Manager() # TODO: make anonymous?
+ self.calibration_key = calibration_key # TODO: make optional?
+ self.s3_manager = S3Manager() # TODO: make anonymous?

  #################################################################
  def list_ships(
@@ -80,9 +83,7 @@ class IndexManager:
  # Gets all raw files for a cruise under the given prefix
  prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/" # Note no forward slash at beginning
  page_iterator = self.s3_manager.paginator.paginate(
- Bucket=self.input_bucket_name,
- Prefix=prefix,
- Delimiter="/"
+ Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
  )
  all_files = []
  for page in page_iterator:
@@ -112,7 +113,9 @@ class IndexManager:
  Delimiter="/",
  )
  # page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
- page_iterator = page_iterator.search(expression="Contents[?contains(Key, '.raw')] ")
+ page_iterator = page_iterator.search(
+ expression="Contents[?contains(Key, '.raw')] "
+ )
  for res in page_iterator:
  if "Key" in res:
  return res["Key"]
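For reference, the JMESPath filter used above works directly against boto3's paginator. A minimal sketch assuming anonymous access to the public noaa-wcsd-pds bucket, with an illustrative cruise prefix:

    import boto3
    from botocore import UNSIGNED
    from botocore.config import Config

    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
    pages = s3.get_paginator("list_objects_v2").paginate(
        Bucket="noaa-wcsd-pds",
        Prefix="data/raw/Henry_B._Bigelow/HB0707/EK60/",
        Delimiter="/",
    )
    # Same JMESPath expression as above: keep only .raw objects.
    for obj in pages.search("Contents[?contains(Key, '.raw')]"):
        if obj and "Key" in obj:
            print(obj["Key"])  # first matching raw file
            break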
@@ -149,9 +152,7 @@ class IndexManager:
  sensor_name,
  ):
  raw_files = self.get_raw_files(
- ship_name=ship_name,
- cruise_name=cruise_name,
- sensor_name=sensor_name
+ ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
  )
  files_list = [
  {
@@ -174,9 +175,7 @@ class IndexManager:
  ):
  # gets all raw files in cruise and returns a list of dicts
  raw_files = self.get_raw_files(
- ship_name=ship_name,
- cruise_name=cruise_name,
- sensor_name=sensor_name
+ ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
  )
  files_list = [
  {
@@ -190,10 +189,9 @@ class IndexManager:
  return files_list

  #################################################################
- def get_subset_ek60_prefix( # TODO: is this used?
- self,
- df: pd.DataFrame
- ) -> pd.DataFrame:
+ def get_subset_ek60_prefix(
+ self, df: pd.DataFrame
+ ) -> pd.DataFrame: # TODO: is this used?
  # Returns all objects with 'EK60' in prefix of file path
  # Note that this can include 'EK80' data that are false-positives
  # in dataframe with ['key', 'filename', 'ship', 'cruise', 'sensor', 'size', 'date', 'datagram']
@@ -237,10 +235,7 @@ class IndexManager:
  return pd.DataFrame(objects)

  #################################################################
- def scan_datagram(
- self,
- select_key: str
- ) -> list:
+ def scan_datagram(self, select_key: str) -> list:
  # Reads the first 8 bytes of S3 file. Used to determine if ek60 or ek80
  # Note: uses boto3 session instead of boto3 client: https://github.com/boto/boto3/issues/801
  # select_key = 'data/raw/Albatross_Iv/AL0403/EK60/L0005-D20040302-T200108-EK60.raw'
@@ -256,12 +251,15 @@ class IndexManager:
  return first_datagram

  #################################################################
- def get_subset_datagrams( # TODO: is this getting used
- self,
- df: pd.DataFrame
- ) -> list:
+ def get_subset_datagrams(
+ self, df: pd.DataFrame
+ ) -> list: # TODO: is this getting used
  print("getting subset of datagrams")
- select_keys = df[["KEY", "CRUISE"]].drop_duplicates(subset="CRUISE")["KEY"].values.tolist()
+ select_keys = (
+ df[["KEY", "CRUISE"]]
+ .drop_duplicates(subset="CRUISE")["KEY"]
+ .values.tolist()
+ )
  all_datagrams = []
  with ThreadPoolExecutor(max_workers=MAX_POOL_CONNECTIONS) as executor:
  futures = [
@@ -276,9 +274,7 @@ class IndexManager:

  #################################################################
  def get_ek60_objects(
- self,
- df: pd.DataFrame,
- subset_datagrams: list
+ self, df: pd.DataFrame, subset_datagrams: list
  ) -> pd.DataFrame:
  # for each key write datagram value to all other files in same cruise
  for subset_datagram in subset_datagrams:
@@ -346,3 +342,42 @@ class IndexManager:
  # end_time = datetime.now() # used for benchmarking
  # print(start_time)
  # print(end_time)
+
+ # TODO: wip
+ def build_merkle_tree(self):
+ G = nx.DiGraph()
+ # https://noaa-wcsd-pds.s3.amazonaws.com/index.html#data/raw/Henry_B._Bigelow/HB0707/
+ ship_name = "Henry_B._Bigelow"
+ cruise_name = "HB0707"
+ # cruise_name = "HB0805"
+ prefix = f"data/raw/{ship_name}/{cruise_name}/"
+ # prefix = f"data/raw/{ship_name}/"
+ page_iterator = self.s3_manager.paginator.paginate(
+ Bucket=self.input_bucket_name,
+ Prefix=prefix,
+ )
+ for page in page_iterator:
+ for contents in page["Contents"]:
+ obj_key = contents["Key"]
+ # https://datatracker.ietf.org/doc/html/rfc7232#section-2.3
+ obj_etag = contents["ETag"].split('"')[1] # properties
+ obj_size = contents["Size"]
+ basename = os.path.basename(obj_key)
+ G.add_node(
+ node_for_adding=basename, ETag=obj_etag, Size=obj_size, Key=obj_key
+ ) # TODO: add parent hash
+ split_path = os.path.normpath(obj_key).split(os.path.sep)
+ # split_path: ['data', 'raw', 'Henry_B._Bigelow', 'HB0707', 'EK60', 'D20070712-T004447.raw']
+ for previous, current in zip(split_path, split_path[1:]):
+ if not G.has_edge(previous, current):
+ G.add_edge(previous, current)
+ # print(G)
+ etag_set = frozenset(
+ [k for j, k in list(G.nodes.data("ETag")) if k is not None]
+ )
+ new_hash = sha256(str(etag_set.__hash__()).encode("utf-8")).hexdigest()
+ total_size = [k for j, k in list(G.nodes.data("Size")) if k is not None]
+ print(np.sum(total_size)) # 22.24 Terabytes in Henry_B._Bigelow cruises
+ print(" ")
+ print(new_hash)
+ return new_hash
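A minimal sketch of the fingerprinting idea behind build_merkle_tree above: fold per-object ETags into a single cruise-level digest. The ETag values here are placeholders; sorting them before hashing keeps the digest independent of listing order and of Python's per-process hash randomization:

    from hashlib import sha256

    # Placeholder ETags; in the real method these come from the S3 listing.
    etags = [
        "9a0364b9e99bb480dd25e1f0284c8555",
        "d41d8cd98f00b204e9800998ecf8427e",
    ]
    cruise_fingerprint = sha256("".join(sorted(etags)).encode("utf-8")).hexdigest()
    print(cruise_fingerprint)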
@@ -1,17 +1,16 @@
+ import importlib.metadata
+
  import numcodecs
  import numpy as np
  import xarray as xr
  import zarr
- import importlib.metadata
  from numcodecs import Blosc

  from water_column_sonar_processing.aws import S3FSManager
- from water_column_sonar_processing.utility import Constants
- from water_column_sonar_processing.utility import Timestamp
- from water_column_sonar_processing.utility import Coordinates
+ from water_column_sonar_processing.utility import Constants, Coordinates, Timestamp

- numcodecs.blosc.use_threads = False
- numcodecs.blosc.set_nthreads(1)
+ # numcodecs.blosc.use_threads = False
+ # numcodecs.blosc.set_nthreads(1)


  # TODO: when ready switch to version 3 of model spec
@@ -36,20 +35,22 @@ class ZarrManager:
  self,
  min_echo_range: float = 1.0, # minimum depth measured (zero non-inclusive) from whole cruise
  max_echo_range: float = 100.0, # maximum depth measured from whole cruise
+ cruise_min_epsilon: float = 0.25, # resolution between subsequent measurements
  ):
  # Gets the set of depth values that will be used when resampling and
  # regridding the data to a cruise level model store.
  # Note: returned values do not start at zero.
+ # For more info see here: https://echopype.readthedocs.io/en/stable/data-proc-additional.html
  print("Getting depth values.")
  all_cruise_depth_values = np.linspace(
  start=min_echo_range,
  stop=max_echo_range,
- num=int(max_echo_range / min_echo_range) + 1,
+ num=int((max_echo_range - min_echo_range) / cruise_min_epsilon) + 1,
  endpoint=True,
- )
+ ) # np.arange(min_echo_range, max_echo_range, step=min_echo_range) # this is worse

  if np.any(np.isnan(all_cruise_depth_values)):
- raise Exception('Problem depth values returned were NaN.')
+ raise Exception("Problem depth values returned were NaN.")

  print("Done getting depth values.")
  return all_cruise_depth_values.round(decimals=2)
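A quick worked check of the new depth-grid sizing, using the defaults shown above (min_echo_range=1.0, max_echo_range=100.0, cruise_min_epsilon=0.25):

    import numpy as np

    min_echo_range, max_echo_range, cruise_min_epsilon = 1.0, 100.0, 0.25
    depths = np.linspace(
        start=min_echo_range,
        stop=max_echo_range,
        num=int((max_echo_range - min_echo_range) / cruise_min_epsilon) + 1,
        endpoint=True,
    ).round(decimals=2)
    print(depths[:4])   # [1.   1.25 1.5  1.75]
    print(depths.size)  # 397 depth bins; the previous num=int(max/min) + 1 formula gave 101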
@@ -57,7 +58,7 @@ class ZarrManager:
  #######################################################
  def create_zarr_store(
  self,
- path: str, # 'level_2/Henry_B._Bigelow/HB0707/EK60/HB0707.model/tmp/HB0707.zarr/.zattrs'
+ path: str, # 'level_2/Henry_B._Bigelow/HB0707/EK60/HB0707.model/tmp/HB0707.zarr/.zattrs'
  ship_name: str,
  cruise_name: str,
  sensor_name: str,
@@ -65,6 +66,7 @@ class ZarrManager:
  width: int, # TODO: needs better name... "ping_time"
  min_echo_range: float, # smallest resolution in meters
  max_echo_range: float,
+ cruise_min_epsilon: float,
  calibration_status: bool = False, # Assume uncalibrated
  ) -> str:
  print(
@@ -105,7 +107,9 @@ class ZarrManager:
  #####################################################################
  # --- Coordinate: Depth --- #
  depth_values = self.get_depth_values(
- min_echo_range=min_echo_range, max_echo_range=max_echo_range
+ min_echo_range=min_echo_range,
+ max_echo_range=max_echo_range,
+ cruise_min_epsilon=cruise_min_epsilon,
  )

  root.create_dataset(
@@ -123,7 +127,7 @@ class ZarrManager:
  )

  if np.any(np.isnan(depth_values)):
- raise Exception('Some depth values returned were NaN.')
+ raise Exception("Some depth values returned were NaN.")

  root.depth.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.DEPTH.value]

@@ -171,7 +175,9 @@ class ZarrManager:

  root.longitude.attrs["units"] = Coordinates.LONGITUDE_UNITS.value
  root.longitude.attrs["long_name"] = Coordinates.LONGITUDE_LONG_NAME.value
- root.longitude.attrs["standard_name"] = Coordinates.LONGITUDE_STANDARD_NAME.value
+ root.longitude.attrs["standard_name"] = (
+ Coordinates.LONGITUDE_STANDARD_NAME.value
+ )

  #####################################################################
  # TODO: verify adding this variable for where the bottom was detected
@@ -224,7 +230,11 @@ class ZarrManager:
  name=Coordinates.SV.value,
  shape=(len(depth_values), width, len(frequencies)),
  # chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, len(frequencies)),
- chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, 1), # 256x256x1 <- speed up for alex
+ chunks=(
+ Constants.TILE_SIZE.value,
+ Constants.TILE_SIZE.value,
+ 1,
+ ), # 256x256x1 <- speed up for alex
  dtype=np.dtype(
  Coordinates.SV_DTYPE.value
  ), # TODO: try to experiment with 'float16'
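The 256x256x1 chunking above stores one frequency per chunk so depth-by-time tiles can be fetched independently. A standalone sketch of the same layout with made-up dimensions (the real store uses the cruise's depth/ping/frequency counts and Blosc compression):

    import numpy as np
    import zarr

    root = zarr.open("example.zarr", mode="w")
    sv = root.create_dataset(
        "Sv",
        shape=(1000, 5000, 4),  # (depth, ping_time, frequency), illustrative sizes
        chunks=(256, 256, 1),   # one frequency per chunk, 256x256 depth-by-time tiles
        dtype="float32",
        fill_value=np.nan,
    )
    print(sv.chunks)  # (256, 256, 1)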
@@ -251,7 +261,9 @@ class ZarrManager:
  #
  root.attrs["processing_software_name"] = Coordinates.PROJECT_NAME.value

- current_project_version = importlib.metadata.version('water_column_sonar_processing')
+ current_project_version = importlib.metadata.version(
+ "water_column_sonar_processing"
+ )
  root.attrs["processing_software_version"] = current_project_version
  root.attrs["processing_software_time"] = Timestamp.get_timestamp()
  #
@@ -317,16 +329,14 @@ class ZarrManager:
  input_bucket_name: str,
  endpoint_url=None,
  ) -> xr.Dataset:
- print("Opening L1 Zarr store in S3 with Xarray.") # TODO: Is this only used for reading from?
+ print(
+ "Opening L1 Zarr store in S3 with Xarray."
+ ) # TODO: Is this only used for reading from?
  try:
  zarr_path = f"s3://{input_bucket_name}/level_1/{ship_name}/{cruise_name}/{sensor_name}/{file_name_stem}.zarr"
  s3fs_manager = S3FSManager(endpoint_url=endpoint_url)
  store_s3_map = s3fs_manager.s3_map(s3_zarr_store_path=zarr_path)
- ds = xr.open_dataset(
- filename_or_obj=store_s3_map,
- engine="zarr",
- chunks={}
- )
+ ds = xr.open_dataset(filename_or_obj=store_s3_map, engine="zarr", chunks={})
  except Exception as err:
  print("Problem opening Zarr store in S3 as Xarray.")
  raise err
@@ -353,6 +363,7 @@ class ZarrManager:
  raise err
  print("Done opening Zarr store in S3 as Xarray.")
  return ds
+
  ############################################################################

  #######################################################
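The open_dataset call above reaches the Level-1 store through the package's S3FSManager wrapper; the equivalent direct s3fs + xarray form, sketched here with an illustrative bucket and file stem, looks like this:

    import s3fs
    import xarray as xr

    # Illustrative bucket/path; assumes the store is readable anonymously.
    s3 = s3fs.S3FileSystem(anon=True)
    store = s3fs.S3Map(
        root="s3://example-bucket/level_1/Henry_B._Bigelow/HB0707/EK60/D20070712-T004447.zarr",
        s3=s3,
    )
    ds = xr.open_dataset(filename_or_obj=store, engine="zarr", chunks={})
    print(ds)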
@@ -3,10 +3,12 @@ import os

  import numpy as np

- from water_column_sonar_processing.aws import DynamoDBManager
- from water_column_sonar_processing.aws import S3Manager
- from water_column_sonar_processing.aws import S3FSManager
- from water_column_sonar_processing.aws import SNSManager
+ from water_column_sonar_processing.aws import (
+ DynamoDBManager,
+ S3FSManager,
+ S3Manager,
+ SNSManager,
+ )


  ###########################################################
@@ -23,9 +25,9 @@ class Process:
  # self.output_bucket_secret_access_key = ?

  def execute(self):
- input_s3_manager = (
- S3Manager()
- ) # TODO: Need to allow passing in of credentials when writing to protected bucket
+ # input_s3_manager = (
+ # S3Manager()
+ # ) # TODO: Need to allow passing in of credentials when writing to protected bucket
  s3fs_manager = S3FSManager() # TODO: delete this
  print(s3fs_manager) # TODO: delete this
  output_s3_manager = S3Manager()
@@ -76,8 +78,8 @@ class Process:
  "#SE": "SENSOR_NAME",
  "#SH": "SHIP_NAME",
  "#ST": "START_TIME",
- "#ZB": "ZARR_BUCKET",
- "#ZP": "ZARR_PATH",
+ # "#ZB": "ZARR_BUCKET",
+ # "#ZP": "ZARR_PATH",
  },
  expression_attribute_values={
  ":ch": {"L": [{"S": i} for i in test_channels]},
@@ -92,10 +94,10 @@ class Process:
  ":se": {"S": sensor_name},
  ":sh": {"S": ship_name},
  ":st": {"S": "2006-04-06T11:34:07.288Z"},
- ":zb": {"S": "r2d2-dev-echofish2-118234403147-echofish-dev-output"},
- ":zp": {
- "S": "level_1/David_Starr_Jordan/DS0604/EK60/DSJ0604-D20060406-T113407.model"
- },
+ # ":zb": {"S": "r2d2-dev-echofish2-118234403147-echofish-dev-output"},
+ # ":zp": {
+ # "S": "level_1/David_Starr_Jordan/DS0604/EK60/DSJ0604-D20060406-T113407.model"
+ # },
  },
  update_expression=(
  "SET "
@@ -1,5 +1,5 @@
  # from .cruise_sampler import CruiseSampler
- from .raw_to_zarr import RawToZarr
  from .batch_downloader import BatchDownloader
+ from .raw_to_zarr import RawToZarr

- __all__ = ["RawToZarr", "BatchDownloader"]
+ __all__ = ["RawToZarr", "BatchDownloader"]