water-column-sonar-processing 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of water-column-sonar-processing might be problematic.
- water_column_sonar_processing/aws/dynamodb_manager.py +15 -11
- water_column_sonar_processing/aws/s3_manager.py +39 -13
- water_column_sonar_processing/aws/s3fs_manager.py +42 -35
- water_column_sonar_processing/cruise/create_empty_zarr_store.py +15 -8
- water_column_sonar_processing/cruise/experiment_datatree.py +13 -0
- water_column_sonar_processing/cruise/resample_regrid.py +29 -18
- water_column_sonar_processing/geometry/geometry_manager.py +10 -4
- water_column_sonar_processing/model/zarr_manager.py +33 -10
- water_column_sonar_processing/processing/raw_to_zarr.py +45 -20
- {water_column_sonar_processing-0.0.10.dist-info → water_column_sonar_processing-0.0.12.dist-info}/METADATA +18 -16
- {water_column_sonar_processing-0.0.10.dist-info → water_column_sonar_processing-0.0.12.dist-info}/RECORD +14 -13
- {water_column_sonar_processing-0.0.10.dist-info → water_column_sonar_processing-0.0.12.dist-info}/LICENSE +0 -0
- {water_column_sonar_processing-0.0.10.dist-info → water_column_sonar_processing-0.0.12.dist-info}/WHEEL +0 -0
- {water_column_sonar_processing-0.0.10.dist-info → water_column_sonar_processing-0.0.12.dist-info}/top_level.txt +0 -0
water_column_sonar_processing/aws/dynamodb_manager.py
@@ -111,17 +111,21 @@ class DynamoDBManager:
         expression_attribute_names,
         expression_attribute_values,
         update_expression,
-    ):
-
-
-
-
-
-
-
-
-
-
+    ):  # TODO: convert to boolean
+        try:
+            response = self.__dynamodb_client.update_item(
+                TableName=table_name,
+                Key=key,
+                ExpressionAttributeNames=expression_attribute_names,
+                ExpressionAttributeValues=expression_attribute_values,
+                UpdateExpression=update_expression,
+            )
+            status_code = response["ResponseMetadata"]["HTTPStatusCode"]
+            # print(f"HTTPStatusCode: {status_code}")
+            # assert status_code == 200, "Problem, unable to update dynamodb table."
+            # assert response['ConsumedCapacity']['TableName'] == table_name
+        except Exception as err:
+            print(f"Problem was encountered while updating item: {err}")
 
     #####################################################################
     # TODO: change to "get_cruise_as_df"
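Reviewer note: the rewritten `update_item` simply wraps boto3's `update_item` call in a try/except. A minimal sketch of the argument shapes it forwards; the table name, key, and attribute values below are illustrative, not taken from the package.

```python
# Illustrative only: the arguments DynamoDBManager.update_item passes through
# to boto3's dynamodb client.update_item (table/key/values are hypothetical).
import boto3

dynamodb_client = boto3.client(service_name="dynamodb", region_name="us-east-1")

response = dynamodb_client.update_item(
    TableName="example-cruise-table",                     # hypothetical table
    Key={
        "FILE_NAME": {"S": "D20070711-T182032.raw"},      # hypothetical key attributes
        "CRUISE_NAME": {"S": "HB0707"},
    },
    ExpressionAttributeNames={"#PS": "PIPELINE_STATUS"},
    ExpressionAttributeValues={":ps": {"S": "LEVEL_1_PROCESSING"}},
    UpdateExpression="SET #PS = :ps",
)
print(response["ResponseMetadata"]["HTTPStatusCode"])  # 200 on success
```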
water_column_sonar_processing/aws/s3_manager.py
@@ -1,6 +1,7 @@
 import json
 import os
 import boto3
+from typing import Optional
 from collections.abc import Generator
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
@@ -29,9 +30,11 @@ class S3Manager:
     #####################################################################
     def __init__(
         self,
+        endpoint_url: Optional[str] = None,
     ):
-        self.
-        self.
+        self.endpoint_url = endpoint_url
+        # self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
+        # self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
         self.s3_region = os.environ.get("AWS_REGION", default="us-east-1")
         self.s3_client_config = Config(max_pool_connections=MAX_POOL_CONNECTIONS)
         self.s3_transfer_config = TransferConfig(
@@ -49,6 +52,7 @@ class S3Manager:
             service_name="s3",
             config=self.s3_client_config,
             region_name=self.s3_region,
+            endpoint_url=self.endpoint_url,
         )
         self.s3_resource = boto3.resource(
             service_name="s3",
@@ -64,11 +68,13 @@ class S3Manager:
             service_name="s3",
             config=self.s3_client_config,
             region_name=self.s3_region,
+            endpoint_url=self.endpoint_url,
         )
         self.s3_resource_noaa_wcsd_zarr_pds = self.s3_session_noaa_wcsd_zarr_pds.resource(
             service_name="s3",
             config=self.s3_client_config,
             region_name=self.s3_region,
+            endpoint_url=self.endpoint_url,
         )
         self.paginator = self.s3_client.get_paginator('list_objects_v2')
         self.paginator_noaa_wcsd_zarr_pds = self.s3_client_noaa_wcsd_zarr_pds.get_paginator('list_objects_v2')
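The new optional `endpoint_url` argument is threaded through every client, resource, and session the manager builds, so the whole class can be aimed at a local test endpoint (for example a moto server) instead of AWS. A hedged sketch, assuming a moto server is listening locally:

```python
# Hypothetical test wiring; the endpoint address is assumed, not part of the package.
from water_column_sonar_processing.aws import S3Manager

test_s3_manager = S3Manager(endpoint_url="http://127.0.0.1:5000")  # e.g. a local moto server
print(test_s3_manager.list_buckets())

# Production use keeps the default (None) so boto3 resolves the real AWS endpoint:
prod_s3_manager = S3Manager()
```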
@@ -85,13 +91,31 @@ class S3Manager:
         self,
         bucket_name: str,
     ):
-
+        """
+        Note: this function is only really meant to be used for creating test
+        buckets. It allows public read of all objects.
+        """
+        # https://github.com/aodn/aodn_cloud_optimised/blob/e5035495e782783cc8b9e58711d63ed466420350/test_aodn_cloud_optimised/test_schema.py#L7
+        public_policy = {
+            "Version": "2012-10-17",
+            "Statement": [
+                {
+                    "Effect": "Allow",
+                    "Principal": "*",
+                    "Action": "s3:GetObject",
+                    "Resource": f"arn:aws:s3:::{bucket_name}/*",
+                }
+            ],
+        }
+        response1 = self.s3_client.create_bucket(
             Bucket=bucket_name,
-
-            #
-            # TODO: if region is us-east-1, don't include this line somehow
-            # CreateBucketConfiguration={'LocationConstraint': self.__s3_region}
+            ACL='public-read'
         )
+        print(response1)
+        # response = self.s3_client.put_bucket_policy(
+        #     Bucket=bucket_name, Policy=json.dumps(public_policy)
+        # )
+        # print(response)
 
     #####################################################################
     def list_buckets(self):
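The ACL-based bucket creation above leaves a `put_bucket_policy` alternative commented out. For reference, a hedged sketch of applying that same public-read policy with plain boto3; the bucket name and endpoint are illustrative:

```python
import json
import boto3

s3_client = boto3.client("s3", endpoint_url="http://127.0.0.1:5000")  # assumed local test endpoint
bucket_name = "example-test-bucket"                                    # hypothetical bucket

public_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": "*",
            "Action": "s3:GetObject",
            "Resource": f"arn:aws:s3:::{bucket_name}/*",
        }
    ],
}

s3_client.create_bucket(Bucket=bucket_name)
s3_client.put_bucket_policy(Bucket=bucket_name, Policy=json.dumps(public_policy))
```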
@@ -156,6 +180,7 @@ class S3Manager:
         self,
         local_directory,
         remote_directory,
+        output_bucket_name,
     ):
         # Right now this is just for uploading a zarr store to s3
         print("Uploading files to output bucket.")
@@ -173,7 +198,7 @@ class S3Manager:
                 all_files.append([local_path, s3_key])
 
         all_uploads = self.upload_files_with_thread_pool_executor(
-            output_bucket_name=
+            output_bucket_name=output_bucket_name,
             all_files=all_files,
         )
         print("Done uploading files to output bucket.")
@@ -228,8 +253,8 @@ class S3Manager:
     # ):
     #     # Returns a list of key strings for each object in bucket defined by prefix
     #     keys = []
-    #     page_iterator = self.paginator_noaa_wcsd_zarr_pds.paginate(Bucket=
-    #     for page in paginator.paginate(Bucket=
+    #     page_iterator = self.paginator_noaa_wcsd_zarr_pds.paginate(Bucket=output_bucket_name, Prefix=prefix):
+    #     for page in paginator.paginate(Bucket=output_bucket_name, Prefix=prefix):
     #         if "Contents" in page.keys():
     #             keys.extend([k["Key"] for k in page["Contents"]])
     #     return keys
@@ -371,7 +396,6 @@ class S3Manager:
             print(f"Problem was encountered while deleting objects: {err}")
 
     #####################################################################
-    # not used TODO: remove
     def put(self, bucket_name, key, body):  # noaa-wcsd-zarr-pds
         self.s3_client.put_object(Bucket=bucket_name, Key=key, Body=body)  # "Body" can be a file
 
@@ -382,10 +406,12 @@ class S3Manager:
         cruise_name,
         sensor_name,
         file_name_stem,
+        output_bucket_name,
     ) -> str:
         try:
-
-
+            resource = self.s3_resource_noaa_wcsd_zarr_pds
+            content_object = resource.Object(
+                bucket_name=output_bucket_name,
                 key=f"spatial/geojson/{ship_name}/{cruise_name}/{sensor_name}/{file_name_stem}.json",
             ).get()
             file_content = content_object["Body"].read().decode("utf-8")
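`read_s3_json` now takes the bucket explicitly and reads `spatial/geojson/<ship>/<cruise>/<sensor>/<stem>.json` through the boto3 resource. A hedged standalone sketch of that read; the file stem is illustrative and public access to the NODD bucket is assumed:

```python
import json
import boto3

s3_resource = boto3.resource("s3")

content_object = s3_resource.Object(
    bucket_name="noaa-wcsd-zarr-pds",  # NODD output bucket
    key="spatial/geojson/Henry_B._Bigelow/HB0707/EK60/D20070711-T182032.json",  # illustrative stem
).get()

geo_json = json.loads(content_object["Body"].read().decode("utf-8"))
print(geo_json.get("type"))  # a GeoJSON FeatureCollection is expected
```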
water_column_sonar_processing/aws/s3fs_manager.py
@@ -1,5 +1,5 @@
 import os
-
+from typing import Optional
 import s3fs
 
 # TODO: S3FS_LOGGING_LEVEL=DEBUG
@@ -9,37 +9,25 @@ class S3FSManager:
     #####################################################################
     def __init__(
         self,
+        endpoint_url: Optional[str] = None,
     ):
-        self.
+        self.endpoint_url = endpoint_url
+        # self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
+        # self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
+        self.s3_region = os.environ.get("AWS_REGION", default="us-east-1")
         self.s3fs = s3fs.S3FileSystem(
+            endpoint_url=endpoint_url,
             key=os.environ.get("OUTPUT_BUCKET_ACCESS_KEY"),
             secret=os.environ.get("OUTPUT_BUCKET_SECRET_ACCESS_KEY"),
-            # asynchronous=True
-            # use_ssl=False,
-            # skip_instance_cache=True,
-            # default_block_size='100MB', # if no specific value is given at all time. The built-in default is 5MB
-            # client_kwargs={
-            #     "region_name": self.__s3_region
-            # }
         )
 
-
-
-
-
-
-
-
-
-        print(ff)
-
-    #####################################################################
-    def upload_data(self, bucket_name, file_path, prefix):
-        # TODO: this works in theory but use boto3 to upload files
-        s3_path = f"s3://{bucket_name}/{prefix}/"
-        s3_file_system = self.s3fs
-        s3_file_system.put(file_path, s3_path, recursive=True)
-
+    # s3_fs = s3fs.S3FileSystem( # TODO: use s3fs_manager?
+    #     anon=True,
+    #     client_kwargs={
+    #         "endpoint_url": moto_server,
+    #         "region_name": "us-east-1",
+    #     },
+    # )
     #####################################################################
     def s3_map(
         self,
@@ -49,20 +37,39 @@ class S3FSManager:
         # create=False, not false because will be writing
         # return s3fs.S3Map(root=s3_zarr_store_path, s3=self.s3fs, check=True)
         return s3fs.S3Map(
-            root=s3_zarr_store_path,
+            root=s3_zarr_store_path,
+            s3=self.s3fs
         )  # create=False, not false because will be writing
 
+    #####################################################################
+    # def add_file(self, filename):
+    #     full_path = f"{os.getenv('OUTPUT_BUCKET_NAME')}/testing/{filename}"
+    #     print(full_path)
+    #
+    #     self.s3fs.touch(full_path)
+    #     ff = self.s3fs.ls(f"{os.getenv('OUTPUT_BUCKET_NAME')}/")
+    #
+    #     print(ff)
+
+    #####################################################################
+    def upload_data(
+        self,
+        bucket_name,
+        file_path,
+        prefix
+    ):
+        # TODO: this works in theory but use boto3 to upload files
+        s3_path = f"s3://{bucket_name}/{prefix}/"
+        s3_file_system = self.s3fs
+        s3_file_system.put(file_path, s3_path, recursive=True)
+
     #####################################################################
     def exists(
         self,
-
+        s3_path,
     ):
-        s3_file_system =
-        return
+        # s3_file_system =
+        return self.s3fs.exists(s3_path)
+
 
     #####################################################################
-    # def put(
-    #     self
-    # ):
-    #     s3_file_system = self.s3fs
-    #     return
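S3FSManager gains the same optional `endpoint_url`, passed straight to `s3fs.S3FileSystem`, and `s3_map` now returns an `S3Map` bound to that filesystem. A rough usage sketch; the endpoint, credentials, and store path are all assumed:

```python
import s3fs
import zarr

# Assumed local endpoint and credentials (e.g. a moto server in a test).
fs = s3fs.S3FileSystem(
    endpoint_url="http://127.0.0.1:5000",
    key="test-access-key",
    secret="test-secret-key",
)

# Hypothetical cruise-level store path following the package's level_2 layout.
store = s3fs.S3Map(root="example-bucket/level_2/Ship_Name/CRUISE/EK60/CRUISE.zarr", s3=fs)
cruise_zarr = zarr.open(store=store, mode="r+")  # read/write, as the package does for cruise stores
print(cruise_zarr.tree())
```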
water_column_sonar_processing/cruise/create_empty_zarr_store.py
@@ -24,14 +24,14 @@ class CreateEmptyZarrStore:
         self,
     ):
         self.__overwrite = True
-        self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
-        self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
+        # self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
+        # self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
 
     #######################################################
-
     # TODO: move this to the s3_manager
     def upload_zarr_store_to_s3(
         self,
+        output_bucket_name: str,
         local_directory: str,
         object_prefix: str,
         cruise_name: str,
@@ -43,24 +43,28 @@ class CreateEmptyZarrStore:
         # # 'all_files' is passed a list of lists: [[local_path, s3_key], [...], ...]
         all_files = []
         for subdir, dirs, files in os.walk(
-            f"{local_directory}/{cruise_name}.
+            f"{local_directory}/{cruise_name}.zarr"
         ):
             for file in files:
                 local_path = os.path.join(subdir, file)
-                #
-
+                # TODO: find a better method for splitting strings here:
+                # 'level_2/Henry_B._Bigelow/HB0806/EK60/HB0806.zarr/.zattrs'
+                s3_key = f'{object_prefix}/{cruise_name}.zarr{local_path.split(f"{cruise_name}.zarr")[-1]}'
                 all_files.append([local_path, s3_key])
         #
         # print(all_files)
         s3_manager.upload_files_with_thread_pool_executor(
+            output_bucket_name=output_bucket_name,
             all_files=all_files,
         )
         print("Done uploading with thread pool executor.")
         # TODO: move to common place
 
     #######################################################
+    # @classmethod
     def create_cruise_level_zarr_store(
         self,
+        output_bucket_name: str,
         ship_name: str,
         cruise_name: str,
         sensor_name: str,
@@ -116,17 +120,18 @@ class CreateEmptyZarrStore:
             new_width = int(consolidated_zarr_width)
             print(f"new_width: {new_width}")
             #################################################################
-            store_name = f"{cruise_name}.
+            store_name = f"{cruise_name}.zarr"
             print(store_name)
             ################################################################
             # Delete existing zarr store if it exists
             zarr_prefix = os.path.join("level_2", ship_name, cruise_name, sensor_name)
             child_objects = s3_manager.get_child_objects(
-                bucket_name=
+                bucket_name=output_bucket_name,
                 sub_prefix=zarr_prefix,
             )
             if len(child_objects) > 0:
                 s3_manager.delete_nodd_objects(
+                    bucket_name=output_bucket_name,
                     objects=child_objects,
                 )
             ################################################################
@@ -153,6 +158,7 @@ class CreateEmptyZarrStore:
             )
             #################################################################
             self.upload_zarr_store_to_s3(
+                output_bucket_name=output_bucket_name,
                 local_directory=tempdir,
                 object_prefix=zarr_prefix,
                 cruise_name=cruise_name,
@@ -174,6 +180,7 @@ class CreateEmptyZarrStore:
             #################################################################
             # Success
             # TODO: update enum in dynamodb
+            print("Done creating cruise level zarr store.")
             #################################################################
         except Exception as err:
             print(f"Problem trying to create new cruise zarr store: {err}")
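To make the key-splitting TODO above concrete: everything after `<cruise>.zarr` in the walked local path is re-rooted under the object prefix. A small worked example with made-up paths:

```python
# Illustrative values only.
cruise_name = "HB0806"
object_prefix = "level_2/Henry_B._Bigelow/HB0806/EK60"
local_path = "/tmp/tmp_abc123/HB0806.zarr/Sv/.zarray"  # hypothetical file found by os.walk

s3_key = f'{object_prefix}/{cruise_name}.zarr{local_path.split(f"{cruise_name}.zarr")[-1]}'
print(s3_key)  # level_2/Henry_B._Bigelow/HB0806/EK60/HB0806.zarr/Sv/.zarray
```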
water_column_sonar_processing/cruise/resample_regrid.py
@@ -26,8 +26,8 @@ class ResampleRegrid:
         self,
     ):
         self.__overwrite = True
-        self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
-        self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
+        # self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
+        # self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
         self.dtype = "float32"
 
     #################################################################
@@ -144,6 +144,10 @@ class ResampleRegrid:
         cruise_name,
         sensor_name,
         table_name,
+        # TODO: file_name?,
+        bucket_name,  # TODO: this is the same bucket
+        override_select_files=None,
+        endpoint_url=None
     ) -> None:
         """
         The goal here is to interpolate the data against the depth values already populated
@@ -151,17 +155,17 @@
         read/write operations. We open the file-level store with Xarray to leverage tools for
         resampling and subsetting the data.
         """
-        print("Interpolating data.")
+        print("Resample Regrid, Interpolating data.")
         try:
             zarr_manager = ZarrManager()
-            # s3_manager = S3Manager()
             geo_manager = GeometryManager()
-
+
             output_zarr_store = zarr_manager.open_s3_zarr_store_with_zarr(
                 ship_name=ship_name,
                 cruise_name=cruise_name,
                 sensor_name=sensor_name,
-
+                output_bucket_name=bucket_name,
+                endpoint_url=endpoint_url,
             )
 
             # get dynamo stuff
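The docstring describes regridding each file-level store onto the cruise-level depth axis. The package's own `interpolate_data` is not shown in this hunk; the snippet below is only a hedged illustration of that style of depth interpolation with xarray:

```python
# Illustration only, not the package's interpolate_data.
import numpy as np
import xarray as xr

native_depths = np.array([0.0, 0.5, 1.0, 1.5, 2.0])      # per-file echo_range bins (assumed)
all_cruise_depth_values = np.arange(0.0, 2.01, 0.25)      # cruise-level depth grid (assumed)

sv = xr.DataArray(
    np.random.default_rng(0).normal(-70.0, 5.0, size=(5, 3)),
    dims=("depth", "ping_time"),
    coords={"depth": native_depths},
)

regridded = sv.interp(depth=all_cruise_depth_values)      # linear interpolation onto the new grid
print(regridded.shape)  # (9, 3)
```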
@@ -175,8 +179,12 @@
 
             #########################################################
             #########################################################
-            # TODO: iterate files here
             all_file_names = cruise_df["FILE_NAME"]
+
+            if override_select_files is not None:
+                all_file_names = override_select_files
+
+            # Iterate files
             for file_name in all_file_names:
                 gc.collect()
                 file_name_stem = Path(file_name).stem
@@ -200,6 +208,8 @@
                     cruise_name=cruise_name,
                     sensor_name=sensor_name,
                     file_name_stem=file_name_stem,
+                    input_bucket_name=bucket_name,
+                    endpoint_url=endpoint_url,
                 )
                 #########################################################################
                 # [3] Get needed indices
@@ -225,11 +235,11 @@
                         :, start_ping_time_index:end_ping_time_index, :
                     ].shape
                 )
-                cruise_sv_subset[:, :, :] = np.nan
+                cruise_sv_subset[:, :, :] = np.nan
 
                 all_cruise_depth_values = zarr_manager.get_depth_values(
                     min_echo_range=min_echo_range, max_echo_range=max_echo_range
-                )
+                )  # (5262,) and
 
                 print(" ".join(list(input_xr_zarr_store.Sv.dims)))
                 if set(input_xr_zarr_store.Sv.dims) != {
@@ -239,13 +249,14 @@
                 }:
                     raise Exception("Xarray dimensions are not as expected.")
 
-                # get geojson
                 indices, geospatial = geo_manager.read_s3_geo_json(
                     ship_name=ship_name,
                     cruise_name=cruise_name,
                     sensor_name=sensor_name,
                     file_name_stem=file_name_stem,
                     input_xr_zarr_store=input_xr_zarr_store,
+                    endpoint_url=endpoint_url,
+                    output_bucket_name=bucket_name,
                 )
 
                 input_xr = input_xr_zarr_store.isel(ping_time=indices)
@@ -261,22 +272,18 @@
                 )
 
                 # --- UPDATING --- #
-
                 regrid_resample = self.interpolate_data(
                     input_xr=input_xr,
                     ping_times=ping_times,
                     all_cruise_depth_values=all_cruise_depth_values,
                 )
 
-                print(
-                    f"start_ping_time_index: {start_ping_time_index}, end_ping_time_index: {end_ping_time_index}"
-                )
-
+                print(f"start_ping_time_index: {start_ping_time_index}, end_ping_time_index: {end_ping_time_index}")
                 #########################################################################
                 # write Sv values to cruise-level-zarr-store
                 for channel in range(
                     len(input_xr.channel.values)
-                ):  #
+                ):  # does not like being written in one fell swoop :(
                     output_zarr_store.Sv[
                         :, start_ping_time_index:end_ping_time_index, channel
                     ] = regrid_resample[:, :, channel]
@@ -285,14 +292,18 @@
                 # [5] write subset of latitude/longitude
                 output_zarr_store.latitude[
                     start_ping_time_index:end_ping_time_index
-                ] = geospatial.dropna()["latitude"].values
+                ] = geospatial.dropna()["latitude"].values  # TODO: get from ds_sv directly, dont need geojson anymore
                 output_zarr_store.longitude[
                     start_ping_time_index:end_ping_time_index
                 ] = geospatial.dropna()["longitude"].values
         except Exception as err:
             print(f"Problem interpolating the data: {err}")
             raise err
-
+        # else:
+        #     pass
+        finally:
+            print("Done interpolating data.")
+            # TODO: read across times and verify data was written?
 
     #######################################################
 
water_column_sonar_processing/geometry/geometry_manager.py
@@ -38,6 +38,7 @@ class GeometryManager:
         cruise_name,
         sensor_name,
         file_name,
+        endpoint_url=None,
         write_geojson=True,
     ) -> tuple:
         file_name_stem = Path(file_name).stem
@@ -61,7 +62,7 @@
         time1 = echodata.environment.time1.values
 
         if len(nmea_times) < len(time1):
-            raise Exception(
+            raise Exception(  # TODO: explore this logic further...
                 "Problem: Not enough NMEA times available to extrapolate time1."
             )
 
@@ -137,7 +138,7 @@
         )
 
         print("Checking s3 and deleting any existing GeoJSON file.")
-        s3_manager = S3Manager()
+        s3_manager = S3Manager(endpoint_url=endpoint_url)
         geojson_object_exists = s3_manager.check_if_object_exists(
             bucket_name=output_bucket_name,
             key_name=f"{geo_json_prefix}/{geo_json_name}"
@@ -180,7 +181,8 @@
             raise
         # Note: returned lat/lon values can include np.nan because they need to be aligned with
         # the Sv data! GeoJSON needs simplification but has been filtered.
-        return gps_df.index.values, gps_df.latitude.values, gps_df.longitude.values
+        # return gps_df.index.values, gps_df.latitude.values, gps_df.longitude.values
+        return gps_df.index.values, lat, lon
         # TODO: if geojson is already returned with 0,0, the return here
         # can include np.nan values?
 
@@ -192,14 +194,18 @@
         sensor_name,
         file_name_stem,
         input_xr_zarr_store,
+        endpoint_url,
+        output_bucket_name,
     ):
         try:
-            s3_manager = S3Manager()
+            s3_manager = S3Manager(endpoint_url=endpoint_url)
             geo_json = s3_manager.read_s3_json(
                 ship_name=ship_name,
                 cruise_name=cruise_name,
                 sensor_name=sensor_name,
                 file_name_stem=file_name_stem,
+                output_bucket_name=output_bucket_name,
+
             )
             ###
             geospatial = geopandas.GeoDataFrame.from_features(
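`read_s3_geo_json` hands the fetched FeatureCollection to `geopandas.GeoDataFrame.from_features`. A self-contained sketch of that conversion with an inline feature instead of one pulled from S3:

```python
import geopandas

# Minimal inline FeatureCollection standing in for a spatial/geojson/... file.
geo_json = {
    "type": "FeatureCollection",
    "features": [
        {
            "type": "Feature",
            "geometry": {"type": "Point", "coordinates": [-70.5, 41.8]},
            "properties": {"ping_time": "2007-07-11T18:20:32Z"},
        }
    ],
}

geospatial = geopandas.GeoDataFrame.from_features(geo_json["features"])
print(geospatial.geometry.x.values, geospatial.geometry.y.values)  # [-70.5] [41.8]
```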
water_column_sonar_processing/model/zarr_manager.py
@@ -28,8 +28,8 @@ class ZarrManager:
         self.__compressor = Blosc(cname="zstd", clevel=2)  # shuffle=Blosc.NOSHUFFLE
         self.__overwrite = True
         self.__num_threads = numcodecs.blosc.get_nthreads()
-        self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
-        self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
+        # self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
+        # self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
 
     #######################################################
     def get_depth_values(
@@ -54,7 +54,7 @@
     #######################################################
     def create_zarr_store(
         self,
-        path: str,
+        path: str,  # 'level_2/Henry_B._Bigelow/HB0707/EK60/HB0707.model/tmp/HB0707.zarr/.zattrs'
         ship_name: str,
         cruise_name: str,
         sensor_name: str,
@@ -246,7 +246,7 @@
         #
         root.attrs["processing_software_name"] = Coordinates.PROJECT_NAME.value
         root.attrs["processing_software_version"] = (
-            "0.0.
+            "0.0.12"  # TODO: get programmatically, echopype>utils>prov.py
         )
         root.attrs["processing_software_time"] = Timestamp.get_timestamp()
         #
@@ -282,14 +282,16 @@
         ship_name: str,
         cruise_name: str,
         sensor_name: str,
-        # zarr_synchronizer: Union[str, None] = None,
+        # zarr_synchronizer: Union[str, None] = None,  # TODO:
+        output_bucket_name: str,
+        endpoint_url=None,
     ):
         # Mounts a Zarr store using pythons Zarr implementation. The mounted store
         # will have read/write privileges so that store can be updated.
         print("Opening Zarr store with Zarr.")
         try:
-            s3fs_manager = S3FSManager()
-            root = f"{
+            s3fs_manager = S3FSManager(endpoint_url=endpoint_url)
+            root = f"{output_bucket_name}/level_2/{ship_name}/{cruise_name}/{sensor_name}/{cruise_name}.zarr"
             store = s3fs_manager.s3_map(s3_zarr_store_path=root)
             # synchronizer = zarr.ProcessSynchronizer(f"/tmp/{ship_name}_{cruise_name}.sync")
             cruise_zarr = zarr.open(store=store, mode="r+")
@@ -306,11 +308,13 @@
         cruise_name: str,
         sensor_name: str,
         file_name_stem: str,
+        input_bucket_name: str,
+        endpoint_url=None,
     ) -> xr.Dataset:
-        print("Opening Zarr store in S3
+        print("Opening L1 Zarr store in S3 with Xarray.")
         try:
-            zarr_path = f"s3://{
-            s3fs_manager = S3FSManager()
+            zarr_path = f"s3://{input_bucket_name}/level_1/{ship_name}/{cruise_name}/{sensor_name}/{file_name_stem}.zarr"
+            s3fs_manager = S3FSManager(endpoint_url=endpoint_url)
             store_s3_map = s3fs_manager.s3_map(s3_zarr_store_path=zarr_path)
             ds = xr.open_zarr(
                 store=store_s3_map, consolidated=None
@@ -321,6 +325,25 @@
             print("Done opening Zarr store in S3 as Xarray.")
             return ds
 
+    def open_l2_zarr_store_with_xarray(
+        self,
+        ship_name: str,
+        cruise_name: str,
+        sensor_name: str,
+        bucket_name: str,
+        endpoint_url=None,
+    ) -> xr.Dataset:
+        print("Opening L2 Zarr store in S3 with Xarray.")
+        try:
+            zarr_path = f"s3://{bucket_name}/level_2/{ship_name}/{cruise_name}/{sensor_name}/{cruise_name}.zarr"
+            s3fs_manager = S3FSManager(endpoint_url=endpoint_url)
+            store_s3_map = s3fs_manager.s3_map(s3_zarr_store_path=zarr_path)
+            ds = xr.open_zarr(store=store_s3_map, consolidated=None)
+        except Exception as err:
+            print("Problem opening Zarr store in S3 as Xarray.")
+            raise err
+        print("Done opening Zarr store in S3 as Xarray.")
+        return ds
 ############################################################################
 
     #######################################################
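The new `open_l2_zarr_store_with_xarray` mirrors the L1 opener but targets `level_2/<ship>/<cruise>/<sensor>/<cruise>.zarr`. A hedged usage sketch; the import path and argument values are assumptions, and credentials for the bucket are assumed to be configured:

```python
# Assumed import location for ZarrManager; argument values are illustrative.
from water_column_sonar_processing.model import ZarrManager

zarr_manager = ZarrManager()
ds = zarr_manager.open_l2_zarr_store_with_xarray(
    ship_name="Henry_B._Bigelow",
    cruise_name="HB0707",
    sensor_name="EK60",
    bucket_name="noaa-wcsd-zarr-pds",
)
print(ds)  # lazily loaded cruise-level dataset; Sv plus latitude/longitude expected
```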
water_column_sonar_processing/processing/raw_to_zarr.py
@@ -9,7 +9,7 @@ from pathlib import Path  # , PurePath
 
 from water_column_sonar_processing.aws import DynamoDBManager, S3Manager
 from water_column_sonar_processing.geometry import GeometryManager
-from water_column_sonar_processing.utility import Cleaner
+from water_column_sonar_processing.utility import Cleaner, PipelineStatus
 
 TEMPDIR = "/tmp"
 
@@ -53,10 +53,6 @@ class RawToZarr:
     ):
         print('Writing Zarr information to DynamoDB table.')
         dynamodb_manager = DynamoDBManager()
-
-        # The problem is that these values were never populated
-        # and so when the query looks for values that aren't there
-        # they fail
         dynamodb_manager.update_item(
             table_name=table_name,
             key={
@@ -87,7 +83,8 @@ class RawToZarr:
                 ":ma": {"N": str(np.round(max_echo_range, 4))},
                 ":mi": {"N": str(np.round(min_echo_range, 4))},
                 ":nd": {"N": str(num_ping_time_dropna)},
-                ":ps": {"S": "PROCESSING_RESAMPLE_AND_WRITE_TO_ZARR_STORE"},
+                # ":ps": {"S": "PROCESSING_RESAMPLE_AND_WRITE_TO_ZARR_STORE"},
+                ":ps": {"S": PipelineStatus.LEVEL_1_PROCESSING.name},
                 ":pt": {"S": datetime.now().isoformat(timespec="seconds") + "Z"},
                 ":se": {"S": sensor_name},
                 ":sh": {"S": ship_name},
@@ -113,6 +110,7 @@ class RawToZarr:
                 "#ZP = :zp"
             ),
         )
+        print('Done writing Zarr information to DynamoDB table.')
 
     ############################################################################
     ############################################################################
@@ -122,9 +120,10 @@ class RawToZarr:
         output_bucket_name,
         local_directory,
         object_prefix,
+        endpoint_url,
     ):
         # Note: this will be passed credentials if using NODD
-        s3_manager = S3Manager()
+        s3_manager = S3Manager(endpoint_url=endpoint_url)
         print('Uploading files using thread pool executor.')
         all_files = []
         for subdir, dirs, files in os.walk(local_directory):
@@ -143,11 +142,14 @@ class RawToZarr:
     def raw_to_zarr(
         self,
         table_name,
+        input_bucket_name,
         output_bucket_name,
         ship_name,
         cruise_name,
         sensor_name,
         raw_file_name,
+        endpoint_url=None,
+        include_bot=True,
     ):
         """
         Downloads the raw files, processes them with echopype, writes geojson, and uploads files
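A hedged end-to-end sketch of the updated `raw_to_zarr` signature. The import path, table, buckets, and file name are illustrative; `endpoint_url` would only be set when running against a local test endpoint such as moto:

```python
# Assumed import location for RawToZarr; all argument values below are hypothetical.
from water_column_sonar_processing.processing import RawToZarr

raw_to_zarr = RawToZarr()
raw_to_zarr.raw_to_zarr(
    table_name="example-pipeline-table",
    input_bucket_name="example-input-bucket",    # expects data/raw/<ship>/<cruise>/<sensor>/<file>.raw
    output_bucket_name="example-output-bucket",  # receives level_1/<ship>/<cruise>/<sensor>/<stem>.zarr
    ship_name="Henry_B._Bigelow",
    cruise_name="HB0707",
    sensor_name="EK60",
    raw_file_name="D20070711-T182032.raw",
    endpoint_url=None,                           # or e.g. "http://127.0.0.1:5000" for a moto server
    include_bot=True,                            # also download and use the matching .bot bottom file
)
```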
@@ -157,6 +159,16 @@ class RawToZarr:
         geometry_manager = GeometryManager()
         cleaner = Cleaner()
         cleaner.delete_local_files(file_types=["*.zarr", "*.json"])  # TODO: include bot and raw?
+
+        s3_manager = S3Manager(endpoint_url=endpoint_url)
+        s3_file_path = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/{raw_file_name}"
+        bottom_file_name = f"{Path(raw_file_name).stem}.bot"
+        s3_bottom_file_path = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/{bottom_file_name}"
+        s3_manager.download_file(bucket_name=input_bucket_name, key=s3_file_path, file_name=raw_file_name)
+        # TODO: add the bottom file
+        if include_bot:
+            s3_manager.download_file(bucket_name=input_bucket_name, key=s3_bottom_file_path, file_name=bottom_file_name)
+
         try:
             gc.collect()
             print('Opening raw file with echopype.')
@@ -165,14 +177,20 @@ class RawToZarr:
             echodata = ep.open_raw(
                 raw_file=raw_file_name,
                 sonar_model=sensor_name,
-                include_bot=
-                use_swap=True,
-                # max_chunk_size=
+                include_bot=include_bot,
+                # use_swap=True,
+                # max_chunk_size=300,
                 # storage_options={'anon': True } # 'endpoint_url': self.endpoint_url} # this was creating problems
             )
             print('Compute volume backscattering strength (Sv) from raw data.')
             ds_sv = ep.calibrate.compute_Sv(echodata)
-
+            gc.collect()
+            print('Done computing volume backscatter strength (Sv) from raw data.')
+            # Note: detected_seafloor_depth is located at echodata.vendor.detected_seafloor_depth
+            # but is not written out with ds_sv
+            if "detected_seafloor_depth" in list(echodata.vendor.variables):
+                ds_sv["detected_seafloor_depth"] = echodata.vendor.detected_seafloor_depth
+            #
             frequencies = echodata.environment.frequency_nominal.values
             #################################################################
             # Get GPS coordinates
@@ -183,20 +201,21 @@ class RawToZarr:
                 cruise_name=cruise_name,
                 sensor_name=sensor_name,
                 file_name=raw_file_name,
+                endpoint_url=endpoint_url,
                 write_geojson=True
             )
+            ds_sv = ep.consolidate.add_location(ds_sv, echodata)
+            ds_sv.latitude.values = lat  # overwriting echopype gps values to include missing values
+            ds_sv.longitude.values = lon
             # gps_data, lat, lon = self.__get_gps_data(echodata=echodata)
             #################################################################
             # Technically the min_echo_range would be 0 m.
             # TODO: this var name is supposed to represent minimum resolution of depth measurements
             # TODO revert this so that smaller diffs can be used
             # The most minimum the resolution can be is as small as 0.25 meters
-            min_echo_range = np.
-                0.25,
-                np.nanmin(np.diff(ds_sv.echo_range.values))
-            )
+            min_echo_range = np.round(np.nanmin(np.diff(ds_sv.echo_range.values)), 2)
             max_echo_range = float(np.nanmax(ds_sv.echo_range))
-            #
+            # This is the number of missing values found throughout the lat/lon
             num_ping_time_dropna = lat[~np.isnan(lat)].shape[0]  # symmetric to lon
             #
             start_time = np.datetime_as_string(ds_sv.ping_time.values[0], unit='ms') + "Z"
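To make the rewritten `min_echo_range` line concrete: it is the smallest spacing between consecutive echo_range samples, rounded to two decimals, while `max_echo_range` is the deepest sample. A small worked example with made-up values:

```python
import numpy as np

echo_range = np.array([0.0, 0.19, 0.38, 0.57, np.nan])  # illustrative echo_range samples (metres)

min_echo_range = np.round(np.nanmin(np.diff(echo_range)), 2)
max_echo_range = float(np.nanmax(echo_range))

print(min_echo_range)  # 0.19
print(max_echo_range)  # 0.57
```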
@@ -206,12 +225,15 @@
             #################################################################
             # Create the zarr store
             store_name = f"{Path(raw_file_name).stem}.zarr"
-            ds_sv.
+            # Sv = ds_sv.Sv
+            # ds_sv['Sv'] = Sv.astype('int32', copy=False)
+            ds_sv.to_zarr(store=store_name)  # ds_sv.Sv.sel(channel=ds_sv.channel.values[0]).shape
+            gc.collect()
             #################################################################
             output_zarr_prefix = f"level_1/{ship_name}/{cruise_name}/{sensor_name}/"
             #################################################################
             # If zarr store already exists then delete
-            s3_manager = S3Manager()
+            s3_manager = S3Manager(endpoint_url=endpoint_url)
             child_objects = s3_manager.get_child_objects(
                 bucket_name=output_bucket_name,
                 sub_prefix=f"level_1/{ship_name}/{cruise_name}/{sensor_name}/{Path(raw_file_name).stem}.zarr",
@@ -226,7 +248,8 @@
             self.__upload_files_to_output_bucket(
                 output_bucket_name=output_bucket_name,
                 local_directory=store_name,
-                object_prefix=output_zarr_prefix
+                object_prefix=output_zarr_prefix,
+                endpoint_url=endpoint_url
             )
             #################################################################
             self.__zarr_info_to_table(
@@ -248,11 +271,13 @@
             #######################################################################
             # TODO: verify count of objects matches, publish message, update status
             #######################################################################
-            print('
+            print('Finished raw-to-zarr conversion.')
         except Exception as err:
             print(f'Exception encountered creating local Zarr store with echopype: {err}')
             raise RuntimeError(f"Problem creating local Zarr store, {err}")
         finally:
+            gc.collect()
+            print("Finally.")
             cleaner.delete_local_files(file_types=["*.raw", "*.bot", "*.zarr", "*.json"])
             print('Done creating local zarr store.')
 
{water_column_sonar_processing-0.0.10.dist-info → water_column_sonar_processing-0.0.12.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: water_column_sonar_processing
-Version: 0.0.10
+Version: 0.0.12
 Summary: A processing tool for water column sonar data.
 Author-email: Rudy Klucik <rudy.klucik@noaa.gov>
 Project-URL: Homepage, https://github.com/CI-CMG/water-column-sonar-processing
@@ -28,7 +28,7 @@ Requires-Dist: python-dotenv==1.0.1
 Requires-Dist: requests==2.32.3
 Requires-Dist: s3fs==2023.12.1
 Requires-Dist: scipy==1.14.1
-Requires-Dist: setuptools
+Requires-Dist: setuptools
 Requires-Dist: shapely==2.0.3
 Requires-Dist: typing-extensions==4.10.0
 Requires-Dist: xarray==2024.10.0
@@ -37,6 +37,16 @@ Requires-Dist: zarr==2.18.3
 # Water Column Sonar Processing
 Processing tool for converting L0 data to L1 and L2 as well as generating geospatial information
 
+
+
+
+
+
+
+
+ 
+
 # Setting up the Python Environment
 > Python 3.10.12
 
@@ -93,20 +103,6 @@ or
 Following this tutorial:
 https://packaging.python.org/en/latest/tutorials/packaging-projects/
 
-# To Publish To TEST
-```commandline
-python -m build
-# python -m build --sdist
-# python -m build --wheel
-python -m twine upload --repository testpypi dist/*
-pytho -m pip install --index-url https://test.pypi.org/simple/ hello-pypi-rudy-klucik
-python
-```
-```
-from water-column-sonar-processing import ZarrManager
-example.add_one(2)
-```
-
 # To Publish To PROD
 ```commandline
 python -m build
@@ -134,6 +130,12 @@ https://colab.research.google.com/drive/1KiLMueXiz9WVB9o4RuzYeGjNZ6PsZU7a#scroll
 5 failed, 35 passed, 3 skipped, 1 warning in 9.71s
 3 failed, 38 passed, 3 skipped, 1 warning in 7.24s
 
+# Tag a Release
+```commandline
+git tag "v0.0.12" -a
+# enter description
+git push origin --tags
+```
 
 # TODO:
 add https://pypi.org/project/setuptools-scm/
{water_column_sonar_processing-0.0.10.dist-info → water_column_sonar_processing-0.0.12.dist-info}/RECORD
@@ -1,32 +1,33 @@
 water_column_sonar_processing/__init__.py,sha256=fvRK4uFo_A0l7w_T4yckvDqJ3wMUq4JB3VVPXqWfewE,226
 water_column_sonar_processing/process.py,sha256=-yQtK3rnZq6lGAr3q02zLDe1NuMH9c0PiUOxKzG_r18,5386
 water_column_sonar_processing/aws/__init__.py,sha256=KJqK8oYMn-u8n8i-Jp_lG5BvCOTjwWSjWP8yAyDlWVo,297
-water_column_sonar_processing/aws/dynamodb_manager.py,sha256=
-water_column_sonar_processing/aws/s3_manager.py,sha256
-water_column_sonar_processing/aws/s3fs_manager.py,sha256=
+water_column_sonar_processing/aws/dynamodb_manager.py,sha256=LQ3eh7Zf1fBLG-RKovod9KbQwhE-0Qdq1JPk4Ro5bdo,10252
+water_column_sonar_processing/aws/s3_manager.py,sha256=-PCiW7YF31nGIPa1oVOVTzjTSExAAkT_IyNNnvWv2HU,16214
+water_column_sonar_processing/aws/s3fs_manager.py,sha256=d7p9Sx-ocooKzHjVJVCawnXSGv6BpmKvvN9uhzilglw,2529
 water_column_sonar_processing/aws/sns_manager.py,sha256=Dp9avG5VSugSWPR1dZ-askuAw1fCZkNUHbOUP65iR-k,1867
 water_column_sonar_processing/aws/sqs_manager.py,sha256=NSUrWmnSC8h8Gf7gT0U8zFaQQ-yX89h0Q0mDLKGqp2Y,1597
 water_column_sonar_processing/cruise/__init__.py,sha256=H5hW0JMORuaFvQk_R31B4VL8RnRyKeanOOiWmqEMZJk,156
-water_column_sonar_processing/cruise/create_empty_zarr_store.py,sha256=
-water_column_sonar_processing/cruise/
+water_column_sonar_processing/cruise/create_empty_zarr_store.py,sha256=1IehrlhMAS5XAl7DLdQI4jIMSY9ZNLiW4YdcBEwYkbc,7679
+water_column_sonar_processing/cruise/experiment_datatree.py,sha256=K6Uq_36Rygw5oFF8zWavEwb1x8D27lJv5G3j0B59agE,243
+water_column_sonar_processing/cruise/resample_regrid.py,sha256=WFWxP083X4VpH9x50Om4nxSEUwTsjKjdejQz3Nh8CLs,12822
 water_column_sonar_processing/geometry/__init__.py,sha256=_ol5nI8AL30pYXeAh5rtP7YmQggitPC6LA_kuTfPJ0Q,231
-water_column_sonar_processing/geometry/geometry_manager.py,sha256=
+water_column_sonar_processing/geometry/geometry_manager.py,sha256=nz5T1vCDWHYIfQ853EqKYHDetTul7jRWS3y8Evep8QU,10855
 water_column_sonar_processing/geometry/geometry_simplification.py,sha256=im1HG9nfYIerQv3w-PUHzphw2B7aGgnsA3Zcdy2oTmA,3016
 water_column_sonar_processing/geometry/pmtile_generation.py,sha256=7Lm08Jr6YaM4nYmexClxbIMOqSV1teo9wMm6dfjFuNA,12384
 water_column_sonar_processing/index/__init__.py,sha256=izEObsKiOoIJ0kZCFhvaYsBd6Ga71XJxnogjrNInw68,68
 water_column_sonar_processing/index/index_manager.py,sha256=YS6y_THfGAZpjfBZOj5n8O1aY_BnBYS781eNHfhpip0,11239
 water_column_sonar_processing/model/__init__.py,sha256=FXaCdbPqxp0ogmZm9NplRirqpgMiYs1iRYgJbFbbX2Y,65
-water_column_sonar_processing/model/zarr_manager.py,sha256=
+water_column_sonar_processing/model/zarr_manager.py,sha256=ph0sU-aJQM5TkbyyArDHqXLpeiIki_ce6WN_Z7RVxxw,15053
 water_column_sonar_processing/processing/__init__.py,sha256=UwdB3BnoUxy4q3k9-ZjBF6KzmCWVDcqbcArTeHgmvGA,118
 water_column_sonar_processing/processing/cruise_sampler.py,sha256=hadPrnH5nz7_oG_4pND7YbMFH6NMR9d6p3xAXedtKU8,15927
-water_column_sonar_processing/processing/raw_to_zarr.py,sha256=
+water_column_sonar_processing/processing/raw_to_zarr.py,sha256=agbb2A0BWf7D4b5u-mYOBN_VyjRVjOdQM2aeRGBweWw,17617
 water_column_sonar_processing/utility/__init__.py,sha256=yDObMOL0_OxKWet5wffK2-XVJgoE9iwiY2q04GZrtBQ,234
 water_column_sonar_processing/utility/cleaner.py,sha256=bNbs-hopWxtKAFBK0Eu18xdRErZCGZvtla3j-1bTwQw,619
 water_column_sonar_processing/utility/constants.py,sha256=EbzsorvYKadsPjuutRjQKKByGibhFm0Gw6D-Sp2ZD3I,2143
 water_column_sonar_processing/utility/pipeline_status.py,sha256=O-0SySqdRGJ6bs3zQe1NV9vkOpmsRM7zj5QoHgzYioY,4395
 water_column_sonar_processing/utility/timestamp.py,sha256=bO0oir7KxxoEHPGRkz9FCBfOligkocUyRiWRzAq8fnU,361
-water_column_sonar_processing-0.0.
-water_column_sonar_processing-0.0.
-water_column_sonar_processing-0.0.
-water_column_sonar_processing-0.0.
-water_column_sonar_processing-0.0.
+water_column_sonar_processing-0.0.12.dist-info/LICENSE,sha256=lz4IpJ5_adG3S0ali-WaIpQFVTnEAOucMDQPECUVEYw,1110
+water_column_sonar_processing-0.0.12.dist-info/METADATA,sha256=813ibpVKvkucEfCFlJVHeUfKIC8n1_Pt_Di4k6OebrQ,4960
+water_column_sonar_processing-0.0.12.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+water_column_sonar_processing-0.0.12.dist-info/top_level.txt,sha256=aRYU4A7RNBlNrL4vzjytFAir3BNnmOgsvIGKKA36tg4,30
+water_column_sonar_processing-0.0.12.dist-info/RECORD,,
{water_column_sonar_processing-0.0.10.dist-info → water_column_sonar_processing-0.0.12.dist-info}/LICENSE: file without changes
{water_column_sonar_processing-0.0.10.dist-info → water_column_sonar_processing-0.0.12.dist-info}/WHEEL: file without changes
{water_column_sonar_processing-0.0.10.dist-info → water_column_sonar_processing-0.0.12.dist-info}/top_level.txt: file without changes