water_column_sonar_processing-0.0.7-py3-none-any.whl → water_column_sonar_processing-0.0.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- water_column_sonar_processing/__init__.py +2 -5
- water_column_sonar_processing/aws/__init__.py +2 -2
- water_column_sonar_processing/aws/dynamodb_manager.py +149 -43
- water_column_sonar_processing/aws/s3_manager.py +71 -37
- water_column_sonar_processing/cruise/create_empty_zarr_store.py +6 -4
- water_column_sonar_processing/cruise/resample_regrid.py +3 -3
- water_column_sonar_processing/geometry/geometry_manager.py +21 -6
- water_column_sonar_processing/geometry/pmtile_generation.py +200 -13
- water_column_sonar_processing/index/index_manager.py +25 -13
- water_column_sonar_processing/model/zarr_manager.py +27 -25
- water_column_sonar_processing/process.py +4 -4
- water_column_sonar_processing/processing/__init__.py +4 -0
- water_column_sonar_processing/processing/cruise_sampler.py +342 -0
- water_column_sonar_processing/processing/raw_to_zarr.py +349 -0
- water_column_sonar_processing/utility/__init__.py +2 -2
- water_column_sonar_processing/utility/cleaner.py +1 -0
- water_column_sonar_processing/utility/constants.py +6 -2
- {water_column_sonar_processing-0.0.7.dist-info → water_column_sonar_processing-0.0.9.dist-info}/METADATA +20 -10
- water_column_sonar_processing-0.0.9.dist-info/RECORD +32 -0
- {water_column_sonar_processing-0.0.7.dist-info → water_column_sonar_processing-0.0.9.dist-info}/WHEEL +1 -1
- water_column_sonar_processing-0.0.7.dist-info/RECORD +0 -29
- {water_column_sonar_processing-0.0.7.dist-info → water_column_sonar_processing-0.0.9.dist-info}/LICENSE +0 -0
- {water_column_sonar_processing-0.0.7.dist-info → water_column_sonar_processing-0.0.9.dist-info}/top_level.txt +0 -0
water_column_sonar_processing/__init__.py
@@ -1,8 +1,6 @@
 from __future__ import absolute_import
 
-from . import aws, cruise, geometry, index, model,
-from .model import ZarrManager
-from .process import Process
+from . import aws, cruise, geometry, index, model, processing, utility
 
 __all__ = [
     "aws",
@@ -10,7 +8,6 @@ __all__ = [
     "geometry",
     "index",
     "model",
+    "processing",
     "utility",
-    "process",
-    "Process",
 ]
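The net change to the package root: the `Process` entry point and the top-level `ZarrManager` re-export are removed, and the new `processing` subpackage is exported instead. A minimal import sketch against 0.0.9, based only on the lines above:

import water_column_sonar_processing as wcsp

print(wcsp.__all__)
# ['aws', 'cruise', 'geometry', 'index', 'model', 'processing', 'utility']

# ZarrManager is no longer re-exported at the top level, but it remains
# importable from its subpackage (other modules in this diff do exactly this):
from water_column_sonar_processing.model import ZarrManager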
water_column_sonar_processing/aws/__init__.py
@@ -1,7 +1,7 @@
 from .dynamodb_manager import DynamoDBManager
-from .s3_manager import S3Manager
+from .s3_manager import S3Manager, chunked
 from .s3fs_manager import S3FSManager
 from .sns_manager import SNSManager
 from .sqs_manager import SQSManager
 
-__all__ = ["DynamoDBManager", "S3Manager", "S3FSManager", "SNSManager", "SQSManager"]
+__all__ = ["DynamoDBManager", "S3Manager", "chunked", "S3FSManager", "SNSManager", "SQSManager"]
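`chunked` is now re-exported from `s3_manager`. Its implementation is not shown in this diff; the sketch below assumes it is a fixed-size batching generator with an `(iterable, size)` signature, which is an assumption rather than confirmed behavior:

from water_column_sonar_processing.aws import chunked

# Assumed signature: chunked(iterable, size), yielding successive batches.
for batch in chunked(["a.raw", "b.raw", "c.raw"], 2):
    print(batch)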
water_column_sonar_processing/aws/dynamodb_manager.py
@@ -8,7 +8,11 @@ from boto3.dynamodb.types import TypeDeserializer, TypeSerializer
 #########################################################################
 class DynamoDBManager:
     #####################################################################
-    def __init__(
+    def __init__(
+        self,
+        # endpoint_url
+    ):
+        # self.endpoint_url = endpoint_url
         self.__dynamodb_session = boto3.Session(
             aws_access_key_id=os.environ.get("ACCESS_KEY_ID"),
             aws_secret_access_key=os.environ.get("SECRET_ACCESS_KEY"),
@@ -16,9 +20,11 @@ class DynamoDBManager:
         )
         self.__dynamodb_resource = self.__dynamodb_session.resource(
             service_name="dynamodb",
+            # endpoint_url=self.endpoint_url
         )
         self.__dynamodb_client = self.__dynamodb_session.client(
             service_name="dynamodb",
+            # endpoint_url=self.endpoint_url
         )
         self.type_serializer = TypeSerializer() # https://stackoverflow.com/a/46738251
         self.type_deserializer = TypeDeserializer()
@@ -35,31 +41,14 @@ class DynamoDBManager:
         # assert (status_code == 200), "Problem, unable to update dynamodb table."
 
     #####################################################################
-    def create_table(
-        self,
-        table_name,
-        key_schema,
-        attribute_definitions,
-    ):
-        self.__dynamodb_client.create_table(
-            AttributeDefinitions=attribute_definitions,
-            TableName=table_name,
-            KeySchema=key_schema,
-            BillingMode="PAY_PER_REQUEST", # "PROVISIONED",
-            # ProvisionedThroughput={
-            #     'ReadCapacityUnits': 1_000,
-            #     'WriteCapacityUnits': 1_000
-            # }
-        )
-
     #####################################################################
     def create_water_column_sonar_table(
         self,
         table_name,
     ):
-        self.create_table(
-
-
+        self.__dynamodb_client.create_table(
+            TableName=table_name,
+            KeySchema=[
                 {
                     "AttributeName": "FILE_NAME",
                     "KeyType": "HASH",
@@ -69,20 +58,50 @@
                     "KeyType": "RANGE",
                 },
             ],
-
+            AttributeDefinitions=[
                 {"AttributeName": "FILE_NAME", "AttributeType": "S"},
                 {"AttributeName": "CRUISE_NAME", "AttributeType": "S"},
             ],
+            BillingMode="PAY_PER_REQUEST"
+            # ProvisionedThroughput={
+            #     'ReadCapacityUnits': 1_000,
+            #     'WriteCapacityUnits': 1_000
+            # }
         )
+        # TODO: after creating status is 'CREATING', wait until 'ACTIVE'
+        response = self.__dynamodb_client.describe_table(TableName=table_name)
+        print(response) # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/dynamodb/client/describe_table.html
+        # sleep then response['Table']['TableStatus'] == 'ACTIVE'
+
+    #####################################################################
+    # don't think this is used?
+    # def get_item(
+    #     self,
+    #     table_name,
+    #     key
+    # ):
+    #     response = self.__dynamodb_client.get_item(TableName=table_name, Key=key)
+    #     item = None
+    #     if response["ResponseMetadata"]["HTTPStatusCode"] == 200:
+    #         if "Item" in response:
+    #             item = response["Item"]
+    #     return item
 
     #####################################################################
-    def
-
-
-
-
-
+    def get_table_item(
+        self,
+        table_name,
+        key,
+    ):
+        """
+        Gets a single row from the db.
+        """
+        table = self.__dynamodb_resource.Table(table_name)
+        response = table.get_item(Key=key)
+        # TODO:
+        # if response["ResponseMetadata"]["HTTPStatusCode"] != 200:
+        #     throw error
+        return response
 
     #####################################################################
     def update_item(
@@ -101,17 +120,22 @@
             UpdateExpression=update_expression,
         )
         status_code = response["ResponseMetadata"]["HTTPStatusCode"]
-
+        assert response['ConsumedCapacity']['TableName'] == table_name
         assert status_code == 200, "Problem, unable to update dynamodb table."
 
     #####################################################################
+    # TODO: change to "get_cruise_as_df"
     def get_table_as_df(
         self,
         ship_name,
        cruise_name,
         sensor_name,
         table_name,
-    ):
+    ) -> pd.DataFrame:
+        """
+        To be used to initialize a cruise, deletes all entries associated with that cruise
+        in the database.
+        """
         expression_attribute_values = {
             ":cr": {"S": cruise_name},
             ":se": {"S": sensor_name},
@@ -128,6 +152,9 @@
             FilterExpression=filter_expression,
         )
         # Note: table.scan() has 1 MB limit on results so pagination is used
+        if len(response["Items"]) == 0:
+            return pd.DataFrame() # If no results, return empty dataframe
+
         data = response["Items"]
 
         while "LastEvaluatedKey" in response:
@@ -146,25 +173,104 @@
         return df.sort_values(by="START_TIME", ignore_index=True)
 
     #####################################################################
-    #
-    def
+    # TODO: WIP
+    def delete_item(
         self,
         table_name,
-
+        cruise_name,
+        file_name,
     ):
-
-
-
+        """
+        Finds all rows associated with a cruise and deletes them.
+        """
+        response = self.__dynamodb_client.delete_item(
+            Key={
+                "CRUISE_NAME": {
+                    "S": cruise_name
+                },
+                "FILE_NAME": {
+                    "S": file_name
+                }
+            },
+            TableName=table_name,
+            ReturnConsumedCapacity="TOTALS",
+        )
+        # TODO: there should be attributes included in response but they are missing
+        # if response["ResponseMetadata"]["HTTPStatusCode"] != 200:
+        #     throw error
         return response
 
     #####################################################################
-
-
-
-        table_name,
-        cruise_name,
+    def describe_table(
+        self,
+        table_name,
     ):
-
+        """
+        Get a description of the table. Used to verify that records were added/removed.
+        """
+        response = self.__dynamodb_client.describe_table(TableName=table_name)
+        print(response)
+        return response
+
+
 
+    #####################################################################
+    # TODO: from test_raw_to_zarr get enum and use here
+    # def __update_processing_status(
+    #     self,
+    #     file_name: str,
+    #     cruise_name: str,
+    #     pipeline_status: str,
+    #     error_message: str = None,
+    # ):
+    #     print(f"Updating processing status to {pipeline_status}.")
+    #     if error_message:
+    #         print(f"Error message: {error_message}")
+    #         self.__dynamo.update_item(
+    #             table_name=self.__table_name,
+    #             key={
+    #                 'FILE_NAME': {'S': file_name}, # Partition Key
+    #                 'CRUISE_NAME': {'S': cruise_name}, # Sort Key
+    #             },
+    #             attribute_names={
+    #                 '#PT': 'PIPELINE_TIME',
+    #                 '#PS': 'PIPELINE_STATUS',
+    #                 '#EM': 'ERROR_MESSAGE',
+    #             },
+    #             expression='SET #PT = :pt, #PS = :ps, #EM = :em',
+    #             attribute_values={
+    #                 ':pt': {
+    #                     'S': datetime.now().isoformat(timespec="seconds") + "Z"
+    #                 },
+    #                 ':ps': {
+    #                     'S': pipeline_status
+    #                 },
+    #                 ':em': {
+    #                     'S': error_message
+    #                 }
+    #             }
+    #         )
+    #     else:
+    #         self.__dynamo.update_item(
+    #             table_name=self.__table_name,
+    #             key={
+    #                 'FILE_NAME': {'S': file_name}, # Partition Key
+    #                 'CRUISE_NAME': {'S': cruise_name}, # Sort Key
+    #             },
+    #             attribute_names={
+    #                 '#PT': 'PIPELINE_TIME',
+    #                 '#PS': 'PIPELINE_STATUS',
+    #             },
+    #             expression='SET #PT = :pt, #PS = :ps',
+    #             attribute_values={
+    #                 ':pt': {
+    #                     'S': datetime.now().isoformat(timespec="seconds") + "Z"
+    #                 },
+    #                 ':ps': {
+    #                     'S': pipeline_status
+    #                 }
+    #             }
+    #         )
+    #     print("Done updating processing status.")
 
 #########################################################################
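Pulling the visible 0.0.9 signatures together, the reworked DynamoDBManager can be exercised roughly as follows. This is a minimal sketch based only on the method signatures shown above: the table name, keys, ship, and cruise values are placeholders, and credentials come from the ACCESS_KEY_ID/SECRET_ACCESS_KEY environment variables read in __init__.

from water_column_sonar_processing.aws import DynamoDBManager

dynamo_manager = DynamoDBManager()

# create_water_column_sonar_table now inlines the old create_table logic
# and prints a describe_table response (status starts as 'CREATING').
dynamo_manager.create_water_column_sonar_table(table_name="example-table")

# New in 0.0.9: resource-based single-row fetch.
response = dynamo_manager.get_table_item(
    table_name="example-table",
    key={"FILE_NAME": "example.raw", "CRUISE_NAME": "EX0000"},  # placeholder key
)

# Returns an empty DataFrame when the scan finds no matching rows.
df = dynamo_manager.get_table_as_df(
    ship_name="Example_Ship",  # placeholder values
    cruise_name="EX0000",
    sensor_name="EK60",
    table_name="example-table",
)

# Still marked "TODO: WIP" above: deletes one row by its composite key.
dynamo_manager.delete_item(
    table_name="example-table",
    cruise_name="EX0000",
    file_name="example.raw",
)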
water_column_sonar_processing/aws/s3_manager.py
@@ -1,9 +1,8 @@
 import json
 import os
+import boto3
 from collections.abc import Generator
 from concurrent.futures import ThreadPoolExecutor, as_completed
-
-import boto3
 from boto3.s3.transfer import TransferConfig
 from botocore.config import Config
 from botocore.exceptions import ClientError
@@ -25,10 +24,16 @@ class S3Manager:
     #####################################################################
     def __init__(
         self,
+        # input_endpoint_url: str,
+        # output_endpoint_url: str,
+        # endpoint_url
         # TODO: Need to allow passing in of credentials when writing to protected bucket
     ):
         self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
         self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
+        # self.endpoint_url = endpoint_url
+        # self.input_endpoint_url = input_endpoint_url
+        # self.output_endpoint_url = output_endpoint_url
         self.s3_region = os.environ.get("AWS_REGION", default="us-east-1")
         self.s3_client_config = Config(max_pool_connections=MAX_POOL_CONNECTIONS)
         self.s3_transfer_config = TransferConfig(
@@ -46,6 +51,7 @@
             service_name="s3",
             config=self.s3_client_config,
             region_name=self.s3_region,
+            # endpoint_url=endpoint_url, # TODO: temporary
         )
         self.s3_resource = boto3.resource(
             service_name="s3",
@@ -53,7 +59,6 @@
             region_name=self.s3_region,
         )
         # self.paginator = self.s3_client.get_paginator(operation_name='list_objects_v2')
-        # TODO: create both "s3_client_input" and "s3_client_output" ???
         self.s3_session_noaa_wcsd_zarr_pds = boto3.Session(
             aws_access_key_id=os.environ.get("OUTPUT_BUCKET_ACCESS_KEY"),
             aws_secret_access_key=os.environ.get("OUTPUT_BUCKET_SECRET_ACCESS_KEY"),
@@ -63,19 +68,20 @@
             service_name="s3",
             config=self.s3_client_config,
             region_name=self.s3_region,
+            # endpoint_url=endpoint_url, # TODO: temporary
         )
-        self.s3_resource_noaa_wcsd_zarr_pds = (
-
-
-
-            region_name=self.s3_region,
-        )
+        self.s3_resource_noaa_wcsd_zarr_pds = self.s3_session_noaa_wcsd_zarr_pds.resource(
+            service_name="s3",
+            config=self.s3_client_config,
+            region_name=self.s3_region,
         )
+        self.paginator = self.s3_client.get_paginator('list_objects_v2')
+        self.paginator_noaa_wcsd_zarr_pds = self.s3_client_noaa_wcsd_zarr_pds.get_paginator('list_objects_v2')
 
-    def get_client(self):
+    def get_client(self): # TODO: do i need this?
         return self.s3_session.client(
             service_name="s3",
-            config=self.
+            config=self.s3_client_config,
             region_name=self.s3_region,
         )
 
@@ -103,17 +109,18 @@
         self,
         file_name: str,
         key: str,
+        output_bucket_name: str,
     ):
-
-
-
-
-        )
+        """
+        Used to upload a single file, e.g. the GeoJSON file to the NODD bucket
+        """
+        self.s3_resource_noaa_wcsd_zarr_pds.Bucket(output_bucket_name).upload_file(Filename=file_name, Key=key)
         return key
 
     #####################################################################
     def upload_files_with_thread_pool_executor(
         self,
+        output_bucket_name: str,
         all_files: list,
     ):
         # 'all_files' is passed a list of lists: [[local_path, s3_key], [...], ...]
@@ -122,21 +129,45 @@
         with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
             futures = [
                 executor.submit(
-                    self.upload_nodd_file,
+                    self.upload_nodd_file, # TODO: verify which one is using this
                     all_file[0], # file_name
                     all_file[1], # key
+                    output_bucket_name, # output_bucket_name
                 )
                 for all_file in all_files
             ]
             for future in as_completed(futures):
                 result = future.result()
                 if result:
-                    all_uploads.extend(result)
+                    all_uploads.extend([result])
         except Exception as err:
             print(err)
         print("Done uploading files using threading pool.")
         return all_uploads
 
+    #####################################################################
+    # def upload_nodd_file2(
+    #     self,
+    #     body: str,
+    #     bucket: str,
+    #     key: str,
+    # ):
+    #     self.s3_client_noaa_wcsd_zarr_pds.put_object(
+    #         Body=body,
+    #         Bucket=bucket,
+    #         Key=key,
+    #     )
+
+    # TODO: this uses resource, try to use client
+    def upload_file(
+        self,
+        filename: str,
+        bucket_name: str,
+        key: str,
+    ):
+        # self.s3_client.upload_file(Filename=filename, Bucket=bucket, Key=key)
+        self.s3_resource.Bucket(bucket_name).upload_file(Filename=filename, Key=key)
+
     #####################################################################
     def upload_zarr_files_to_bucket( # noaa-wcsd-model-pds
         self,
@@ -165,32 +196,34 @@
         return all_uploads
 
     #####################################################################
-    # used: raw-to-
-    def list_objects( # noaa-wcsd-pds and noaa-wcsd-
-        self,
+    # used: raw-to-zarr
+    def list_objects( # noaa-wcsd-pds and noaa-wcsd-zarr-pds
+        self,
+        bucket_name,
+        prefix
     ):
         # analog to "find_children_objects"
         # Returns a list of key strings for each object in bucket defined by prefix
-        s3_client = self.s3_client
+        # s3_client = self.s3_client
         keys = []
-        paginator = s3_client.get_paginator("list_objects_v2")
-        page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix)
+        # paginator = s3_client.get_paginator("list_objects_v2")
+        page_iterator = self.paginator.paginate(Bucket=bucket_name, Prefix=prefix)
         for page in page_iterator:
             if "Contents" in page.keys():
                 keys.extend([k["Key"] for k in page["Contents"]])
         return keys
 
-    def list_nodd_objects( # These are used by the geometry for uploading data
-
-
-    ):
-
-
-
-
-
-
+    # def list_nodd_objects( # These are used by the geometry for uploading data
+    #     self,
+    #     prefix,
+    # ):
+    #     # Returns a list of key strings for each object in bucket defined by prefix
+    #     keys = []
+    #     page_iterator = self.paginator_noaa_wcsd_zarr_pds.paginate(Bucket=self.output_bucket_name, Prefix=prefix):
+    #     for page in paginator.paginate(Bucket=self.output_bucket_name, Prefix=prefix):
+    #         if "Contents" in page.keys():
+    #             keys.extend([k["Key"] for k in page["Contents"]])
+    #     return keys
 
     #####################################################################
     # TODO: change name to "directory"
@@ -279,9 +312,10 @@
         self,
         bucket_name,
         key,
-        file_name,
+        file_name, # where the file will be saved
     ):
         self.s3_client.download_file(Bucket=bucket_name, Key=key, Filename=file_name)
+        # TODO: if bottom file doesn't exist, don't fail downloader
         print("downloaded file")
 
     #####################################################################
@@ -318,7 +352,7 @@
     #####################################################################
     # not used TODO: remove
     def put(self, bucket_name, key, body): # noaa-wcsd-model-pds
-        self.s3_client.put_object(Bucket=bucket_name, Key=key, Body=body)
+        self.s3_client.put_object(Bucket=bucket_name, Key=key, Body=body) # "Body" can be a file
 
     #####################################################################
     def read_s3_json(
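Pulling the visible S3Manager changes together: list_objects now takes the bucket explicitly, and both NODD upload paths require an output_bucket_name. A minimal usage sketch using only signatures shown above; bucket names and paths are placeholders, and credentials plus default bucket names still come from environment variables:

from water_column_sonar_processing.aws import S3Manager

s3_manager = S3Manager()

# list_objects(bucket_name, prefix) now reuses the paginator cached in __init__.
keys = s3_manager.list_objects(
    bucket_name="example-output-bucket",
    prefix="spatial/geojson/",
)

# upload_nodd_file gained an output_bucket_name parameter in 0.0.9.
s3_manager.upload_nodd_file(
    file_name="cruise.json",
    key="spatial/geojson/cruise.json",
    output_bucket_name="example-output-bucket",
)

# The thread-pool variant forwards the same bucket to every upload.
s3_manager.upload_files_with_thread_pool_executor(
    output_bucket_name="example-output-bucket",
    all_files=[["local/a.json", "prefix/a.json"], ["local/b.json", "prefix/b.json"]],
)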
water_column_sonar_processing/cruise/create_empty_zarr_store.py
@@ -3,10 +3,10 @@ import os
 import numcodecs
 import numpy as np
 
-from water_column_sonar_processing.aws
-from water_column_sonar_processing.aws
-from water_column_sonar_processing.model
-from water_column_sonar_processing.utility
+from water_column_sonar_processing.aws import DynamoDBManager
+from water_column_sonar_processing.aws import S3Manager
+from water_column_sonar_processing.model import ZarrManager
+from water_column_sonar_processing.utility import Cleaner
 
 numcodecs.blosc.use_threads = False
 numcodecs.blosc.set_nthreads(1)
@@ -17,6 +17,7 @@ numcodecs.blosc.set_nthreads(1)
 # creates the latlon data: foo = ep.consolidate.add_location(ds_Sv, echodata)
 
 
+# TODO: change name to "CreateLocalEmptyZarrStore"
 class CreateEmptyZarrStore:
     #######################################################
     def __init__(
@@ -28,6 +29,7 @@ class CreateEmptyZarrStore:
 
     #######################################################
 
+    # TODO: move this to the s3_manager
     def upload_zarr_store_to_s3(
         self,
         local_directory: str,
water_column_sonar_processing/cruise/resample_regrid.py
@@ -7,9 +7,9 @@ import numpy as np
 import pandas as pd
 import xarray as xr
 
-from water_column_sonar_processing.aws
-from water_column_sonar_processing.geometry
-from water_column_sonar_processing.model
+from water_column_sonar_processing.aws import DynamoDBManager
+from water_column_sonar_processing.geometry import GeometryManager
+from water_column_sonar_processing.model import ZarrManager
 
 numcodecs.blosc.use_threads = False
 numcodecs.blosc.set_nthreads(1)
water_column_sonar_processing/geometry/geometry_manager.py
@@ -1,11 +1,12 @@
+import os
 from pathlib import Path
 
 import geopandas
 import numpy as np
 import pandas as pd
 
-from water_column_sonar_processing.aws
-from water_column_sonar_processing.utility
+from water_column_sonar_processing.aws import S3Manager
+from water_column_sonar_processing.utility import Cleaner
 
 """
 // [Decimal / Places / Degrees / Object that can be recognized at scale / N/S or E/W at equator, E/W at 23N/S, E/W at 45N/S, E/W at 67N/S]
@@ -26,12 +27,13 @@ class GeometryManager:
         self,
     ):
         self.DECIMAL_PRECISION = 5 # precision for GPS coordinates
-        self.SIMPLIFICATION_TOLERANCE = 0.0001 # RDP simplification to street level
+        self.SIMPLIFICATION_TOLERANCE = 0.0001 # RDP simplification to "street level"
 
     #######################################################
     def read_echodata_gps_data(
         self,
         echodata,
+        output_bucket_name,
         ship_name,
         cruise_name,
         sensor_name,
@@ -123,12 +125,12 @@
             crs="epsg:4326",
         )
         # Note: We set np.nan to 0,0 so downstream missing values can be omitted
-
+        # TODO: so what ends up here is data with corruption at null island!!!
         geo_json_line = gps_gdf.to_json()
         if write_geojson:
             print("Creating local copy of geojson file.")
             with open(geo_json_name, "w") as write_file:
-                write_file.write(geo_json_line)
+                write_file.write(geo_json_line) # NOTE: this file can include zeros for lat lon
 
         geo_json_prefix = (
             f"spatial/geojson/{ship_name}/{cruise_name}/{sensor_name}"
@@ -136,7 +138,8 @@
 
         print("Checking s3 and deleting any existing GeoJSON file.")
         s3_manager = S3Manager()
-        s3_objects = s3_manager.
+        s3_objects = s3_manager.list_objects(
+            bucket_name=output_bucket_name,
             prefix=f"{geo_json_prefix}/{geo_json_name}"
         )
         if len(s3_objects) > 0:
@@ -149,6 +152,7 @@
             s3_manager.upload_nodd_file(
                 file_name=geo_json_name, # file_name
                 key=f"{geo_json_prefix}/{geo_json_name}", # key
+                output_bucket_name=output_bucket_name,
             )
 
             # TODO: delete geo_json file
@@ -221,5 +225,16 @@
             print(f"Exception encountered reading s3 GeoJSON: {err}")
             raise
 
+    ############################################################################
+    # COMES from the raw-to-zarr conversion
+    def __write_geojson_to_file(
+        self,
+        store_name,
+        data
+    ) -> None:
+        print('Writing GeoJSON to file.')
+        with open(os.path.join(store_name, 'geo.json'), "w") as outfile:
+            outfile.write(data)
+
 
 ###########################################################