water-column-sonar-processing: 0.0.5-py3-none-any.whl → 0.0.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- water_column_sonar_processing/__init__.py +16 -0
- water_column_sonar_processing/aws/__init__.py +7 -4
- water_column_sonar_processing/aws/dynamodb_manager.py +70 -49
- water_column_sonar_processing/aws/s3_manager.py +112 -122
- water_column_sonar_processing/aws/s3fs_manager.py +13 -19
- water_column_sonar_processing/aws/sns_manager.py +10 -21
- water_column_sonar_processing/aws/sqs_manager.py +10 -18
- water_column_sonar_processing/cruise/__init__.py +4 -0
- water_column_sonar_processing/cruise/create_empty_zarr_store.py +51 -33
- water_column_sonar_processing/cruise/resample_regrid.py +109 -58
- water_column_sonar_processing/geometry/__init__.py +5 -0
- water_column_sonar_processing/geometry/geometry_manager.py +79 -48
- water_column_sonar_processing/geometry/geometry_simplification.py +13 -12
- water_column_sonar_processing/geometry/pmtile_generation.py +24 -23
- water_column_sonar_processing/index/__init__.py +3 -0
- water_column_sonar_processing/index/index_manager.py +104 -80
- water_column_sonar_processing/model/__init__.py +3 -0
- water_column_sonar_processing/model/zarr_manager.py +113 -75
- water_column_sonar_processing/process.py +76 -69
- water_column_sonar_processing/utility/__init__.py +6 -0
- water_column_sonar_processing/utility/cleaner.py +6 -7
- water_column_sonar_processing/utility/constants.py +42 -35
- water_column_sonar_processing/utility/pipeline_status.py +37 -10
- water_column_sonar_processing/utility/timestamp.py +3 -2
- {water_column_sonar_processing-0.0.5.dist-info → water_column_sonar_processing-0.0.6.dist-info}/METADATA +31 -1
- water_column_sonar_processing-0.0.6.dist-info/RECORD +29 -0
- water_column_sonar_processing-0.0.5.dist-info/RECORD +0 -29
- {water_column_sonar_processing-0.0.5.dist-info → water_column_sonar_processing-0.0.6.dist-info}/LICENSE +0 -0
- {water_column_sonar_processing-0.0.5.dist-info → water_column_sonar_processing-0.0.6.dist-info}/WHEEL +0 -0
- {water_column_sonar_processing-0.0.5.dist-info → water_column_sonar_processing-0.0.6.dist-info}/top_level.txt +0 -0

--- /dev/null
+++ water_column_sonar_processing/__init__.py
@@ -0,0 +1,16 @@
+from __future__ import absolute_import
+
+from . import aws, cruise, geometry, index, model, utility, process
+from .model import ZarrManager
+from .process import Process
+
+__all__ = [
+    "aws",
+    "cruise",
+    "geometry",
+    "index",
+    "model",
+    "utility",
+    "process",
+    "Process",
+]
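
The new top-level __init__.py makes the subpackages and the Process entry point importable from the package root. A minimal usage sketch, assuming the 0.0.6 wheel is installed:

    import water_column_sonar_processing as wcsp

    # Subpackages are imported eagerly and re-exported via __all__,
    # so the pipeline entry point is reachable without a deep import.
    print(wcsp.__all__)
    processor_cls = wcsp.Process  # same class as water_column_sonar_processing.process.Process
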
--- water_column_sonar_processing/aws/__init__.py
+++ water_column_sonar_processing/aws/__init__.py
@@ -1,4 +1,7 @@
-
-
-
-
+from .dynamodb_manager import DynamoDBManager
+from .s3_manager import S3Manager
+from .s3fs_manager import S3FSManager
+from .sns_manager import SNSManager
+from .sqs_manager import SQSManager
+
+__all__ = ["DynamoDBManager", "S3Manager", "S3FSManager", "SNSManager", "SQSManager"]
--- water_column_sonar_processing/aws/dynamodb_manager.py
+++ water_column_sonar_processing/aws/dynamodb_manager.py
@@ -1,7 +1,8 @@
 import os
+
 import boto3
 import pandas as pd
-from boto3.dynamodb.types import
+from boto3.dynamodb.types import TypeDeserializer, TypeSerializer
 
 
 #########################################################################
@@ -9,9 +10,9 @@ class DynamoDBManager:
     #####################################################################
     def __init__(self):
         self.__dynamodb_session = boto3.Session(
-            aws_access_key_id=os.environ.get(
-            aws_secret_access_key=os.environ.get(
-            region_name=os.environ.get("AWS_REGION", default="us-east-1")
+            aws_access_key_id=os.environ.get("ACCESS_KEY_ID"),
+            aws_secret_access_key=os.environ.get("SECRET_ACCESS_KEY"),
+            region_name=os.environ.get("AWS_REGION", default="us-east-1"),
         )
         self.__dynamodb_resource = self.__dynamodb_session.resource(
             service_name="dynamodb",
@@ -35,10 +36,10 @@ class DynamoDBManager:
 
     #####################################################################
     def create_table(
-
-
-
-
+        self,
+        table_name,
+        key_schema,
+        attribute_definitions,
     ):
         self.__dynamodb_client.create_table(
             AttributeDefinitions=attribute_definitions,
@@ -52,98 +53,118 @@ class DynamoDBManager:
         )
 
     #####################################################################
-    def
-
-
-    key
+    def create_water_column_sonar_table(
+        self,
+        table_name,
     ):
+        self.create_table(
+            table_name=table_name,
+            key_schema=[
+                {
+                    "AttributeName": "FILE_NAME",
+                    "KeyType": "HASH",
+                },
+                {
+                    "AttributeName": "CRUISE_NAME",
+                    "KeyType": "RANGE",
+                },
+            ],
+            attribute_definitions=[
+                {"AttributeName": "FILE_NAME", "AttributeType": "S"},
+                {"AttributeName": "CRUISE_NAME", "AttributeType": "S"},
+            ],
+        )
+
+    #####################################################################
+    def get_item(self, table_name, key):
         response = self.__dynamodb_client.get_item(TableName=table_name, Key=key)
         item = None
-        if response[
-        if
-            item = response[
+        if response["ResponseMetadata"]["HTTPStatusCode"] == 200:
+            if "Item" in response:
+                item = response["Item"]
         return item
 
     #####################################################################
     def update_item(
-
-
-
-
-
-
+        self,
+        table_name,
+        key,
+        expression_attribute_names,
+        expression_attribute_values,
+        update_expression,
     ):
         response = self.__dynamodb_client.update_item(
             TableName=table_name,
             Key=key,
             ExpressionAttributeNames=expression_attribute_names,
             ExpressionAttributeValues=expression_attribute_values,
-            UpdateExpression=update_expression
+            UpdateExpression=update_expression,
         )
-        status_code = response[
+        status_code = response["ResponseMetadata"]["HTTPStatusCode"]
         # TODO: change to exception
-        assert
+        assert status_code == 200, "Problem, unable to update dynamodb table."
 
     #####################################################################
     def get_table_as_df(
-
-
-
-
-
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+        table_name,
     ):
         expression_attribute_values = {
-
-
-
+            ":cr": {"S": cruise_name},
+            ":se": {"S": sensor_name},
+            ":sh": {"S": ship_name},
         }
 
-        filter_expression =
+        filter_expression = (
+            "CRUISE_NAME = :cr and SENSOR_NAME = :se and SHIP_NAME = :sh"
+        )
         response = self.__dynamodb_client.scan(
             TableName=table_name,
-            Select=
+            Select="ALL_ATTRIBUTES",
             ExpressionAttributeValues=expression_attribute_values,
             FilterExpression=filter_expression,
         )
         # Note: table.scan() has 1 MB limit on results so pagination is used
-        data = response[
+        data = response["Items"]
 
-        while
+        while "LastEvaluatedKey" in response:
             response = self.__dynamodb_client.scan(
                 TableName=table_name,
-                Select=
+                Select="ALL_ATTRIBUTES",
                 ExpressionAttributeValues=expression_attribute_values,
                 FilterExpression=filter_expression,
-                ExclusiveStartKey=response[
+                ExclusiveStartKey=response["LastEvaluatedKey"],
             )
-            data.extend(response[
+            data.extend(response["Items"])
 
         deserializer = self.type_deserializer
         df = pd.DataFrame([deserializer.deserialize({"M": i}) for i in data])
 
-        return df.sort_values(by=
+        return df.sort_values(by="START_TIME", ignore_index=True)
 
     #####################################################################
     # is this used?
     def get_table_item(
-
-
-
+        self,
+        table_name,
+        key,
     ):
         # a bit more high level, uses resource to get table item
         table = self.__dynamodb_resource.Table(table_name)
-        response = table.get_item(
-            Key=key
-        )
+        response = table.get_item(Key=key)
         return response
 
     #####################################################################
     # TODO: add helper method to delete the data
     def delete_cruise(
-
-
-
+        self,
+        table_name,
+        cruise_name,
     ):
         pass
 
+
 #########################################################################
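
The rewritten get_table_as_df combines two patterns worth noting: scan pagination via LastEvaluatedKey (each scan call returns at most 1 MB) and TypeDeserializer to unwrap DynamoDB's attribute-value encoding before building a DataFrame. A self-contained sketch of the same flow, with stubbed scan pages in place of a live client:

    import pandas as pd
    from boto3.dynamodb.types import TypeDeserializer

    # Two fake scan responses; the first carries LastEvaluatedKey, so a real
    # caller re-issues scan() with ExclusiveStartKey until it disappears.
    pages = [
        {
            "Items": [{"FILE_NAME": {"S": "b.raw"}, "START_TIME": {"S": "2007-01-02"}}],
            "LastEvaluatedKey": {"FILE_NAME": {"S": "b.raw"}},
        },
        {"Items": [{"FILE_NAME": {"S": "a.raw"}, "START_TIME": {"S": "2007-01-01"}}]},
    ]

    data = []
    for page in pages:
        data.extend(page["Items"])

    deserializer = TypeDeserializer()  # {"S": "a.raw"} -> "a.raw"
    df = pd.DataFrame([deserializer.deserialize({"M": i}) for i in data])
    print(df.sort_values(by="START_TIME", ignore_index=True))
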
--- water_column_sonar_processing/aws/s3_manager.py
+++ water_column_sonar_processing/aws/s3_manager.py
@@ -1,47 +1,45 @@
 import json
 import os
-import boto3
-# import pandas as pd
 from collections.abc import Generator
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
-
-from botocore.config import Config
+import boto3
 from boto3.s3.transfer import TransferConfig
+from botocore.config import Config
 from botocore.exceptions import ClientError
-from concurrent.futures import ThreadPoolExecutor
-from concurrent.futures import as_completed
 
 MAX_POOL_CONNECTIONS = 64
 MAX_CONCURRENCY = 64
 MAX_WORKERS = 64
-GB = 1024
+GB = 1024**3
+
 
 #########################################################################
 def chunked(ll: list, n: int) -> Generator:
     # Yields successively n-sized chunks from ll.
     for i in range(0, len(ll), n):
-        yield ll[i:i + n]
+        yield ll[i : i + n]
 
 
 class S3Manager:
     #####################################################################
     def __init__(
-
-
+        self,
+        # TODO: Need to allow passing in of credentials when writing to protected bucket
     ):
-        self.input_bucket_name = os.environ.get(
-        self.output_bucket_name = os.environ.get(
+        self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
+        self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
         self.s3_region = os.environ.get("AWS_REGION", default="us-east-1")
         self.s3_client_config = Config(max_pool_connections=MAX_POOL_CONNECTIONS)
         self.s3_transfer_config = TransferConfig(
             max_concurrency=MAX_CONCURRENCY,
             use_threads=True,
             max_bandwidth=None,
-            multipart_threshold=10 * GB
+            multipart_threshold=10 * GB,
         )
         self.s3_session = boto3.Session(
-            aws_access_key_id=os.environ.get(
-            aws_secret_access_key=os.environ.get(
+            aws_access_key_id=os.environ.get("ACCESS_KEY_ID"),
+            aws_secret_access_key=os.environ.get("SECRET_ACCESS_KEY"),
             region_name=self.s3_region,
         )
         self.s3_client = self.s3_session.client(
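
The GB = 1024**3 change is a real fix rather than formatting: in 0.0.5, GB = 1024 meant multipart_threshold=10 * GB was roughly 10 KB, pushing nearly every upload through the multipart path; it is now a true 10 GiB. A quick check with the same boto3 class used above:

    from boto3.s3.transfer import TransferConfig

    GB = 1024**3  # 0.0.5 had GB = 1024, i.e. about 1 KiB
    transfer_config = TransferConfig(
        max_concurrency=64,
        use_threads=True,
        max_bandwidth=None,
        multipart_threshold=10 * GB,  # uploads below ~10 GiB now go in one request
    )
    print(transfer_config.multipart_threshold)  # 10737418240
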
@@ -57,8 +55,8 @@ class S3Manager:
         # self.paginator = self.s3_client.get_paginator(operation_name='list_objects_v2')
         # TODO: create both "s3_client_input" and "s3_client_output" ???
         self.s3_session_noaa_wcsd_zarr_pds = boto3.Session(
-            aws_access_key_id=os.environ.get(
-            aws_secret_access_key=os.environ.get(
+            aws_access_key_id=os.environ.get("OUTPUT_BUCKET_ACCESS_KEY"),
+            aws_secret_access_key=os.environ.get("OUTPUT_BUCKET_SECRET_ACCESS_KEY"),
             region_name=self.s3_region,
         )
         self.s3_client_noaa_wcsd_zarr_pds = self.s3_session_noaa_wcsd_zarr_pds.client(
@@ -66,15 +64,15 @@ class S3Manager:
             config=self.s3_client_config,
             region_name=self.s3_region,
         )
-        self.s3_resource_noaa_wcsd_zarr_pds =
-
-
-
+        self.s3_resource_noaa_wcsd_zarr_pds = (
+            self.s3_session_noaa_wcsd_zarr_pds.resource(
+                service_name="s3",
+                config=self.s3_client_config,
+                region_name=self.s3_region,
+            )
         )
 
-    def get_client(
-        self
-    ):
+    def get_client(self):
         return self.s3_session.client(
             service_name="s3",
             config=self.__s3_client_config,
@@ -83,8 +81,8 @@ class S3Manager:
 
     #####################################################################
     def create_bucket(
-
-
+        self,
+        bucket_name: str,
     ):
         self.s3_client.create_bucket(
             Bucket=bucket_name,
@@ -95,18 +93,16 @@ class S3Manager:
         )
 
     #####################################################################
-    def list_buckets(
-        self
-    ):
+    def list_buckets(self):
         # client = self.get_client()
         client = self.s3_client
         return client.list_buckets()
 
     #####################################################################
     def upload_nodd_file(
-
-
-
+        self,
+        file_name: str,
+        key: str,
     ):
         self.s3_client_noaa_wcsd_zarr_pds.upload_file(
             Filename=file_name,
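
Note that S3Manager now resolves everything from environment variables, with a second, separately credentialed session for the NODD output bucket. A configuration sketch; the bucket names and "..." values are placeholders, while the variable names are the ones __init__ reads above:

    import os

    os.environ["INPUT_BUCKET_NAME"] = "noaa-wcsd-pds"        # placeholder
    os.environ["OUTPUT_BUCKET_NAME"] = "noaa-wcsd-zarr-pds"  # placeholder
    os.environ["ACCESS_KEY_ID"] = "..."                      # general session
    os.environ["SECRET_ACCESS_KEY"] = "..."
    os.environ["OUTPUT_BUCKET_ACCESS_KEY"] = "..."           # write access to the output bucket
    os.environ["OUTPUT_BUCKET_SECRET_ACCESS_KEY"] = "..."

    from water_column_sonar_processing.aws import S3Manager

    s3_manager = S3Manager()  # all sessions and clients are built from the environment
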
@@ -117,115 +113,120 @@ class S3Manager:
 
     #####################################################################
     def upload_files_with_thread_pool_executor(
-
-
+        self,
+        all_files: list,
     ):
         # 'all_files' is passed a list of lists: [[local_path, s3_key], [...], ...]
         all_uploads = []
         try:  # TODO: problem with threadpool here, missing child files
             with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
-                futures = [
-
-
-
-
+                futures = [
+                    executor.submit(
+                        self.upload_nodd_file,
+                        all_file[0],  # file_name
+                        all_file[1],  # key
+                    )
+                    for all_file in all_files
+                ]
                 for future in as_completed(futures):
                     result = future.result()
                     if result:
                         all_uploads.extend(result)
         except Exception as err:
             print(err)
-        print(
+        print("Done uploading files using threading pool.")
         return all_uploads
 
     #####################################################################
     def upload_zarr_files_to_bucket(  # noaa-wcsd-model-pds
-
-
-
+        self,
+        local_directory,
+        remote_directory,
     ):
         # Right now this is just for uploading a model store to s3
-        print(
+        print("Uploading files to output bucket.")
         store_name = os.path.basename(local_directory)
         all_files = []
         for subdir, dirs, files in os.walk(local_directory):
             for file in files:
                 local_path = os.path.join(subdir, file)
                 # s3_key = os.path.join(object_prefix, local_path)
-                s3_key = os.path.join(
+                s3_key = os.path.join(
+                    remote_directory,
+                    store_name,
+                    subdir.split(store_name)[-1].strip("/"),
+                )
                 all_files.append([local_path, s3_key])
 
         all_uploads = self.upload_files_with_thread_pool_executor(
             all_files=all_files,
         )
-        print(
+        print("Done uploading files to output bucket.")
         return all_uploads
 
     #####################################################################
     # used: raw-to-model
     def list_objects(  # noaa-wcsd-pds and noaa-wcsd-model-pds
-
-        bucket_name,
-        prefix
+        self, bucket_name, prefix
     ):
         # analog to "find_children_objects"
         # Returns a list of key strings for each object in bucket defined by prefix
         s3_client = self.s3_client
         keys = []
-        paginator = s3_client.get_paginator(
+        paginator = s3_client.get_paginator("list_objects_v2")
         page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix)
         for page in page_iterator:
-            if
-                keys.extend([k[
+            if "Contents" in page.keys():
+                keys.extend([k["Key"] for k in page["Contents"]])
         return keys
 
     def list_nodd_objects(  # These are used by the geometry for uploading data
-
-
+        self,
+        prefix,
     ):
         # Returns a list of key strings for each object in bucket defined by prefix
         keys = []
-        paginator = self.s3_client_noaa_wcsd_zarr_pds.get_paginator(
+        paginator = self.s3_client_noaa_wcsd_zarr_pds.get_paginator("list_objects_v2")
         for page in paginator.paginate(Bucket=self.output_bucket_name, Prefix=prefix):
-            if
-                keys.extend([k[
+            if "Contents" in page.keys():
+                keys.extend([k["Key"] for k in page["Contents"]])
         return keys
 
     #####################################################################
     # TODO: change name to "directory"
-    def folder_exists_and_not_empty(
-
-
-        path: str
-    ) -> bool:
-        if not path.endswith('/'):
-            path = path + '/'
+    def folder_exists_and_not_empty(self, bucket_name: str, path: str) -> bool:
+        if not path.endswith("/"):
+            path = path + "/"
         s3_client = self.s3_client
-        resp = self.list_objects(
-
-
+        resp = self.list_objects(
+            bucket_name=bucket_name, prefix=path
+        )  # TODO: this is returning root folder and doesn't include children or hidden folders
+        # resp = s3_client.list_objects(Bucket=bucket, Prefix=path, Delimiter='/', MaxKeys=1)
+        return "Contents" in resp
 
     #####################################################################
     # used
     def __paginate_child_objects(
-
-
-
+        self,
+        bucket_name: str,
+        sub_prefix: str = None,
     ) -> list:
-        page_iterator = self.s3_client.get_paginator(
+        page_iterator = self.s3_client.get_paginator("list_objects_v2").paginate(
+            Bucket=bucket_name, Prefix=sub_prefix
+        )
         objects = []
         for page in page_iterator:
-            if
-                objects.extend(page[
+            if "Contents" in page.keys():
+                objects.extend(page["Contents"])
         return objects
 
     def get_child_objects(
-
-
-
-
+        self,
+        bucket_name: str,
+        sub_prefix: str,
+        file_suffix: str = None,
     ) -> list:
-        print(
+        print("Getting child objects")
         raw_files = []
         try:
             children = self.__paginate_child_objects(
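
The upload path fans work out over ThreadPoolExecutor and collects results with as_completed; the TODO about missing child files is the classic failure mode when worker exceptions are swallowed, since future.result() is the point where they re-raise. A standalone sketch of the same submit/collect pattern, with a stand-in task instead of a real S3 transfer:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def upload_one(local_path: str, s3_key: str) -> str:
        # Stand-in for S3Manager.upload_nodd_file; a real worker would call
        # upload_file(Filename=local_path, Bucket=..., Key=s3_key) here.
        return s3_key

    all_files = [["/tmp/store.zarr/0.0", "data/store.zarr/0.0"],
                 ["/tmp/store.zarr/0.1", "data/store.zarr/0.1"]]

    uploaded = []
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(upload_one, f[0], f[1]) for f in all_files]
        for future in as_completed(futures):
            uploaded.append(future.result())  # result() re-raises worker exceptions

    print(sorted(uploaded))
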
@@ -238,10 +239,10 @@ class S3Manager:
             for child in children:
                 # Note: Any files with predicate 'NOISE' are to be ignored
                 # see: "Bell_M._Shimada/SH1507" cruise for more details.
-                if child[
-
-                ):
-                    raw_files.append(child[
+                if child["Key"].endswith(file_suffix) and not os.path.basename(
+                    child["Key"]
+                ).startswith("NOISE"):
+                    raw_files.append(child["Key"])
             return raw_files
         except ClientError as err:
             print(f"Problem was encountered while getting s3 files: {err}")
@@ -251,10 +252,10 @@ class S3Manager:
 
     #####################################################################
     def get_object(  # TODO: Move this to index.py
-
-
-
-
+        # noaa-wcsd-pds or noaa-wcsd-model-pds
+        self,
+        bucket_name,
+        key_name,
     ):
         # Meant for getting singular objects from a bucket, used by indexing lambda
         print(f"Getting object {key_name} from {bucket_name}")
@@ -274,18 +275,14 @@ class S3Manager:
     #####################################################################
     # used raw-to-model
     def download_file(  # TODO: change to download_object
-
-
-
-
-
+        # noaa-wcsd-pds or noaa-wcsd-model-pds
+        self,
+        bucket_name,
+        key,
+        file_name,
     ):
-        self.s3_client.download_file(
-
-            Key=key,
-            Filename=file_name
-        )
-        print('downloaded file')
+        self.s3_client.download_file(Bucket=bucket_name, Key=key, Filename=file_name)
+        print("downloaded file")
 
     #####################################################################
     # not used
@@ -299,19 +296,20 @@ class S3Manager:
 
     #####################################################################
     def delete_nodd_objects(  # nodd-bucket
-
-
+        self,
+        objects: list,
     ):
         try:
-            print(
+            print(
+                f"Deleting {len(objects)} objects in {self.output_bucket_name} in batches."
+            )
             objects_to_delete = []
             for obj in objects:
-                objects_to_delete.append({
+                objects_to_delete.append({"Key": obj["Key"]})
             # Note: request can contain a list of up to 1000 keys
             for batch in chunked(ll=objects_to_delete, n=1000):
                 self.s3_client_noaa_wcsd_zarr_pds.delete_objects(
-                    Bucket=self.output_bucket_name,
-                    Delete={'Objects': batch}
+                    Bucket=self.output_bucket_name, Delete={"Objects": batch}
                 )
             print(f"Deleted files.")
         except Exception as err:
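
delete_objects accepts at most 1,000 keys per request, which is why delete_nodd_objects feeds batches through the module-level chunked generator. A quick demonstration of the batching (key names here are illustrative):

    from collections.abc import Generator

    def chunked(ll: list, n: int) -> Generator:
        # Same helper as in s3_manager.py: yields successive n-sized chunks.
        for i in range(0, len(ll), n):
            yield ll[i : i + n]

    objects_to_delete = [{"Key": f"spatial/geojson/file_{i}.json"} for i in range(2500)]
    batches = list(chunked(ll=objects_to_delete, n=1000))
    print([len(b) for b in batches])  # [1000, 1000, 500]
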
@@ -319,38 +317,30 @@ class S3Manager:
 
     #####################################################################
     # not used TODO: remove
-    def put(  # noaa-wcsd-model-pds
-
-        bucket_name,
-        key,
-        body
-    ):
-        self.s3_client.put_object(
-            Bucket=bucket_name,
-            Key=key,
-            Body=body
-        )
+    def put(self, bucket_name, key, body):  # noaa-wcsd-model-pds
+        self.s3_client.put_object(Bucket=bucket_name, Key=key, Body=body)
 
     #####################################################################
     def read_s3_json(
-
-
-
-
-
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+        file_name_stem,
     ) -> str:
         try:
             content_object = self.s3_resource_noaa_wcsd_zarr_pds.Object(
                 bucket_name=self.output_bucket_name,
-                key=f
+                key=f"spatial/geojson/{ship_name}/{cruise_name}/{sensor_name}/{file_name_stem}.json",
             ).get()
-            file_content = content_object[
+            file_content = content_object["Body"].read().decode("utf-8")
             json_content = json.loads(file_content)
             return json_content
         except Exception as err:  # Failure
-            print(f
+            print(f"Exception encountered reading s3 GeoJSON: {err}")
             raise
 
     #####################################################################
 
+
 #########################################################################