water-column-sonar-processing 0.0.1__py3-none-any.whl → 26.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of water-column-sonar-processing might be problematic.
- water_column_sonar_processing/__init__.py +13 -0
- water_column_sonar_processing/aws/__init__.py +7 -0
- water_column_sonar_processing/aws/dynamodb_manager.py +355 -0
- water_column_sonar_processing/aws/s3_manager.py +418 -0
- water_column_sonar_processing/aws/s3fs_manager.py +64 -0
- {model → water_column_sonar_processing}/aws/sns_manager.py +10 -21
- {model → water_column_sonar_processing}/aws/sqs_manager.py +11 -19
- water_column_sonar_processing/cruise/__init__.py +4 -0
- water_column_sonar_processing/cruise/create_empty_zarr_store.py +129 -0
- water_column_sonar_processing/cruise/datatree_manager.py +21 -0
- water_column_sonar_processing/cruise/resample_regrid.py +323 -0
- water_column_sonar_processing/geometry/__init__.py +13 -0
- water_column_sonar_processing/geometry/elevation_manager.py +111 -0
- water_column_sonar_processing/geometry/geometry_manager.py +241 -0
- water_column_sonar_processing/geometry/line_simplification.py +176 -0
- water_column_sonar_processing/geometry/pmtile_generation.py +266 -0
- water_column_sonar_processing/geometry/spatiotemporal.py +106 -0
- water_column_sonar_processing/index/__init__.py +3 -0
- water_column_sonar_processing/index/index_manager.py +381 -0
- water_column_sonar_processing/model/__init__.py +3 -0
- water_column_sonar_processing/model/zarr_manager.py +741 -0
- water_column_sonar_processing/processing/__init__.py +4 -0
- water_column_sonar_processing/processing/raw_to_netcdf.py +320 -0
- water_column_sonar_processing/processing/raw_to_zarr.py +331 -0
- water_column_sonar_processing/utility/__init__.py +13 -0
- {model → water_column_sonar_processing}/utility/cleaner.py +7 -7
- water_column_sonar_processing/utility/constants.py +118 -0
- {model → water_column_sonar_processing}/utility/pipeline_status.py +47 -24
- water_column_sonar_processing/utility/timestamp.py +12 -0
- water_column_sonar_processing-26.1.14.dist-info/METADATA +240 -0
- water_column_sonar_processing-26.1.14.dist-info/RECORD +34 -0
- {water_column_sonar_processing-0.0.1.dist-info → water_column_sonar_processing-26.1.14.dist-info}/WHEEL +1 -1
- {water_column_sonar_processing-0.0.1.dist-info → water_column_sonar_processing-26.1.14.dist-info/licenses}/LICENSE +1 -1
- water_column_sonar_processing-26.1.14.dist-info/top_level.txt +1 -0
- __init__.py +0 -0
- model/__init__.py +0 -0
- model/aws/__init__.py +0 -0
- model/aws/dynamodb_manager.py +0 -149
- model/aws/s3_manager.py +0 -356
- model/aws/s3fs_manager.py +0 -74
- model/cruise/__init__.py +0 -0
- model/cruise/create_empty_zarr_store.py +0 -166
- model/cruise/resample_regrid.py +0 -248
- model/geospatial/__init__.py +0 -0
- model/geospatial/geometry_manager.py +0 -194
- model/geospatial/geometry_simplification.py +0 -81
- model/geospatial/pmtile_generation.py +0 -74
- model/index/__init__.py +0 -0
- model/index/index.py +0 -228
- model/model.py +0 -138
- model/utility/__init__.py +0 -0
- model/utility/constants.py +0 -56
- model/utility/timestamp.py +0 -12
- model/zarr/__init__.py +0 -0
- model/zarr/bar.py +0 -28
- model/zarr/foo.py +0 -11
- model/zarr/zarr_manager.py +0 -298
- water_column_sonar_processing-0.0.1.dist-info/METADATA +0 -89
- water_column_sonar_processing-0.0.1.dist-info/RECORD +0 -32
- water_column_sonar_processing-0.0.1.dist-info/top_level.txt +0 -2
model/aws/dynamodb_manager.py
DELETED
@@ -1,149 +0,0 @@
-import os
-import boto3
-import pandas as pd
-from boto3.dynamodb.types import TypeSerializer, TypeDeserializer
-
-
-#########################################################################
-class DynamoDBManager:
-    #####################################################################
-    def __init__(self):
-        self.__dynamodb_session = boto3.Session(
-            aws_access_key_id=os.environ.get('ACCESS_KEY_ID'),
-            aws_secret_access_key=os.environ.get('SECRET_ACCESS_KEY'),
-            region_name=os.environ.get("AWS_REGION", default="us-east-1")
-        )
-        self.__dynamodb_resource = self.__dynamodb_session.resource(
-            service_name="dynamodb",
-        )
-        self.__dynamodb_client = self.__dynamodb_session.client(
-            service_name="dynamodb",
-        )
-        self.type_serializer = TypeSerializer()  # https://stackoverflow.com/a/46738251
-        self.type_deserializer = TypeDeserializer()
-
-    #####################################################################
-    ### defined in raw-to-zarr, not used
-    # def put_item(
-    #     self,
-    #     table_name,
-    #     item
-    # ):
-    #     response = boto3.Session().client(service_name='dynamodb').put_item(TableName=table_name, Item=item)
-    #     status_code = response['ResponseMetadata']['HTTPStatusCode']
-    #     assert (status_code == 200), "Problem, unable to update dynamodb table."
-
-    #####################################################################
-    def create_table(
-        self,
-        table_name,
-        key_schema,
-        attribute_definitions,
-    ):
-        self.__dynamodb_client.create_table(
-            AttributeDefinitions=attribute_definitions,
-            TableName=table_name,
-            KeySchema=key_schema,
-            BillingMode="PAY_PER_REQUEST",  # "PROVISIONED",
-            # ProvisionedThroughput={
-            #     'ReadCapacityUnits': 1_000,
-            #     'WriteCapacityUnits': 1_000
-            # }
-        )
-
-    #####################################################################
-    def get_item(
-        self,
-        table_name,
-        key
-    ):
-        response = self.__dynamodb_client.get_item(TableName=table_name, Key=key)
-        item = None
-        if response['ResponseMetadata']['HTTPStatusCode'] == 200:
-            if 'Item' in response:
-                item = response['Item']
-        return item
-
-    #####################################################################
-    def update_item(
-        self,
-        table_name,
-        key,
-        expression_attribute_names,
-        expression_attribute_values,
-        update_expression
-    ):
-        response = self.__dynamodb_client.update_item(
-            TableName=table_name,
-            Key=key,
-            ExpressionAttributeNames=expression_attribute_names,
-            ExpressionAttributeValues=expression_attribute_values,
-            UpdateExpression=update_expression
-        )
-        status_code = response['ResponseMetadata']['HTTPStatusCode']
-        # TODO: change to exception
-        assert (status_code == 200), "Problem, unable to update dynamodb table."
-
-    #####################################################################
-    def get_table_as_df(
-        self,
-        ship_name,
-        cruise_name,
-        sensor_name,
-        table_name,
-    ):
-        expression_attribute_values = {
-            ':cr': {'S': cruise_name},
-            ':se': {'S': sensor_name},
-            ':sh': {'S': ship_name},
-        }
-
-        filter_expression = 'CRUISE_NAME = :cr and SENSOR_NAME = :se and SHIP_NAME = :sh'
-        response = self.__dynamodb_client.scan(
-            TableName=table_name,
-            Select='ALL_ATTRIBUTES',
-            ExpressionAttributeValues=expression_attribute_values,
-            FilterExpression=filter_expression,
-        )
-        # Note: table.scan() has 1 MB limit on results so pagination is used
-        data = response['Items']
-
-        while 'LastEvaluatedKey' in response:
-            response = self.__dynamodb_client.scan(
-                TableName=table_name,
-                Select='ALL_ATTRIBUTES',
-                ExpressionAttributeValues=expression_attribute_values,
-                FilterExpression=filter_expression,
-                ExclusiveStartKey=response['LastEvaluatedKey']
-            )
-            data.extend(response['Items'])
-
-        deserializer = self.type_deserializer
-        df = pd.DataFrame([deserializer.deserialize({"M": i}) for i in data])
-
-        return df.sort_values(by='START_TIME', ignore_index=True)
-
-    #####################################################################
-    # is this used?
-    def get_table_item(
-        self,
-        table_name,
-        key,
-    ):
-        # a bit more high level, uses resource to get table item
-        table = self.__dynamodb_resource.Table(table_name)
-        response = table.get_item(
-            Key=key
-        )
-        return response
-
-    #####################################################################
-    # TODO: add helper method to delete the data
-    def delete_cruise(
-        self,
-        table_name,
-        cruise_name,
-    ):
-        pass
-
-#########################################################################
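A minimal usage sketch of the deleted DynamoDBManager, assuming the ACCESS_KEY_ID, SECRET_ACCESS_KEY, and AWS_REGION environment variables are set; the table name and sensor name below are hypothetical, while the ship and cruise names come from comments elsewhere in this release:

from model.aws.dynamodb_manager import DynamoDBManager

manager = DynamoDBManager()
# get_table_as_df() scans the whole table (paginating past the 1 MB
# scan limit), filters to one ship/cruise/sensor combination, and
# returns a pandas DataFrame sorted by START_TIME.
df = manager.get_table_as_df(
    ship_name="Bell_M._Shimada",
    cruise_name="SH1507",
    sensor_name="EK60",         # hypothetical sensor name
    table_name="echofish-dev",  # hypothetical table name
)
print(df.shape)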
model/aws/s3_manager.py
DELETED
@@ -1,356 +0,0 @@
-import json
-import os
-import boto3
-import pandas as pd
-from collections.abc import Generator
-
-import geopandas
-from botocore.config import Config
-from boto3.s3.transfer import TransferConfig
-from botocore.exceptions import ClientError
-from concurrent.futures import ThreadPoolExecutor
-from concurrent.futures import as_completed
-
-MAX_POOL_CONNECTIONS = 64
-MAX_CONCURRENCY = 64
-MAX_WORKERS = 64
-GB = 1024 ** 3
-
-#########################################################################
-def chunked(ll: list, n: int) -> Generator:
-    # Yields successively n-sized chunks from ll.
-    for i in range(0, len(ll), n):
-        yield ll[i:i + n]
-
-
-class S3Manager:
-    #####################################################################
-    def __init__(
-        self,
-        # TODO: Need to allow passing in of credentials when writing to protected bucket
-    ):
-        self.input_bucket_name = os.environ.get('INPUT_BUCKET_NAME')
-        self.output_bucket_name = os.environ.get('OUTPUT_BUCKET_NAME')
-        self.s3_region = os.environ.get("AWS_REGION", default="us-east-1")
-        self.s3_client_config = Config(max_pool_connections=MAX_POOL_CONNECTIONS)
-        self.s3_transfer_config = TransferConfig(
-            max_concurrency=MAX_CONCURRENCY,
-            use_threads=True,
-            max_bandwidth=None,
-            multipart_threshold=10 * GB
-        )
-        self.s3_session = boto3.Session(
-            aws_access_key_id=os.environ.get('ACCESS_KEY_ID'),
-            aws_secret_access_key=os.environ.get('SECRET_ACCESS_KEY'),
-            region_name=self.s3_region,
-        )
-        self.s3_client = self.s3_session.client(
-            service_name="s3",
-            config=self.s3_client_config,
-            region_name=self.s3_region,
-        )
-        self.s3_resource = boto3.resource(
-            service_name="s3",
-            config=self.s3_client_config,
-            region_name=self.s3_region,
-        )
-        # self.paginator = self.s3_client.get_paginator(operation_name='list_objects_v2')
-        # TODO: create both "s3_client_input" and "s3_client_output" ???
-        self.s3_session_noaa_wcsd_zarr_pds = boto3.Session(
-            aws_access_key_id=os.environ.get('OUTPUT_BUCKET_ACCESS_KEY'),
-            aws_secret_access_key=os.environ.get('OUTPUT_BUCKET_SECRET_ACCESS_KEY'),
-            region_name=self.s3_region,
-        )
-        self.s3_client_noaa_wcsd_zarr_pds = self.s3_session_noaa_wcsd_zarr_pds.client(
-            service_name="s3",
-            config=self.s3_client_config,
-            region_name=self.s3_region,
-        )
-        self.s3_resource_noaa_wcsd_zarr_pds = self.s3_session_noaa_wcsd_zarr_pds.resource(
-            service_name="s3",
-            config=self.s3_client_config,
-            region_name=self.s3_region,
-        )
-
-    def get_client(
-        self
-    ):
-        return self.s3_session.client(
-            service_name="s3",
-            config=self.__s3_client_config,
-            region_name=self.s3_region,
-        )
-
-    #####################################################################
-    def create_bucket(
-        self,
-        bucket_name: str,
-    ):
-        self.s3_client.create_bucket(
-            Bucket=bucket_name,
-            # Required when region is different then us-east-1
-            #
-            # TODO: if region is us-east-1, don't include this line somehow
-            # CreateBucketConfiguration={'LocationConstraint': self.__s3_region}
-        )
-
-    #####################################################################
-    def list_buckets(
-        self
-    ):
-        # client = self.get_client()
-        client = self.s3_client
-        return client.list_buckets()
-
-    #####################################################################
-    def upload_nodd_file(
-        self,
-        file_name: str,
-        key: str,
-    ):
-        self.s3_client_noaa_wcsd_zarr_pds.upload_file(
-            Filename=file_name,
-            Bucket=self.output_bucket_name,
-            Key=key,
-        )
-        return key
-
-    #####################################################################
-    def upload_files_with_thread_pool_executor(
-        self,
-        all_files: list,
-    ):
-        # 'all_files' is passed a list of lists: [[local_path, s3_key], [...], ...]
-        all_uploads = []
-        try:  # TODO: problem with threadpool here, missing child files
-            with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
-                futures = [executor.submit(
-                    self.upload_nodd_file,
-                    all_file[0],  # file_name
-                    all_file[1]  # key
-                ) for all_file in all_files]
-                for future in as_completed(futures):
-                    result = future.result()
-                    if result:
-                        all_uploads.extend(result)
-        except Exception as err:
-            print(err)
-        print('Done uploading files using threading pool.')
-        return all_uploads
-
-    #####################################################################
-    def upload_zarr_files_to_bucket(  # noaa-wcsd-zarr-pds
-        self,
-        local_directory,
-        remote_directory,
-    ):
-        # Right now this is just for uploading a zarr store to s3
-        print('Uploading files to output bucket.')
-        store_name = os.path.basename(local_directory)
-        all_files = []
-        for subdir, dirs, files in os.walk(local_directory):
-            for file in files:
-                local_path = os.path.join(subdir, file)
-                # s3_key = os.path.join(object_prefix, local_path)
-                s3_key = os.path.join(remote_directory, store_name, subdir.split(store_name)[-1].strip('/'))
-                all_files.append([local_path, s3_key])
-
-        all_uploads = self.upload_files_with_thread_pool_executor(
-            all_files=all_files,
-        )
-        print('Done uploading files to output bucket.')
-        return all_uploads
-
-    #####################################################################
-    # used: raw-to-zarr
-    def list_objects(  # noaa-wcsd-pds and noaa-wcsd-zarr-pds
-        self,
-        bucket_name,
-        prefix
-    ):
-        # analog to "find_children_objects"
-        # Returns a list of key strings for each object in bucket defined by prefix
-        s3_client = self.s3_client
-        keys = []
-        paginator = s3_client.get_paginator('list_objects_v2')
-        page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix)
-        for page in page_iterator:
-            if 'Contents' in page.keys():
-                keys.extend([k['Key'] for k in page['Contents']])
-        return keys
-
-    def list_nodd_objects(  # These are used by the geometry_manager for uploading data
-        self,
-        prefix,
-    ):
-        # Returns a list of key strings for each object in bucket defined by prefix
-        keys = []
-        paginator = self.s3_client_noaa_wcsd_zarr_pds.get_paginator('list_objects_v2')
-        for page in paginator.paginate(Bucket=self.output_bucket_name, Prefix=prefix):
-            if 'Contents' in page.keys():
-                keys.extend([k['Key'] for k in page['Contents']])
-        return keys
-
-    #####################################################################
-    # TODO: change name to "directory"
-    def folder_exists_and_not_empty(
-        self,
-        bucket_name: str,
-        path: str
-    ) -> bool:
-        if not path.endswith('/'):
-            path = path + '/'
-        s3_client = self.s3_client
-        resp = self.list_objects(bucket_name=bucket_name, prefix=path)  # TODO: this is returning root folder and doesn't include children or hidden folders
-        # resp = s3_client.list_objects(Bucket=bucket, Prefix=path, Delimiter='/', MaxKeys=1)
-        return 'Contents' in resp
-
-    #####################################################################
-    # used
-    def __paginate_child_objects(
-        self,
-        bucket_name: str,
-        sub_prefix: str = None,
-    ) -> list:
-        page_iterator = self.s3_client.get_paginator('list_objects_v2').paginate(Bucket=bucket_name, Prefix=sub_prefix)
-        objects = []
-        for page in page_iterator:
-            if 'Contents' in page.keys():
-                objects.extend(page['Contents'])
-        return objects
-
-    def get_child_objects(
-        self,
-        bucket_name: str,
-        sub_prefix: str,
-        file_suffix: str = None,
-    ) -> list:
-        print('Getting child objects')
-        raw_files = []
-        try:
-            children = self.__paginate_child_objects(
-                bucket_name=bucket_name,
-                sub_prefix=sub_prefix,
-            )
-            if file_suffix is None:
-                raw_files = children
-            else:
-                for child in children:
-                    # Note: Any files with predicate 'NOISE' are to be ignored
-                    # see: "Bell_M._Shimada/SH1507" cruise for more details.
-                    if child['Key'].endswith(file_suffix) and not os.path.basename(child['Key']).startswith(
-                        'NOISE'
-                    ):
-                        raw_files.append(child['Key'])
-            return raw_files
-        except ClientError as err:
-            print(f"Problem was encountered while getting s3 files: {err}")
-            raise
-        print(f"Found {len(raw_files)} files.")
-        return raw_files
-
-    #####################################################################
-    def get_object(  # TODO: Move this to index.py
-        # noaa-wcsd-pds or noaa-wcsd-zarr-pds
-        self,
-        bucket_name,
-        key_name,
-    ):
-        # Meant for getting singular objects from a bucket, used by indexing lambda
-        print(f"Getting object {key_name} from {bucket_name}")
-        try:
-            response = self.s3_client.get_object(
-                Bucket=bucket_name,
-                Key=key_name,
-            )
-            # status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
-            # if status == 200:
-        except ClientError as err:
-            print(f"Problem was encountered while getting s3 file: {err}")
-            raise
-        print(f"Done getting object {key_name} from {bucket_name}")
-        return response
-
-    #####################################################################
-    # used raw-to-zarr
-    def download_file(  # TODO: change to download_object
-        # noaa-wcsd-pds or noaa-wcsd-zarr-pds
-        self,
-        bucket_name,
-        key,
-        file_name,
-    ):
-        self.s3_client.download_file(
-            Bucket=bucket_name,
-            Key=key,
-            Filename=file_name
-        )
-        print('downloaded file')
-
-    #####################################################################
-    # not used
-    # def delete_nodd_object(  # noaa-wcsd-zarr-pds
-    #     self,
-    #     bucket_name,
-    #     key
-    # ):  # -> dict:
-    #     # return self.__s3_client.delete_object(Bucket=bucket_name, Key=key)
-    #     self.s3_client.delete_object(Bucket=bucket_name, Key=key)
-
-    #####################################################################
-    def delete_nodd_objects(  # nodd-bucket
-        self,
-        objects: list,
-    ):
-        try:
-            print(f"Deleting {len(objects)} objects in {self.output_bucket_name} in batches.")
-            objects_to_delete = []
-            for obj in objects:
-                objects_to_delete.append({'Key': obj['Key']})
-            # Note: request can contain a list of up to 1000 keys
-            for batch in chunked(ll=objects_to_delete, n=1000):
-                self.s3_client_noaa_wcsd_zarr_pds.delete_objects(
-                    Bucket=self.output_bucket_name,
-                    Delete={'Objects': batch}
-                )
-            print(f"Deleted files.")
-        except Exception as err:
-            print(f"Problem was encountered while deleting objects: {err}")
-
-    #####################################################################
-    # not used TODO: remove
-    def put(  # noaa-wcsd-zarr-pds
-        self,
-        bucket_name,
-        key,
-        body
-    ):
-        self.s3_client.put_object(
-            Bucket=bucket_name,
-            Key=key,
-            Body=body
-        )
-
-    #####################################################################
-    def read_s3_json(
-        self,
-        ship_name,
-        cruise_name,
-        sensor_name,
-        file_name_stem,
-    ) -> str:
-        try:
-            content_object = self.s3_resource_noaa_wcsd_zarr_pds.Object(
-                bucket_name=self.output_bucket_name,
-                key=f'spatial/geojson/{ship_name}/{cruise_name}/{sensor_name}/{file_name_stem}.json'
-            ).get()
-            file_content = content_object['Body'].read().decode('utf-8')
-            json_content = json.loads(file_content)
-            return json_content
-        except Exception as err:  # Failure
-            print(f'Exception encountered reading s3 GeoJSON: {err}')
-            raise
-
-    #####################################################################
-
-#########################################################################
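The 1,000-key batching in delete_nodd_objects is driven by the module-level chunked generator; here is a self-contained sketch of that batching logic, with no AWS calls and made-up keys:

def chunked(ll: list, n: int):
    # Yield successive n-sized slices of ll.
    for i in range(0, len(ll), n):
        yield ll[i:i + n]

# delete_objects() accepts at most 1,000 keys per request, so 2,500
# keys must be split into three requests of 1000 + 1000 + 500.
objects_to_delete = [{"Key": f"spatial/geojson/file_{i}.json"} for i in range(2500)]
batches = list(chunked(ll=objects_to_delete, n=1000))
assert [len(b) for b in batches] == [1000, 1000, 500]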
model/aws/s3fs_manager.py
DELETED
@@ -1,74 +0,0 @@
-import os
-import s3fs
-
-
-# TODO: S3FS_LOGGING_LEVEL=DEBUG
-
-
-class S3FSManager:
-    #####################################################################
-    def __init__(
-        self,
-    ):
-        self.__s3_region = os.environ.get("AWS_REGION", default="us-east-1")
-        self.s3fs = s3fs.S3FileSystem(
-            key=os.environ.get('OUTPUT_BUCKET_ACCESS_KEY'),
-            secret=os.environ.get('OUTPUT_BUCKET_SECRET_ACCESS_KEY'),
-            # asynchronous=True
-            # use_ssl=False,
-            # skip_instance_cache=True,
-            # default_block_size='100MB',  # if no specific value is given at all time. The built-in default is 5MB
-            # client_kwargs={
-            #     "region_name": self.__s3_region
-            # }
-        )
-
-    #####################################################################
-    def add_file(
-        self,
-        filename
-    ):
-        full_path = f"{os.getenv('OUTPUT_BUCKET_NAME')}/testing/{filename}"
-        print(full_path)
-
-        self.s3fs.touch(full_path)
-        ff = self.s3fs.ls(f"{os.getenv('OUTPUT_BUCKET_NAME')}/")
-
-        print(ff)
-
-    #####################################################################
-    def upload_data(
-        self,
-        bucket_name,
-        file_path,
-        prefix
-    ):
-        # TODO: this works in theory but use boto3 to upload files
-        s3_path = f"s3://{bucket_name}/{prefix}/"
-        s3_file_system = self.s3fs
-        s3_file_system.put(file_path, s3_path, recursive=True)
-
-    #####################################################################
-    def s3_map(
-        self,
-        s3_zarr_store_path,  # f's3://{bucket}/{input_zarr_path}'
-    ):
-        # The "s3_zarr_store_path" is defined as f's3://{bucket}/{input_zarr_path}'
-        # create=False, not false because will be writing
-        # return s3fs.S3Map(root=s3_zarr_store_path, s3=self.s3fs, check=True)
-        return s3fs.S3Map(root=s3_zarr_store_path, s3=self.s3fs)  # create=False, not false because will be writing
-
-    #####################################################################
-    def exists(
-        self,
-        geo_json_s3_path,
-    ):
-        s3_file_system = self.s3fs
-        return s3_file_system.exists(path=geo_json_s3_path)
-
-    #####################################################################
-    # def put(
-    #     self
-    # ):
-    #     s3_file_system = self.s3fs
-    #     return
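The s3_map helper wraps a store path in an s3fs.S3Map, the MutableMapping interface that Zarr readers expect. A sketch of how a remote store could be opened through it, assuming the OUTPUT_BUCKET_* credentials are set; the store key below is hypothetical, though the noaa-wcsd-zarr-pds bucket is named in s3_manager.py:

import xarray as xr
from model.aws.s3fs_manager import S3FSManager

s3fs_manager = S3FSManager()
# S3Map presents the object store as a key/value mapping; xarray's
# Zarr backend reads metadata eagerly and chunk data lazily.
store = s3fs_manager.s3_map("s3://noaa-wcsd-zarr-pds/example/SH1507.zarr")  # hypothetical key
ds = xr.open_zarr(store)
print(ds)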
model/cruise/__init__.py
DELETED
File without changes