water-column-sonar-processing 0.0.1__py3-none-any.whl → 25.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of water-column-sonar-processing might be problematic.
- water_column_sonar_processing/__init__.py +13 -0
- water_column_sonar_processing/aws/__init__.py +7 -0
- water_column_sonar_processing/aws/dynamodb_manager.py +355 -0
- water_column_sonar_processing/aws/s3_manager.py +420 -0
- water_column_sonar_processing/aws/s3fs_manager.py +72 -0
- {model → water_column_sonar_processing}/aws/sns_manager.py +10 -21
- {model → water_column_sonar_processing}/aws/sqs_manager.py +11 -19
- water_column_sonar_processing/cruise/__init__.py +4 -0
- water_column_sonar_processing/cruise/create_empty_zarr_store.py +191 -0
- water_column_sonar_processing/cruise/datatree_manager.py +21 -0
- water_column_sonar_processing/cruise/resample_regrid.py +339 -0
- water_column_sonar_processing/geometry/__init__.py +11 -0
- water_column_sonar_processing/geometry/elevation_manager.py +111 -0
- water_column_sonar_processing/geometry/geometry_manager.py +243 -0
- water_column_sonar_processing/geometry/line_simplification.py +176 -0
- water_column_sonar_processing/geometry/pmtile_generation.py +261 -0
- water_column_sonar_processing/index/__init__.py +3 -0
- water_column_sonar_processing/index/index_manager.py +384 -0
- water_column_sonar_processing/model/__init__.py +3 -0
- water_column_sonar_processing/model/zarr_manager.py +722 -0
- water_column_sonar_processing/process.py +149 -0
- water_column_sonar_processing/processing/__init__.py +4 -0
- water_column_sonar_processing/processing/raw_to_netcdf.py +320 -0
- water_column_sonar_processing/processing/raw_to_zarr.py +425 -0
- water_column_sonar_processing/utility/__init__.py +13 -0
- {model → water_column_sonar_processing}/utility/cleaner.py +7 -8
- water_column_sonar_processing/utility/constants.py +118 -0
- {model → water_column_sonar_processing}/utility/pipeline_status.py +47 -24
- water_column_sonar_processing/utility/timestamp.py +12 -0
- water_column_sonar_processing-25.11.1.dist-info/METADATA +182 -0
- water_column_sonar_processing-25.11.1.dist-info/RECORD +34 -0
- {water_column_sonar_processing-0.0.1.dist-info → water_column_sonar_processing-25.11.1.dist-info}/WHEEL +1 -1
- {water_column_sonar_processing-0.0.1.dist-info → water_column_sonar_processing-25.11.1.dist-info/licenses}/LICENSE +1 -1
- water_column_sonar_processing-25.11.1.dist-info/top_level.txt +1 -0
- __init__.py +0 -0
- model/__init__.py +0 -0
- model/aws/__init__.py +0 -0
- model/aws/dynamodb_manager.py +0 -149
- model/aws/s3_manager.py +0 -356
- model/aws/s3fs_manager.py +0 -74
- model/cruise/__init__.py +0 -0
- model/cruise/create_empty_zarr_store.py +0 -166
- model/cruise/resample_regrid.py +0 -248
- model/geospatial/__init__.py +0 -0
- model/geospatial/geometry_manager.py +0 -194
- model/geospatial/geometry_simplification.py +0 -81
- model/geospatial/pmtile_generation.py +0 -74
- model/index/__init__.py +0 -0
- model/index/index.py +0 -228
- model/model.py +0 -138
- model/utility/__init__.py +0 -0
- model/utility/constants.py +0 -56
- model/utility/timestamp.py +0 -12
- model/zarr/__init__.py +0 -0
- model/zarr/bar.py +0 -28
- model/zarr/foo.py +0 -11
- model/zarr/zarr_manager.py +0 -298
- water_column_sonar_processing-0.0.1.dist-info/METADATA +0 -89
- water_column_sonar_processing-0.0.1.dist-info/RECORD +0 -32
- water_column_sonar_processing-0.0.1.dist-info/top_level.txt +0 -2
model/aws/s3_manager.py
DELETED
@@ -1,356 +0,0 @@
-import json
-import os
-import boto3
-import pandas as pd
-from collections.abc import Generator
-
-import geopandas
-from botocore.config import Config
-from boto3.s3.transfer import TransferConfig
-from botocore.exceptions import ClientError
-from concurrent.futures import ThreadPoolExecutor
-from concurrent.futures import as_completed
-
-MAX_POOL_CONNECTIONS = 64
-MAX_CONCURRENCY = 64
-MAX_WORKERS = 64
-GB = 1024 ** 3
-
-#########################################################################
-def chunked(ll: list, n: int) -> Generator:
-    # Yields successively n-sized chunks from ll.
-    for i in range(0, len(ll), n):
-        yield ll[i:i + n]
-
-
-class S3Manager:
-    #####################################################################
-    def __init__(
-        self,
-        # TODO: Need to allow passing in of credentials when writing to protected bucket
-    ):
-        self.input_bucket_name = os.environ.get('INPUT_BUCKET_NAME')
-        self.output_bucket_name = os.environ.get('OUTPUT_BUCKET_NAME')
-        self.s3_region = os.environ.get("AWS_REGION", default="us-east-1")
-        self.s3_client_config = Config(max_pool_connections=MAX_POOL_CONNECTIONS)
-        self.s3_transfer_config = TransferConfig(
-            max_concurrency=MAX_CONCURRENCY,
-            use_threads=True,
-            max_bandwidth=None,
-            multipart_threshold=10 * GB
-        )
-        self.s3_session = boto3.Session(
-            aws_access_key_id=os.environ.get('ACCESS_KEY_ID'),
-            aws_secret_access_key=os.environ.get('SECRET_ACCESS_KEY'),
-            region_name=self.s3_region,
-        )
-        self.s3_client = self.s3_session.client(
-            service_name="s3",
-            config=self.s3_client_config,
-            region_name=self.s3_region,
-        )
-        self.s3_resource = boto3.resource(
-            service_name="s3",
-            config=self.s3_client_config,
-            region_name=self.s3_region,
-        )
-        # self.paginator = self.s3_client.get_paginator(operation_name='list_objects_v2')
-        # TODO: create both "s3_client_input" and "s3_client_output" ???
-        self.s3_session_noaa_wcsd_zarr_pds = boto3.Session(
-            aws_access_key_id=os.environ.get('OUTPUT_BUCKET_ACCESS_KEY'),
-            aws_secret_access_key=os.environ.get('OUTPUT_BUCKET_SECRET_ACCESS_KEY'),
-            region_name=self.s3_region,
-        )
-        self.s3_client_noaa_wcsd_zarr_pds = self.s3_session_noaa_wcsd_zarr_pds.client(
-            service_name="s3",
-            config=self.s3_client_config,
-            region_name=self.s3_region,
-        )
-        self.s3_resource_noaa_wcsd_zarr_pds = self.s3_session_noaa_wcsd_zarr_pds.resource(
-            service_name="s3",
-            config=self.s3_client_config,
-            region_name=self.s3_region,
-        )
-
-    def get_client(
-        self
-    ):
-        return self.s3_session.client(
-            service_name="s3",
-            config=self.__s3_client_config,
-            region_name=self.s3_region,
-        )
-
-    #####################################################################
-    def create_bucket(
-        self,
-        bucket_name: str,
-    ):
-        self.s3_client.create_bucket(
-            Bucket=bucket_name,
-            # Required when region is different then us-east-1
-            #
-            # TODO: if region is us-east-1, don't include this line somehow
-            # CreateBucketConfiguration={'LocationConstraint': self.__s3_region}
-        )
-
-    #####################################################################
-    def list_buckets(
-        self
-    ):
-        # client = self.get_client()
-        client = self.s3_client
-        return client.list_buckets()
-
-    #####################################################################
-    def upload_nodd_file(
-        self,
-        file_name: str,
-        key: str,
-    ):
-        self.s3_client_noaa_wcsd_zarr_pds.upload_file(
-            Filename=file_name,
-            Bucket=self.output_bucket_name,
-            Key=key,
-        )
-        return key
-
-    #####################################################################
-    def upload_files_with_thread_pool_executor(
-        self,
-        all_files: list,
-    ):
-        # 'all_files' is passed a list of lists: [[local_path, s3_key], [...], ...]
-        all_uploads = []
-        try:  # TODO: problem with threadpool here, missing child files
-            with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
-                futures = [executor.submit(
-                    self.upload_nodd_file,
-                    all_file[0],  # file_name
-                    all_file[1]  # key
-                ) for all_file in all_files]
-                for future in as_completed(futures):
-                    result = future.result()
-                    if result:
-                        all_uploads.extend(result)
-        except Exception as err:
-            print(err)
-        print('Done uploading files using threading pool.')
-        return all_uploads
-
-    #####################################################################
-    def upload_zarr_files_to_bucket(  # noaa-wcsd-zarr-pds
-        self,
-        local_directory,
-        remote_directory,
-    ):
-        # Right now this is just for uploading a zarr store to s3
-        print('Uploading files to output bucket.')
-        store_name = os.path.basename(local_directory)
-        all_files = []
-        for subdir, dirs, files in os.walk(local_directory):
-            for file in files:
-                local_path = os.path.join(subdir, file)
-                # s3_key = os.path.join(object_prefix, local_path)
-                s3_key = os.path.join(remote_directory, store_name, subdir.split(store_name)[-1].strip('/'))
-                all_files.append([local_path, s3_key])
-
-        all_uploads = self.upload_files_with_thread_pool_executor(
-            all_files=all_files,
-        )
-        print('Done uploading files to output bucket.')
-        return all_uploads
-
-    #####################################################################
-    # used: raw-to-zarr
-    def list_objects(  # noaa-wcsd-pds and noaa-wcsd-zarr-pds
-        self,
-        bucket_name,
-        prefix
-    ):
-        # analog to "find_children_objects"
-        # Returns a list of key strings for each object in bucket defined by prefix
-        s3_client = self.s3_client
-        keys = []
-        paginator = s3_client.get_paginator('list_objects_v2')
-        page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix)
-        for page in page_iterator:
-            if 'Contents' in page.keys():
-                keys.extend([k['Key'] for k in page['Contents']])
-        return keys
-
-    def list_nodd_objects(  # These are used by the geometry_manager for uploading data
-        self,
-        prefix,
-    ):
-        # Returns a list of key strings for each object in bucket defined by prefix
-        keys = []
-        paginator = self.s3_client_noaa_wcsd_zarr_pds.get_paginator('list_objects_v2')
-        for page in paginator.paginate(Bucket=self.output_bucket_name, Prefix=prefix):
-            if 'Contents' in page.keys():
-                keys.extend([k['Key'] for k in page['Contents']])
-        return keys
-
-    #####################################################################
-    # TODO: change name to "directory"
-    def folder_exists_and_not_empty(
-        self,
-        bucket_name: str,
-        path: str
-    ) -> bool:
-        if not path.endswith('/'):
-            path = path + '/'
-        s3_client = self.s3_client
-        resp = self.list_objects(bucket_name=bucket_name, prefix=path)  # TODO: this is returning root folder and doesn't include children or hidden folders
-        # resp = s3_client.list_objects(Bucket=bucket, Prefix=path, Delimiter='/', MaxKeys=1)
-        return 'Contents' in resp
-
-    #####################################################################
-    # used
-    def __paginate_child_objects(
-        self,
-        bucket_name: str,
-        sub_prefix: str = None,
-    ) -> list:
-        page_iterator = self.s3_client.get_paginator('list_objects_v2').paginate(Bucket=bucket_name, Prefix=sub_prefix)
-        objects = []
-        for page in page_iterator:
-            if 'Contents' in page.keys():
-                objects.extend(page['Contents'])
-        return objects
-
-    def get_child_objects(
-        self,
-        bucket_name: str,
-        sub_prefix: str,
-        file_suffix: str = None,
-    ) -> list:
-        print('Getting child objects')
-        raw_files = []
-        try:
-            children = self.__paginate_child_objects(
-                bucket_name=bucket_name,
-                sub_prefix=sub_prefix,
-            )
-            if file_suffix is None:
-                raw_files = children
-            else:
-                for child in children:
-                    # Note: Any files with predicate 'NOISE' are to be ignored
-                    # see: "Bell_M._Shimada/SH1507" cruise for more details.
-                    if child['Key'].endswith(file_suffix) and not os.path.basename(child['Key']).startswith(
-                        'NOISE'
-                    ):
-                        raw_files.append(child['Key'])
-                return raw_files
-        except ClientError as err:
-            print(f"Problem was encountered while getting s3 files: {err}")
-            raise
-        print(f"Found {len(raw_files)} files.")
-        return raw_files
-
-    #####################################################################
-    def get_object(  # TODO: Move this to index.py
-        # noaa-wcsd-pds or noaa-wcsd-zarr-pds
-        self,
-        bucket_name,
-        key_name,
-    ):
-        # Meant for getting singular objects from a bucket, used by indexing lambda
-        print(f"Getting object {key_name} from {bucket_name}")
-        try:
-            response = self.s3_client.get_object(
-                Bucket=bucket_name,
-                Key=key_name,
-            )
-            # status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
-            # if status == 200:
-        except ClientError as err:
-            print(f"Problem was encountered while getting s3 file: {err}")
-            raise
-        print(f"Done getting object {key_name} from {bucket_name}")
-        return response
-
-    #####################################################################
-    # used raw-to-zarr
-    def download_file(  # TODO: change to download_object
-        # noaa-wcsd-pds or noaa-wcsd-zarr-pds
-        self,
-        bucket_name,
-        key,
-        file_name,
-    ):
-        self.s3_client.download_file(
-            Bucket=bucket_name,
-            Key=key,
-            Filename=file_name
-        )
-        print('downloaded file')
-
-    #####################################################################
-    # not used
-    # def delete_nodd_object(  # noaa-wcsd-zarr-pds
-    #     self,
-    #     bucket_name,
-    #     key
-    # ):  # -> dict:
-    #     # return self.__s3_client.delete_object(Bucket=bucket_name, Key=key)
-    #     self.s3_client.delete_object(Bucket=bucket_name, Key=key)
-
-    #####################################################################
-    def delete_nodd_objects(  # nodd-bucket
-        self,
-        objects: list,
-    ):
-        try:
-            print(f"Deleting {len(objects)} objects in {self.output_bucket_name} in batches.")
-            objects_to_delete = []
-            for obj in objects:
-                objects_to_delete.append({'Key': obj['Key']})
-            # Note: request can contain a list of up to 1000 keys
-            for batch in chunked(ll=objects_to_delete, n=1000):
-                self.s3_client_noaa_wcsd_zarr_pds.delete_objects(
-                    Bucket=self.output_bucket_name,
-                    Delete={'Objects': batch}
-                )
-            print(f"Deleted files.")
-        except Exception as err:
-            print(f"Problem was encountered while deleting objects: {err}")
-
-    #####################################################################
-    # not used TODO: remove
-    def put(  # noaa-wcsd-zarr-pds
-        self,
-        bucket_name,
-        key,
-        body
-    ):
-        self.s3_client.put_object(
-            Bucket=bucket_name,
-            Key=key,
-            Body=body
-        )
-
-    #####################################################################
-    def read_s3_json(
-        self,
-        ship_name,
-        cruise_name,
-        sensor_name,
-        file_name_stem,
-    ) -> str:
-        try:
-            content_object = self.s3_resource_noaa_wcsd_zarr_pds.Object(
-                bucket_name=self.output_bucket_name,
-                key=f'spatial/geojson/{ship_name}/{cruise_name}/{sensor_name}/{file_name_stem}.json'
-            ).get()
-            file_content = content_object['Body'].read().decode('utf-8')
-            json_content = json.loads(file_content)
-            return json_content
-        except Exception as err:  # Failure
-            print(f'Exception encountered reading s3 GeoJSON: {err}')
-            raise
-
-    #####################################################################
-
-#########################################################################
model/aws/s3fs_manager.py
DELETED
@@ -1,74 +0,0 @@
-import os
-import s3fs
-
-
-# TODO: S3FS_LOGGING_LEVEL=DEBUG
-
-
-class S3FSManager:
-    #####################################################################
-    def __init__(
-        self,
-    ):
-        self.__s3_region = os.environ.get("AWS_REGION", default="us-east-1")
-        self.s3fs = s3fs.S3FileSystem(
-            key=os.environ.get('OUTPUT_BUCKET_ACCESS_KEY'),
-            secret=os.environ.get('OUTPUT_BUCKET_SECRET_ACCESS_KEY'),
-            # asynchronous=True
-            # use_ssl=False,
-            # skip_instance_cache=True,
-            # default_block_size='100MB',  # if no specific value is given at all time. The built-in default is 5MB
-            # client_kwargs={
-            #     "region_name": self.__s3_region
-            # }
-        )
-
-    #####################################################################
-    def add_file(
-        self,
-        filename
-    ):
-        full_path = f"{os.getenv('OUTPUT_BUCKET_NAME')}/testing/{filename}"
-        print(full_path)
-
-        self.s3fs.touch(full_path)
-        ff = self.s3fs.ls(f"{os.getenv('OUTPUT_BUCKET_NAME')}/")
-
-        print(ff)
-
-    #####################################################################
-    def upload_data(
-        self,
-        bucket_name,
-        file_path,
-        prefix
-    ):
-        # TODO: this works in theory but use boto3 to upload files
-        s3_path = f"s3://{bucket_name}/{prefix}/"
-        s3_file_system = self.s3fs
-        s3_file_system.put(file_path, s3_path, recursive=True)
-
-    #####################################################################
-    def s3_map(
-        self,
-        s3_zarr_store_path,  # f's3://{bucket}/{input_zarr_path}'
-    ):
-        # The "s3_zarr_store_path" is defined as f's3://{bucket}/{input_zarr_path}'
-        # create=False, not false because will be writing
-        # return s3fs.S3Map(root=s3_zarr_store_path, s3=self.s3fs, check=True)
-        return s3fs.S3Map(root=s3_zarr_store_path, s3=self.s3fs)  # create=False, not false because will be writing
-
-    #####################################################################
-    def exists(
-        self,
-        geo_json_s3_path,
-    ):
-        s3_file_system = self.s3fs
-        return s3_file_system.exists(path=geo_json_s3_path)
-
-    #####################################################################
-    # def put(
-    #     self
-    # ):
-    #     s3_file_system = self.s3fs
-    #     return
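The removed S3FSManager mainly existed to hand back an s3fs.S3Map mapping over an S3 prefix so a Zarr store could be opened directly from the bucket. A short sketch of how it could have been driven, assuming the OUTPUT_BUCKET_* credentials are available; the store path is illustrative only:

from model.aws.s3fs_manager import S3FSManager  # module path as it existed in 0.0.1

s3fs_manager = S3FSManager()
store_path = "s3://noaa-wcsd-zarr-pds/level_2/Henry_B._Bigelow/HB0806/EK60/HB0806.zarr"  # illustrative path
if s3fs_manager.exists(geo_json_s3_path=store_path):
    # The returned S3Map is a MutableMapping that zarr or xarray can use as a store.
    store = s3fs_manager.s3_map(s3_zarr_store_path=store_path)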
model/cruise/__init__.py
DELETED
File without changes

model/cruise/create_empty_zarr_store.py
DELETED

@@ -1,166 +0,0 @@
-import os
-import numcodecs
-import numpy as np
-from ..utility.cleaner import Cleaner
-from ..aws.dynamodb_manager import DynamoDBManager
-from ..aws.s3_manager import S3Manager
-from ..zarr.zarr_manager import ZarrManager
-
-numcodecs.blosc.use_threads = False
-numcodecs.blosc.set_nthreads(1)
-
-TEMPDIR = "/tmp"
-
-# TODO: when ready switch to version 3 of zarr spec
-# ZARR_V3_EXPERIMENTAL_API = 1
-# creates the latlon data: foo = ep.consolidate.add_location(ds_Sv, echodata)
-
-class CreateEmptyZarrStore:
-    #######################################################
-    def __init__(
-        self,
-    ):
-        self.__overwrite = True
-        # TODO: create output_bucket and input_bucket variables here?
-        self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
-        self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
-
-    #######################################################
-
-    def upload_zarr_store_to_s3(
-        self,
-        local_directory: str,
-        object_prefix: str,
-        cruise_name: str,
-    ) -> None:
-        print('uploading zarr store to s3')
-        s3_manager = S3Manager()
-        #
-        print('Starting upload with thread pool executor.')
-        # # 'all_files' is passed a list of lists: [[local_path, s3_key], [...], ...]
-        all_files = []
-        for subdir, dirs, files in os.walk(f"{local_directory}/{cruise_name}.zarr"):
-            for file in files:
-                local_path = os.path.join(subdir, file)
-                # 'level_2/Henry_B._Bigelow/HB0806/EK60/HB0806.zarr/.zattrs'
-                s3_key = f'{object_prefix}/{cruise_name}.zarr{local_path.split(f"{cruise_name}.zarr")[-1]}'
-                all_files.append([local_path, s3_key])
-        #
-        # print(all_files)
-        s3_manager.upload_files_with_thread_pool_executor(
-            all_files=all_files,
-        )
-        print('Done uploading with thread pool executor.')
-        # TODO: move to common place
-
-    #######################################################
-    def create_cruise_level_zarr_store(
-        self,
-        ship_name: str,
-        cruise_name: str,
-        sensor_name: str,
-        table_name: str
-    ) -> None:
-        try:
-            # HB0806 - 123, HB0903 - 220
-            dynamo_db_manager = DynamoDBManager()
-
-            df = dynamo_db_manager.get_table_as_df(
-                table_name=table_name,
-                ship_name=ship_name,
-                cruise_name=cruise_name,
-                sensor_name=sensor_name
-            )
-
-            # filter the dataframe just for enums >= LEVEL_1_PROCESSING
-            # df[df['PIPELINE_STATUS'] < PipelineStatus.LEVEL_1_PROCESSING] = np.nan
-
-            # TODO: VERIFY GEOJSON EXISTS as prerequisite!!!
-
-            print(f"DataFrame shape: {df.shape}")
-            cruise_channels = list(set([i for sublist in df['CHANNELS'].dropna() for i in sublist]))
-            cruise_channels.sort()
-
-            consolidated_zarr_width = np.sum(df['NUM_PING_TIME_DROPNA'].dropna().astype(int))
-
-            # [3] calculate the max/min measurement resolutions for the whole cruise
-            cruise_min_echo_range = float(np.min(df['MIN_ECHO_RANGE'].dropna().astype(float)))
-
-            # [4] calculate the maximum of the max depth values
-            cruise_max_echo_range = float(np.max(df['MAX_ECHO_RANGE'].dropna().astype(float)))
-            print(f"cruise_min_echo_range: {cruise_min_echo_range}, cruise_max_echo_range: {cruise_max_echo_range}")
-
-            # [5] get number of channels
-            cruise_frequencies = [float(i) for i in df['FREQUENCIES'].dropna().values.flatten()[0]]
-            print(cruise_frequencies)
-
-            new_width = int(consolidated_zarr_width)
-            print(f"new_width: {new_width}")
-            #################################################################
-            store_name = f"{cruise_name}.zarr"
-            print(store_name)
-            ################################################################
-            # Delete existing zarr store if it exists
-            s3_manager = S3Manager()
-            zarr_prefix = os.path.join("level_2", ship_name, cruise_name, sensor_name)
-            child_objects = s3_manager.get_child_objects(
-                bucket_name=self.output_bucket_name,
-                sub_prefix=zarr_prefix,
-            )
-            if len(child_objects) > 0:
-                s3_manager.delete_nodd_objects(
-                    objects=child_objects,
-                )
-            ################################################################
-            # Create new zarr store
-            zarr_manager = ZarrManager()
-            new_height = len(zarr_manager.get_depth_values(
-                min_echo_range=cruise_min_echo_range,
-                max_echo_range=cruise_max_echo_range
-            ))
-            print(f"new_height: {new_height}")
-
-            zarr_manager.create_zarr_store(
-                path=TEMPDIR,
-                ship_name=ship_name,
-                cruise_name=cruise_name,
-                sensor_name=sensor_name,
-                frequencies=cruise_frequencies,
-                width=new_width,
-                min_echo_range=cruise_min_echo_range,
-                max_echo_range=cruise_max_echo_range,
-                calibration_status=True,
-            )
-            #################################################################
-            self.upload_zarr_store_to_s3(
-                local_directory=TEMPDIR,
-                object_prefix=zarr_prefix,
-                cruise_name=cruise_name,
-            )
-            # https://noaa-wcsd-zarr-pds.s3.amazonaws.com/index.html
-            #################################################################
-            # Verify count of the files uploaded
-            # count = self.__get_file_count(store_name=store_name)
-            # #
-            # raw_zarr_files = self.__get_s3_files(  # TODO: just need count
-            #     bucket_name=self.__output_bucket,
-            #     sub_prefix=os.path.join(zarr_prefix, store_name),
-            # )
-            # if len(raw_zarr_files) != count:
-            #     print(f'Problem writing {store_name} with proper count {count}.')
-            #     raise Exception("File count doesnt equal number of s3 Zarr store files.")
-            # else:
-            #     print("File counts match.")
-            #################################################################
-            # Success
-            # TODO: update enum in dynamodb
-            #################################################################
-        except Exception as err:
-            print(f"Problem trying to create new cruise zarr store: {err}")
-        finally:
-            cleaner = Cleaner()
-            cleaner.delete_local_files()
-            print("Done creating cruise level zarr store")
-
-
-###########################################################