water-column-sonar-processing 25.1.7__py3-none-any.whl → 25.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of water-column-sonar-processing might be problematic.
- water_column_sonar_processing/aws/dynamodb_manager.py +27 -32
- water_column_sonar_processing/aws/s3_manager.py +52 -64
- water_column_sonar_processing/aws/s3fs_manager.py +3 -9
- water_column_sonar_processing/cruise/create_empty_zarr_store.py +14 -14
- water_column_sonar_processing/cruise/datatree_manager.py +3 -6
- water_column_sonar_processing/cruise/resample_regrid.py +67 -49
- water_column_sonar_processing/geometry/__init__.py +7 -2
- water_column_sonar_processing/geometry/elevation_manager.py +16 -17
- water_column_sonar_processing/geometry/geometry_manager.py +25 -25
- water_column_sonar_processing/geometry/line_simplification.py +150 -0
- water_column_sonar_processing/geometry/pmtile_generation.py +99 -64
- water_column_sonar_processing/index/index_manager.py +67 -32
- water_column_sonar_processing/model/zarr_manager.py +54 -22
- water_column_sonar_processing/process.py +15 -13
- water_column_sonar_processing/processing/__init__.py +2 -2
- water_column_sonar_processing/processing/batch_downloader.py +66 -41
- water_column_sonar_processing/processing/raw_to_zarr.py +121 -82
- water_column_sonar_processing/utility/constants.py +17 -2
- water_column_sonar_processing/utility/pipeline_status.py +11 -15
- {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.1.dist-info}/METADATA +21 -12
- water_column_sonar_processing-25.3.1.dist-info/RECORD +34 -0
- {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.1.dist-info}/WHEEL +1 -1
- water_column_sonar_processing/geometry/geometry_simplification.py +0 -82
- water_column_sonar_processing-25.1.7.dist-info/RECORD +0 -34
- {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.1.dist-info/licenses}/LICENSE +0 -0
- {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.1.dist-info}/top_level.txt +0 -0
water_column_sonar_processing/aws/dynamodb_manager.py

@@ -9,8 +9,8 @@ from boto3.dynamodb.types import TypeDeserializer, TypeSerializer
 class DynamoDBManager:
     #####################################################################
     def __init__(
-
-
+        self,
+        # endpoint_url
     ):
         # self.endpoint_url = endpoint_url
         self.dynamodb_session = boto3.Session(
@@ -62,7 +62,7 @@ class DynamoDBManager:
                 {"AttributeName": "FILE_NAME", "AttributeType": "S"},
                 {"AttributeName": "CRUISE_NAME", "AttributeType": "S"},
             ],
-            BillingMode="PAY_PER_REQUEST"
+            BillingMode="PAY_PER_REQUEST",
             # ProvisionedThroughput={
            #     'ReadCapacityUnits': 1_000,
            #     'WriteCapacityUnits': 1_000
@@ -70,7 +70,9 @@ class DynamoDBManager:
         )
         # TODO: after creating status is 'CREATING', wait until 'ACTIVE'
         response = self.dynamodb_client.describe_table(TableName=table_name)
-        print(
+        print(
+            response
+        )  # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/dynamodb/client/describe_table.html
         # sleep then response['Table']['TableStatus'] == 'ACTIVE'
 
     #####################################################################
@@ -111,7 +113,7 @@ class DynamoDBManager:
         expression_attribute_names,
         expression_attribute_values,
         update_expression,
-    ):
+    ):  # TODO: convert to boolean
        try:
            response = self.dynamodb_client.update_item(
                TableName=table_name,
@@ -120,7 +122,7 @@ class DynamoDBManager:
                ExpressionAttributeValues=expression_attribute_values,
                UpdateExpression=update_expression,
            )
-
+            return response["ResponseMetadata"]["HTTPStatusCode"]  # TODO: should be 200
            # print(f"HTTPStatusCode: {status_code}")
            # assert status_code == 200, "Problem, unable to update dynamodb table."
            # assert response['ConsumedCapacity']['TableName'] == table_name
@@ -131,22 +133,23 @@ class DynamoDBManager:
     # TODO: change to "get_cruise_as_df"
     def get_table_as_df(
         self,
-        ship_name,
+        # ship_name,
         cruise_name,
-        sensor_name,
+        # sensor_name,
         table_name,
     ) -> pd.DataFrame:
         """
         To be used to initialize a cruise, deletes all entries associated with that cruise
         in the database.
+        #TODO: cruise names isn't good enough, there could be two instrument for a cruise...
         """
         filter_expression = "CRUISE_NAME = :cr"
         response = self.dynamodb_client.scan(
             TableName=table_name,
             # Limit=1000,
-            Select=
+            Select="ALL_ATTRIBUTES",  # or 'SPECIFIC_ATTRIBUTES',
             # ExclusiveStartKey=where to pick up
-            #ReturnConsumedCapacity='INDEXES' | 'TOTAL' | 'NONE', ...not sure
+            # ReturnConsumedCapacity='INDEXES' | 'TOTAL' | 'NONE', ...not sure
             # ProjectionExpression='#SH, #CR, #FN', # what to specifically return — from expression_attribute_names
             FilterExpression=filter_expression,
             # ExpressionAttributeNames={
@@ -154,36 +157,36 @@ class DynamoDBManager:
             #     '#CR': 'CRUISE_NAME',
             #     '#FN': 'FILE_NAME',
             # },
-            ExpressionAttributeValues={
-
-
+            ExpressionAttributeValues={  # criteria
+                ":cr": {
+                    "S": cruise_name,
                 },
             },
-            ConsistentRead=True
+            ConsistentRead=True,
             # ExclusiveStartKey=response["LastEvaluatedKey"],
         )
         # Note: table.scan() has 1 MB limit on results so pagination is used
 
         if len(response["Items"]) == 0 and "LastEvaluatedKey" not in response:
-            return pd.DataFrame()
+            return pd.DataFrame()  # If no results, return empty dataframe
 
         data = response["Items"]
 
-        while response.get(
+        while response.get("LastEvaluatedKey"):  # "LastEvaluatedKey" in response:
             response = self.dynamodb_client.scan(
                 TableName=table_name,
                 ### Either 'Select' or 'ExpressionAttributeNames'/'ProjectionExpression'
-                Select=
+                Select="ALL_ATTRIBUTES",  # or 'SPECIFIC_ATTRIBUTES',
                 FilterExpression=filter_expression,
-                #ProjectionExpression='#SH, #CR, #FN', # what to specifically return — from expression_attribute_names
+                # ProjectionExpression='#SH, #CR, #FN', # what to specifically return — from expression_attribute_names
                 # ExpressionAttributeNames={ # would need to specify all cols in df
                 #     '#SH': 'SHIP_NAME',
                 #     '#CR': 'CRUISE_NAME',
                 #     '#FN': 'FILE_NAME',
                 # },
                 ExpressionAttributeValues={  # criteria
-
-
+                    ":cr": {
+                        "S": cruise_name,
                     },
                 },
                 ConsistentRead=True,
@@ -268,14 +271,7 @@ class DynamoDBManager:
         Finds all rows associated with a cruise and deletes them.
         """
         response = self.dynamodb_client.delete_item(
-            Key={
-                "CRUISE_NAME": {
-                    "S": cruise_name
-                },
-                "FILE_NAME": {
-                    "S": file_name
-                }
-            },
+            Key={"CRUISE_NAME": {"S": cruise_name}, "FILE_NAME": {"S": file_name}},
             TableName=table_name,
             ReturnConsumedCapacity="TOTALS",
         )
@@ -286,8 +282,8 @@ class DynamoDBManager:
 
     #####################################################################
     def describe_table(
-
-
+        self,
+        table_name,
     ):
         """
         Get a description of the table. Used to verify that records were added/removed.
@@ -296,8 +292,6 @@ class DynamoDBManager:
         print(response)
         return response
 
-
-
     #####################################################################
     # TODO: from test_raw_to_zarr get enum and use here
     # def __update_processing_status(
@@ -357,4 +351,5 @@ class DynamoDBManager:
     # )
     # print("Done updating processing status.")
 
+
     #########################################################################
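The `get_table_as_df` hunks restore arguments the diff viewer had truncated (`Select=`, the `ExpressionAttributeValues` criteria, the `while response.get(` condition) and keep the pagination loop: a DynamoDB Scan returns at most 1 MB of items per call, and `LastEvaluatedKey` signals that another page remains. A minimal standalone sketch of that pattern, with an illustrative function name and the same `CRUISE_NAME` filter; the boto3 calls themselves are real, but this is not the package's actual method:

import boto3
import pandas as pd
from boto3.dynamodb.types import TypeDeserializer


def scan_cruise_items(table_name: str, cruise_name: str) -> pd.DataFrame:
    # Illustrative sketch, not the package's API.
    client = boto3.client("dynamodb")
    scan_kwargs = dict(
        TableName=table_name,
        Select="ALL_ATTRIBUTES",
        FilterExpression="CRUISE_NAME = :cr",
        ExpressionAttributeValues={":cr": {"S": cruise_name}},
        ConsistentRead=True,
    )
    response = client.scan(**scan_kwargs)
    items = response["Items"]
    while "LastEvaluatedKey" in response:  # each Scan page is capped at 1 MB
        response = client.scan(ExclusiveStartKey=response["LastEvaluatedKey"], **scan_kwargs)
        items.extend(response["Items"])
    # Deserialize DynamoDB's typed JSON ({"S": ...}) into plain Python values.
    deserializer = TypeDeserializer()
    rows = [{k: deserializer.deserialize(v) for k, v in item.items()} for item in items]
    return pd.DataFrame(rows)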
water_column_sonar_processing/aws/s3_manager.py

@@ -1,10 +1,10 @@
 import json
 import os
-import boto3
-from typing import Optional
 from collections.abc import Generator
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Optional
 
+import boto3
 import botocore
 from boto3.s3.transfer import TransferConfig
 from botocore.config import Config
@@ -17,10 +17,7 @@ GB = 1024**3
 
 
 #########################################################################
-def chunked(
-    ll: list,
-    n: int
-) -> Generator:
+def chunked(ll: list, n: int) -> Generator:
     # Yields successively n-sized chunks from ll.
     for i in range(0, len(ll), n):
         yield ll[i : i + n]
@@ -70,14 +67,18 @@ class S3Manager:
             region_name=self.s3_region,
             endpoint_url=self.endpoint_url,
         )
-        self.s3_resource_noaa_wcsd_zarr_pds =
-
-
-
-
+        self.s3_resource_noaa_wcsd_zarr_pds = (
+            self.s3_session_noaa_wcsd_zarr_pds.resource(
+                service_name="s3",
+                config=self.s3_client_config,
+                region_name=self.s3_region,
+                endpoint_url=self.endpoint_url,
+            )
+        )
+        self.paginator = self.s3_client.get_paginator("list_objects_v2")
+        self.paginator_noaa_wcsd_zarr_pds = (
+            self.s3_client_noaa_wcsd_zarr_pds.get_paginator("list_objects_v2")
         )
-        self.paginator = self.s3_client.get_paginator('list_objects_v2')
-        self.paginator_noaa_wcsd_zarr_pds = self.s3_client_noaa_wcsd_zarr_pds.get_paginator('list_objects_v2')
 
         # def get_client(self): # TODO: do i need this?
         #     return self.s3_session.client(
@@ -96,21 +97,18 @@ class S3Manager:
         buckets. It allows public read of all objects.
         """
         # https://github.com/aodn/aodn_cloud_optimised/blob/e5035495e782783cc8b9e58711d63ed466420350/test_aodn_cloud_optimised/test_schema.py#L7
-        public_policy = {
-
-
-
-
-
-
-
-
-
-        }
-        response1 = self.s3_client.create_bucket(
-            Bucket=bucket_name,
-            ACL='public-read'
-        )
+        # public_policy = {
+        #     "Version": "2012-10-17",
+        #     "Statement": [
+        #         {
+        #             "Effect": "Allow",
+        #             "Principal": "*",
+        #             "Action": "s3:GetObject",
+        #             "Resource": f"arn:aws:s3:::{bucket_name}/*",
+        #         }
+        #     ],
+        # }
+        response1 = self.s3_client.create_bucket(Bucket=bucket_name, ACL="public-read")
         print(response1)
         # response = self.s3_client.put_bucket_policy(
         #     Bucket=bucket_name, Policy=json.dumps(public_policy)
@@ -133,7 +131,9 @@ class S3Manager:
         """
         Used to upload a single file, e.g. the GeoJSON file to the NODD bucket
         """
-        self.s3_resource_noaa_wcsd_zarr_pds.Bucket(output_bucket_name).upload_file(
+        self.s3_resource_noaa_wcsd_zarr_pds.Bucket(output_bucket_name).upload_file(
+            Filename=file_name, Key=key
+        )
         return key
 
     #####################################################################
@@ -167,10 +167,10 @@ class S3Manager:
     #####################################################################
     # TODO: this uses resource, try to use client
     def upload_file(
-
-
-
-
+        self,
+        filename: str,
+        bucket_name: str,
+        key: str,
     ):
         # self.s3_client.upload_file(Filename=filename, Bucket=bucket, Key=key)
         self.s3_resource.Bucket(bucket_name).upload_file(Filename=filename, Key=key)
@@ -205,11 +205,7 @@ class S3Manager:
         return all_uploads
 
     #####################################################################
-    def check_if_object_exists(
-        self,
-        bucket_name,
-        key_name
-    ) -> bool:
+    def check_if_object_exists(self, bucket_name, key_name) -> bool:
         s3_manager2 = S3Manager()
         s3_manager2.list_objects(bucket_name=bucket_name, prefix=key_name)
         s3_client_noaa_wcsd_zarr_pds = self.s3_client_noaa_wcsd_zarr_pds
@@ -217,10 +213,10 @@ class S3Manager:
         # response = s3_resource_noaa_wcsd_zarr_pds.Object(bucket_name, key_name).load()
             s3_client_noaa_wcsd_zarr_pds.head_object(Bucket=bucket_name, Key=key_name)
         except botocore.exceptions.ClientError as e:
-            if e.response[
+            if e.response["Error"]["Code"] == "404":
                 # The object does not exist.
                 return False
-            elif e.response[
+            elif e.response["Error"]["Code"] == 403:
                 # Unauthorized, including invalid bucket
                 return False
             else:
@@ -230,11 +226,7 @@ class S3Manager:
 
     #####################################################################
     # used: raw-to-zarr
-    def list_objects(  # noaa-wcsd-pds and noaa-wcsd-zarr-pds
-        self,
-        bucket_name,
-        prefix
-    ):
+    def list_objects(self, bucket_name, prefix):  # noaa-wcsd-pds and noaa-wcsd-zarr-pds
         # TODO: this isn't working for geojson detecting objects!!!!!!!
         # analog to "find_children_objects"
         # Returns a list of key strings for each object in bucket defined by prefix
@@ -261,14 +253,10 @@ class S3Manager:
 
     #####################################################################
     # TODO: change name to "directory"
-    def folder_exists_and_not_empty(
-        self,
-        bucket_name: str,
-        path: str
-    ) -> bool:
+    def folder_exists_and_not_empty(self, bucket_name: str, path: str) -> bool:
         if not path.endswith("/"):
             path = path + "/"
-        s3_client = self.s3_client
+        # s3_client = self.s3_client
         resp = self.list_objects(
             bucket_name=bucket_name, prefix=path
         )  # TODO: this is returning root folder and doesn't include children or hidden folders
@@ -350,7 +338,7 @@
         self,
         bucket_name,
         key,
-        file_name,
+        file_name,  # where the file will be saved
     ):
         self.s3_client.download_file(Bucket=bucket_name, Key=key, Filename=file_name)
         # TODO: if bottom file doesn't exist, don't fail downloader
@@ -364,9 +352,7 @@
         objects: list,
     ):
         try:
-            print(
-                f"Deleting {len(objects)} objects in {bucket_name} in batches."
-            )
+            print(f"Deleting {len(objects)} objects in {bucket_name} in batches.")
             objects_to_delete = []
             for obj in objects:
                 objects_to_delete.append({"Key": obj["Key"]})
@@ -375,29 +361,31 @@
                 self.s3_client_noaa_wcsd_zarr_pds.delete_objects(
                     Bucket=bucket_name, Delete={"Objects": batch}
                 )
-            print(
+            print("Deleted files.")
         except Exception as err:
             print(f"Problem was encountered while deleting objects: {err}")
 
     #####################################################################
     # TODO: need to test this!!!
     def delete_nodd_object(
-
-
-
+        self,
+        bucket_name,
+        key_name,
     ):
         try:
-            print(
-
+            print(f"Deleting {key_name} objects in {bucket_name}.")
+            self.s3_client_noaa_wcsd_zarr_pds.delete_object(
+                Bucket=bucket_name, Key=key_name
             )
-
-            print(f"Deleted file.")
+            print("Deleted file.")
         except Exception as err:
             print(f"Problem was encountered while deleting objects: {err}")
 
     #####################################################################
     def put(self, bucket_name, key, body):  # noaa-wcsd-model-pds
-        self.s3_client.put_object(
+        self.s3_client.put_object(
+            Bucket=bucket_name, Key=key, Body=body
+        )  # "Body" can be a file
 
     #####################################################################
     def read_s3_json(
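One behavioral detail survives the cleanup of `check_if_object_exists`: the new code compares `e.response["Error"]["Code"]` against the string `"404"` but the bare integer `403`. boto3 reports the error code as a string, so the `403` branch as written can never match. A hedged sketch of the `head_object` existence probe with both codes handled as strings; the function name is illustrative, not the package's API:

import boto3
import botocore


def object_exists(bucket_name: str, key_name: str) -> bool:
    # Illustrative sketch: HEAD is a cheap probe that never downloads the body.
    s3_client = boto3.client("s3")
    try:
        s3_client.head_object(Bucket=bucket_name, Key=key_name)
        return True
    except botocore.exceptions.ClientError as e:
        # The code arrives as a string: "404" missing, "403" access denied.
        if e.response["Error"]["Code"] in ("404", "403"):
            return False
        raise  # surface throttling, networking, and other unexpected errors

The `chunked` generator reformatted above exists on the delete path for a similar protocol reason: `delete_objects` accepts at most 1,000 keys per request, so the object list is deleted in fixed-size batches.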
water_column_sonar_processing/aws/s3fs_manager.py

@@ -1,5 +1,6 @@
 import os
 from typing import Optional
+
 import s3fs
 
 # TODO: S3FS_LOGGING_LEVEL=DEBUG
@@ -38,8 +39,7 @@ class S3FSManager:
         # create=False, not false because will be writing
         # return s3fs.S3Map(root=s3_zarr_store_path, s3=self.s3fs, check=True)
         return s3fs.S3Map(
-            root=s3_zarr_store_path,
-            s3=self.s3fs
+            root=s3_zarr_store_path, s3=self.s3fs
         )  # create=False, not false because will be writing
 
     #####################################################################
@@ -53,12 +53,7 @@ class S3FSManager:
         # print(ff)
 
     #####################################################################
-    def upload_data(
-        self,
-        bucket_name,
-        file_path,
-        prefix
-    ):
+    def upload_data(self, bucket_name, file_path, prefix):
         # TODO: this works in theory but use boto3 to upload files
         s3_path = f"s3://{bucket_name}/{prefix}/"
         s3_file_system = self.s3fs
@@ -72,5 +67,4 @@ class S3FSManager:
         # s3_file_system =
         return self.s3fs.exists(s3_path)
 
-
     #####################################################################
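The `S3Map` call reformatted above returns s3fs's `MutableMapping` view of a bucket prefix, which is what Zarr uses as a store. A short usage sketch; the bucket and store prefix are placeholders:

import s3fs
import zarr

fs = s3fs.S3FileSystem()  # credentials resolved from the environment
# Map the prefix to a MutableMapping that zarr reads and writes directly on S3.
store = s3fs.S3Map(root="s3://example-bucket/level_2/HB0806.zarr", s3=fs)
root = zarr.open(store, mode="r")  # mode="a" when the store will be written
print(root)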
water_column_sonar_processing/cruise/create_empty_zarr_store.py

@@ -4,8 +4,7 @@ import tempfile
 import numcodecs
 import numpy as np
 
-from water_column_sonar_processing.aws import DynamoDBManager
-from water_column_sonar_processing.aws import S3Manager
+from water_column_sonar_processing.aws import DynamoDBManager, S3Manager
 from water_column_sonar_processing.model import ZarrManager
 from water_column_sonar_processing.utility import Cleaner
 
@@ -42,14 +41,12 @@ class CreateEmptyZarrStore:
         print("Starting upload with thread pool executor.")
         # # 'all_files' is passed a list of lists: [[local_path, s3_key], [...], ...]
         all_files = []
-        for subdir, dirs, files in os.walk(
-            f"{local_directory}/{cruise_name}.zarr"
-        ):
+        for subdir, dirs, files in os.walk(f"{local_directory}/{cruise_name}.zarr"):
             for file in files:
                 local_path = os.path.join(subdir, file)
                 # TODO: find a better method for splitting strings here:
                 # 'level_2/Henry_B._Bigelow/HB0806/EK60/HB0806.zarr/.zattrs'
-                s3_key = f
+                s3_key = f"{object_prefix}/{cruise_name}.zarr{local_path.split(f'{cruise_name}.zarr')[-1]}"
                 all_files.append([local_path, s3_key])
         #
         # print(all_files)
@@ -77,9 +74,7 @@ class CreateEmptyZarrStore:
 
         df = dynamo_db_manager.get_table_as_df(
             table_name=table_name,
-            ship_name=ship_name,
             cruise_name=cruise_name,
-            sensor_name=sensor_name,
         )
 
         # TODO: filter the dataframe just for enums >= LEVEL_1_PROCESSING
@@ -98,14 +93,17 @@ class CreateEmptyZarrStore:
         )
 
         # [3] calculate the max/min measurement resolutions for the whole cruise
-        cruise_min_echo_range =
-
+        cruise_min_echo_range = np.min(
+            (df["MIN_ECHO_RANGE"] + df["WATER_LEVEL"]).dropna().astype(float)
         )
 
         # [4] calculate the maximum of the max depth values
-        cruise_max_echo_range =
-
+        cruise_max_echo_range = np.max(
+            (df["MAX_ECHO_RANGE"] + df["WATER_LEVEL"]).dropna().astype(float)
         )
+
+        cruise_min_epsilon = np.min(df["MIN_ECHO_RANGE"].dropna().astype(float))
+
         print(
             f"cruise_min_echo_range: {cruise_min_echo_range}, cruise_max_echo_range: {cruise_max_echo_range}"
         )
@@ -140,12 +138,13 @@ class CreateEmptyZarrStore:
             zarr_manager.get_depth_values(
                 min_echo_range=cruise_min_echo_range,
                 max_echo_range=cruise_max_echo_range,
+                cruise_min_epsilon=cruise_min_epsilon,
             )
         )
         print(f"new_height: {new_height}")
 
         zarr_manager.create_zarr_store(
-            path=tempdir.name,
+            path=tempdir.name,  # TODO: need to use .name or problem
             ship_name=ship_name,
             cruise_name=cruise_name,
             sensor_name=sensor_name,
@@ -153,12 +152,13 @@ class CreateEmptyZarrStore:
             width=new_width,
             min_echo_range=cruise_min_echo_range,
             max_echo_range=cruise_max_echo_range,
+            cruise_min_epsilon=cruise_min_epsilon,
             calibration_status=True,
         )
         #################################################################
         self.upload_zarr_store_to_s3(
             output_bucket_name=output_bucket_name,
-            local_directory=tempdir.name,
+            local_directory=tempdir.name,  # TODO: need to use .name or problem
             object_prefix=zarr_prefix,
             cruise_name=cruise_name,
         )
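The `create_empty_zarr_store` hunks complete two reductions the viewer had truncated and introduce a third value: the cruise-wide extents now add each file's `WATER_LEVEL` offset to its echo range, and a new `cruise_min_epsilon` (the smallest `MIN_ECHO_RANGE` across files) is threaded into both `get_depth_values` and `create_zarr_store`. A sketch of the computation with made-up sample values:

import numpy as np
import pandas as pd

# Illustrative stand-in for the per-file cruise metadata pulled from DynamoDB.
df = pd.DataFrame(
    {
        "MIN_ECHO_RANGE": [0.25, 0.50, 0.25],
        "MAX_ECHO_RANGE": [249.8, 499.7, 249.8],
        "WATER_LEVEL": [7.5, 7.5, 7.5],
    }
)

# Offset each file's echo range by its water level, then take cruise-wide extents.
cruise_min_echo_range = np.min((df["MIN_ECHO_RANGE"] + df["WATER_LEVEL"]).dropna().astype(float))
cruise_max_echo_range = np.max((df["MAX_ECHO_RANGE"] + df["WATER_LEVEL"]).dropna().astype(float))

# Smallest per-file resolution, passed downstream when sizing the depth grid.
cruise_min_epsilon = np.min(df["MIN_ECHO_RANGE"].dropna().astype(float))

print(cruise_min_echo_range, cruise_max_echo_range, cruise_min_epsilon)  # 7.75 507.2 0.25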
water_column_sonar_processing/cruise/datatree_manager.py

@@ -1,7 +1,7 @@
 ### https://xarray-datatree.readthedocs.io/en/latest/data-structures.html
-import numpy as np
-from datatree import DataTree
 import xarray as xr
+from datatree import DataTree
+
 
 class DatatreeManager:
     #######################################################
@@ -17,8 +17,5 @@
     ) -> None:
         ds1 = xr.Dataset({"foo": "orange"})
         dt = DataTree(name="root", data=ds1)  # create root node
-        ds2 = xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])})
+        # ds2 = xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])})
         return dt
-
-
-
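`DatatreeManager` follows the linked xarray-datatree docs: a `DataTree` node wraps an `xr.Dataset`, and child nodes attach by passing `parent=`. A short sketch mirroring that docs example (note the method above is annotated `-> None` yet returns `dt`):

import xarray as xr
from datatree import DataTree  # xarray-datatree package

ds_root = xr.Dataset({"foo": "orange"})
dt = DataTree(name="root", data=ds_root)  # create root node

# Attach a child node under /child, reusing the shape of the commented-out ds2.
ds_child = xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])})
DataTree(name="child", parent=dt, data=ds_child)

print(dt)  # renders the tree: root plus one child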