water-column-sonar-processing 25.3.2__py3-none-any.whl → 25.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of water-column-sonar-processing might be problematic.
- water_column_sonar_processing/aws/dynamodb_manager.py +6 -6
- water_column_sonar_processing/aws/s3_manager.py +95 -90
- water_column_sonar_processing/aws/s3fs_manager.py +5 -3
- water_column_sonar_processing/aws/sqs_manager.py +1 -1
- water_column_sonar_processing/cruise/__init__.py +2 -1
- water_column_sonar_processing/cruise/create_empty_zarr_store.py +49 -43
- water_column_sonar_processing/cruise/create_empty_zarr_store_level_3.py +161 -0
- water_column_sonar_processing/cruise/datatree_manager.py +21 -21
- water_column_sonar_processing/cruise/resample_regrid.py +57 -47
- water_column_sonar_processing/dataset/__init__.py +3 -0
- water_column_sonar_processing/dataset/dataset_manager.py +205 -0
- water_column_sonar_processing/dataset/feature_manager.py +32 -0
- water_column_sonar_processing/geometry/geometry_manager.py +11 -12
- water_column_sonar_processing/geometry/line_simplification.py +26 -1
- water_column_sonar_processing/geometry/pmtile_generation.py +211 -247
- water_column_sonar_processing/index/index_manager.py +18 -17
- water_column_sonar_processing/model/zarr_manager.py +504 -256
- water_column_sonar_processing/processing/__init__.py +3 -2
- water_column_sonar_processing/processing/batch_downloader.py +11 -11
- water_column_sonar_processing/processing/raw_to_netcdf.py +319 -0
- water_column_sonar_processing/processing/raw_to_zarr.py +41 -31
- water_column_sonar_processing/utility/__init__.py +9 -2
- water_column_sonar_processing/utility/cleaner.py +1 -2
- water_column_sonar_processing/utility/constants.py +26 -7
- water_column_sonar_processing/utility/timestamp.py +1 -0
- water_column_sonar_processing-25.8.0.dist-info/METADATA +162 -0
- water_column_sonar_processing-25.8.0.dist-info/RECORD +39 -0
- {water_column_sonar_processing-25.3.2.dist-info → water_column_sonar_processing-25.8.0.dist-info}/WHEEL +1 -1
- water_column_sonar_processing-25.3.2.dist-info/licenses/LICENSE → water_column_sonar_processing-25.8.0.dist-info/licenses/LICENSE-MIT +1 -1
- water_column_sonar_processing-25.3.2.dist-info/METADATA +0 -170
- water_column_sonar_processing-25.3.2.dist-info/RECORD +0 -34
- {water_column_sonar_processing-25.3.2.dist-info → water_column_sonar_processing-25.8.0.dist-info}/top_level.txt +0 -0
water_column_sonar_processing/processing/__init__.py
@@ -1,5 +1,6 @@
 # from .cruise_sampler import CruiseSampler
 from .batch_downloader import BatchDownloader
-from .
+from .raw_to_netcdf import RawToNetCDF
+from .raw_to_zarr import RawToZarr, get_water_level

-__all__ = ["RawToZarr", "BatchDownloader"]
+__all__ = ["RawToZarr", "get_water_level", "RawToNetCDF", "BatchDownloader"]
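With this release the processing package exports the new NetCDF converter and the water-level helper alongside the existing classes. A minimal sketch of the widened import surface (module path and names are taken from the hunk above; everything else is illustrative):

```python
# Names exported by water_column_sonar_processing.processing as of 25.8.0
from water_column_sonar_processing.processing import (
    BatchDownloader,
    RawToNetCDF,
    RawToZarr,
    get_water_level,
)

converter = RawToNetCDF()  # takes no required constructor arguments per the new __init__
```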
water_column_sonar_processing/processing/batch_downloader.py
@@ -10,7 +10,7 @@ import xbatcher

 class BatchDownloader:
     """
-    Uses the xbatcher XbatchDownloader to download
+    Uses the xbatcher XbatchDownloader to download dataset from an xarray dataset. Connection
     is established
     """

@@ -50,13 +50,13 @@ class BatchDownloader:

     def get_toy_batch_generator(self) -> xbatcher.BatchGenerator:
         """
-        Returns a BatchGenerator with subsets of Sv
-        Note: this is synthetic
+        Returns a BatchGenerator with subsets of Sv dataset
+        Note: this is synthetic dataset, for a smaller toy example
         """
         depth = np.arange(1, 21)  # N meters
         time = pd.date_range(start="2025-01-01", end="2025-01-31", freq="D")  # N days
         frequency = [1_000, 2_000, 3_000]  # N frequencies
-        Sv = np.random.rand(len(depth), len(time), len(frequency))  # synthetic
+        Sv = np.random.rand(len(depth), len(time), len(frequency))  # synthetic dataset
         cruise = xr.Dataset(
             data_vars={"Sv": (["depth", "time", "frequency"], Sv)},
             coords={
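For reference, the toy generator boils down to wrapping a synthetic Sv cube in `xbatcher.BatchGenerator`. A standalone sketch of that pattern (the 5x7 window size below is an illustrative choice, not necessarily what the class uses):

```python
import numpy as np
import pandas as pd
import xarray as xr
import xbatcher

# Synthetic Sv cube: depth x time x frequency, mirroring get_toy_batch_generator
depth = np.arange(1, 21)
time = pd.date_range(start="2025-01-01", end="2025-01-31", freq="D")
frequency = [1_000, 2_000, 3_000]
Sv = np.random.rand(len(depth), len(time), len(frequency))

cruise = xr.Dataset(
    data_vars={"Sv": (["depth", "time", "frequency"], Sv)},
    coords={"depth": depth, "time": time, "frequency": frequency},
)

# Yield fixed-size windows over depth and time; frequency is kept whole in each batch
batch_generator = xbatcher.BatchGenerator(
    cruise.Sv,
    input_dims={"depth": 5, "time": 7},
)
for batch in batch_generator:
    print(batch.shape)  # expected (5, 7, 3)
```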
@@ -84,10 +84,10 @@ class BatchDownloader:
         return batch_generator

     def get_s3_batch_generator(self) -> xbatcher.BatchGenerator:
-        """Returns a BatchGenerator with subsets of Sv
+        """Returns a BatchGenerator with subsets of Sv dataset from s3 Zarr store"""
         cruise = self.get_s3_zarr_store()

-        # TODO: temporarily limits to a smaller slice of the
+        # TODO: temporarily limits to a smaller slice of the dataset
         cruise_select = (
             cruise.where(cruise.depth < 100.0, drop=True).sel(
                 time=slice("2007-07-11T18:20:33", "2007-07-11T18:20:53")
@@ -111,19 +111,19 @@ class BatchDownloader:
             preload_batch=False,
         )

-        # TODO: need to raise exception if all the
+        # TODO: need to raise exception if all the dataset is nan

         return batch_generator
         # https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_generator

     def get_s3_manual_batch_generator(self):
         """
-        Using just xarray (no xbatcher), iterate through the
-        Returns a BatchGenerator with subsets of Sv
+        Using just xarray (no xbatcher), iterate through the dataset and generate batches.
+        Returns a BatchGenerator with subsets of Sv dataset from s3 Zarr store.
         """
         cruise = self.get_s3_zarr_store()

-        # TODO: temporarily limits to a smaller slice of the
+        # TODO: temporarily limits to a smaller slice of the dataset
         cruise_select = cruise.where(cruise.depth < 100.0, drop=True).sel(
             time=slice("2007-07-11T18:20:33", "2007-07-11T18:20:53")
         )
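Both S3-backed generators first open the cruise Zarr store with xarray and then trim it with `where`/`sel` before batching. A hedged sketch of that access pattern (the bucket and store path below are placeholders, not the package's actual layout, which comes from `get_s3_zarr_store`):

```python
import s3fs
import xarray as xr

# Placeholder store location for illustration only
s3 = s3fs.S3FileSystem(anon=True)
store = s3fs.S3Map(
    root="s3://example-bucket/level_2/SHIP/CRUISE/SENSOR/CRUISE.zarr", s3=s3
)
cruise = xr.open_zarr(store)

# Same trimming the generators apply before handing the data to xbatcher
cruise_select = cruise.where(cruise.depth < 100.0, drop=True).sel(
    time=slice("2007-07-11T18:20:33", "2007-07-11T18:20:53")
)
print(cruise_select.Sv.sizes)
```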
@@ -143,7 +143,7 @@ class BatchDownloader:
             preload_batch=True,
         )

-        # TODO: need to raise exception if all the
+        # TODO: need to raise exception if all the dataset is nan

         return batch_generator
         # https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_generator
water_column_sonar_processing/processing/raw_to_netcdf.py (new file)
@@ -0,0 +1,319 @@
+import gc
+import os
+from datetime import datetime
+from pathlib import Path  # , PurePath
+
+import echopype as ep
+import numcodecs
+import numpy as np
+from numcodecs import Blosc
+
+from water_column_sonar_processing.aws import DynamoDBManager, S3Manager
+from water_column_sonar_processing.geometry import GeometryManager
+from water_column_sonar_processing.utility import Cleaner
+
+
+# This code is getting copied from echofish-aws-raw-to-zarr-lambda
+class RawToNetCDF:
+    #######################################################
+    def __init__(
+        self,
+        # output_bucket_access_key,
+        # output_bucket_secret_access_key,
+        # # overwrite_existing_zarr_store,
+    ):
+        # TODO: revert to Blosc.BITSHUFFLE, troubleshooting misc error
+        self.__compressor = Blosc(cname="zstd", clevel=2)  # shuffle=Blosc.NOSHUFFLE
+        self.__overwrite = True
+        self.__num_threads = numcodecs.blosc.get_nthreads()
+        # self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
+        # self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
+        # self.__table_name = table_name
+        # # self.__overwrite_existing_zarr_store = overwrite_existing_zarr_store
+
+    ############################################################################
+    ############################################################################
+    def __netcdf_info_to_table(
+        self,
+        # output_bucket_name,
+        table_name,
+        ship_name,
+        cruise_name,
+        sensor_name,
+        file_name,
+        # zarr_path,
+        min_echo_range,
+        max_echo_range,
+        num_ping_time_dropna,
+        start_time,
+        end_time,
+        frequencies,
+        channels,
+        water_level,
+    ):
+        print("Writing Zarr information to DynamoDB table.")
+        dynamodb_manager = DynamoDBManager()
+        dynamodb_manager.update_item(
+            table_name=table_name,
+            key={
+                "FILE_NAME": {"S": file_name},  # Partition Key
+                "CRUISE_NAME": {"S": cruise_name},  # Sort Key
+            },
+            expression_attribute_names={
+                "#CH": "CHANNELS",
+                "#ET": "END_TIME",
+                # "#ED": "ERROR_DETAIL",
+                "#FR": "FREQUENCIES",
+                "#MA": "MAX_ECHO_RANGE",
+                "#MI": "MIN_ECHO_RANGE",
+                "#ND": "NUM_PING_TIME_DROPNA",
+                # "#PS": "PIPELINE_STATUS",
+                "#PT": "PIPELINE_TIME",
+                "#SE": "SENSOR_NAME",
+                "#SH": "SHIP_NAME",
+                "#ST": "START_TIME",
+                # "#ZB": "ZARR_BUCKET",
+                # "#ZP": "ZARR_PATH",
+                "#WL": "WATER_LEVEL",
+            },
+            expression_attribute_values={
+                ":ch": {"L": [{"S": i} for i in channels]},
+                ":et": {"S": end_time},
+                # ":ed": {"S": ""},
+                ":fr": {"L": [{"N": str(i)} for i in frequencies]},
+                ":ma": {"N": str(np.round(max_echo_range, 4))},
+                ":mi": {"N": str(np.round(min_echo_range, 4))},
+                ":nd": {"N": str(num_ping_time_dropna)},
+                # ":ps": {"S": "PROCESSING_RESAMPLE_AND_WRITE_TO_ZARR_STORE"},
+                # ":ps": {"S": PipelineStatus.LEVEL_1_PROCESSING.name},
+                ":pt": {"S": datetime.now().isoformat(timespec="seconds") + "Z"},
+                ":se": {"S": sensor_name},
+                ":sh": {"S": ship_name},
+                ":st": {"S": start_time},
+                ":wl": {"N": str(np.round(water_level, 2))},
+                # ":zb": {"S": output_bucket_name},
+                # ":zp": {"S": zarr_path},
+            },
+            update_expression=(
+                "SET "
+                "#CH = :ch, "
+                "#ET = :et, "
+                # "#ED = :ed, "
+                "#FR = :fr, "
+                "#MA = :ma, "
+                "#MI = :mi, "
+                "#ND = :nd, "
+                # "#PS = :ps, "
+                "#PT = :pt, "
+                "#SE = :se, "
+                "#SH = :sh, "
+                "#ST = :st, "
+                "#WL = :wl"
+                # "#ZB = :zb, "
+                # "#ZP = :zp"
+            ),
+        )
+        print("Done writing Zarr information to DynamoDB table.")
+
+    ############################################################################
+    ############################################################################
+    ############################################################################
+    def __upload_files_to_output_bucket(
+        self,
+        output_bucket_name,
+        local_directory,
+        object_prefix,
+        endpoint_url,
+    ):
+        # Note: this will be passed credentials if using NODD
+        s3_manager = S3Manager(endpoint_url=endpoint_url)
+        print("Uploading files using thread pool executor.")
+        all_files = []
+        for subdir, dirs, files in os.walk(local_directory):
+            for file in files:
+                local_path = os.path.join(subdir, file)
+                s3_key = os.path.join(object_prefix, local_path)
+                all_files.append([local_path, s3_key])
+        # all_files
+        all_uploads = s3_manager.upload_files_with_thread_pool_executor(
+            output_bucket_name=output_bucket_name,
+            all_files=all_files,
+        )
+        return all_uploads
+
+    def __upload_file_to_output_bucket(
+        self,
+        output_bucket_name,
+        local_directory,
+        object_prefix,
+        endpoint_url,
+    ):
+        # Note: this will be passed credentials if using NODD
+        s3_manager = S3Manager(endpoint_url=endpoint_url)
+        print("Uploading files using thread pool executor.")
+        all_files = [local_directory]
+        all_uploads = s3_manager.upload_files_with_thread_pool_executor(
+            output_bucket_name=output_bucket_name,
+            all_files=all_files,
+        )
+        return all_uploads
+
+    ############################################################################
+    def raw_to_netcdf(
+        self,
+        table_name,
+        input_bucket_name,
+        output_bucket_name,
+        ship_name,
+        cruise_name,
+        sensor_name,
+        raw_file_name,
+        endpoint_url=None,
+        include_bot=True,
+    ):
+        """
+        Downloads the raw files, processes them with echopype, and uploads files
+        to the nodd bucket.
+
+        Needs to create two files, one echopype opened file, one is Sv calibrated file
+        """
+        print(f"Opening raw: {raw_file_name} and creating netcdf.")
+        try:
+            geometry_manager = GeometryManager()
+            cleaner = Cleaner()
+            cleaner.delete_local_files(
+                file_types=["*.nc", "*.json"]
+            )  # TODO: include bot and raw?
+
+            s3_manager = S3Manager(endpoint_url=endpoint_url)
+            s3_file_path = (
+                f"dataset/raw/{ship_name}/{cruise_name}/{sensor_name}/{raw_file_name}"
+            )
+            bottom_file_name = f"{Path(raw_file_name).stem}.bot"
+            s3_bottom_file_path = f"dataset/raw/{ship_name}/{cruise_name}/{sensor_name}/{bottom_file_name}"
+            s3_manager.download_file(
+                bucket_name=input_bucket_name, key=s3_file_path, file_name=raw_file_name
+            )
+            # TODO: add the bottom file
+            if include_bot:
+                s3_manager.download_file(
+                    bucket_name=input_bucket_name,
+                    key=s3_bottom_file_path,
+                    file_name=bottom_file_name,
+                )
+
+            gc.collect()
+            print("Opening raw file with echopype.")
+            # s3_file_path = f"s3://{bucket_name}/dataset/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}"
+            # s3_file_path = Path(f"s3://noaa-wcsd-pds/dataset/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}")
+            echodata = ep.open_raw(
+                raw_file=raw_file_name,
+                sonar_model=sensor_name,
+                include_bot=include_bot,
+            )
+
+            netcdf_name = f"{Path(raw_file_name).stem}.nc"
+            # Xarray Dataset to netcdf
+            echodata.to_netcdf(
+                save_path=netcdf_name,
+                compress=True,
+                overwrite=True,
+            )
+
+            print("Compute volume backscattering strength (Sv) from raw dataset.")
+            ds_sv = ep.calibrate.compute_Sv(echodata)
+            ds_sv = ep.consolidate.add_depth(
+                ds_sv, echodata
+            )  # TODO: consolidate with other depth values
+            # water_level = ds_sv["water_level"].values
+            gc.collect()
+            print("Done computing volume backscatter strength (Sv) from raw dataset.")
+            # Note: detected_seafloor_depth is located at echodata.vendor.detected_seafloor_depth
+            # but is not written out with ds_sv
+            if "detected_seafloor_depth" in list(echodata.vendor.variables):
+                ds_sv["detected_seafloor_depth"] = (
+                    echodata.vendor.detected_seafloor_depth
+                )
+            #
+            # frequencies = echodata.environment.frequency_nominal.values
+            #################################################################
+            # Get GPS coordinates, just overwrite the lat lon values
+            gps_data, lat, lon = geometry_manager.read_echodata_gps_data(
+                echodata=echodata,
+                output_bucket_name=output_bucket_name,
+                ship_name=ship_name,
+                cruise_name=cruise_name,
+                sensor_name=sensor_name,
+                file_name=raw_file_name,
+                endpoint_url=endpoint_url,
+                write_geojson=False,
+            )
+            ds_sv = ep.consolidate.add_location(ds_sv, echodata)
+            ds_sv.latitude.values = (
+                lat  # overwriting echopype gps values to include missing values
+            )
+            ds_sv.longitude.values = lon
+            # gps_data, lat, lon = self.__get_gps_data(echodata=echodata)
+
+            # Create the netcdf
+            netcdf_name_computed_Sv = f"{Path(raw_file_name).stem}_computed_Sv.nc"
+
+            # Xarray Dataset to netcdf
+            ds_sv.to_netcdf(
+                path=netcdf_name_computed_Sv,
+                mode="w",
+            )
+            gc.collect()
+            #################################################################
+            # output_netcdf_prefix = f"level_1/{ship_name}/{cruise_name}/{sensor_name}/"
+            #################################################################
+            # If netcdf already exists then delete
+            s3_manager = S3Manager(endpoint_url=endpoint_url)
+            child_objects = s3_manager.get_child_objects(
+                bucket_name=output_bucket_name,
+                sub_prefix=f"level_1/{ship_name}/{cruise_name}/{sensor_name}/{Path(raw_file_name).stem}.nc",
+            )
+            if len(child_objects) > 0:
+                print(
+                    "NetCDF dataset already exists in s3, deleting existing and continuing."
+                )
+                s3_manager.delete_nodd_objects(
+                    bucket_name=output_bucket_name,
+                    objects=child_objects,
+                )
+            child_objects_computed_Sv = s3_manager.get_child_objects(
+                bucket_name=output_bucket_name,
+                sub_prefix=f"level_1/{ship_name}/{cruise_name}/{sensor_name}/{Path(raw_file_name).stem}_computed_Sv.nc",
+            )
+            if len(child_objects_computed_Sv) > 0:
+                print("data already exists in s3, deleting existing and continuing.")
+                s3_manager.delete_nodd_objects(
+                    bucket_name=output_bucket_name,
+                    objects=child_objects_computed_Sv,
+                )
+            #################################################################
+            s3_manager.upload_file(
+                filename=netcdf_name,
+                bucket_name=output_bucket_name,
+                key=f"level_1/{ship_name}/{cruise_name}/{sensor_name}/{Path(raw_file_name).stem}.nc",
+            )
+            s3_manager.upload_file(
+                filename=netcdf_name_computed_Sv,
+                bucket_name=output_bucket_name,
+                key=f"level_1/{ship_name}/{cruise_name}/{sensor_name}/{Path(raw_file_name).stem}_computed_Sv.nc",
+            )
+        except Exception as err:
+            print(f"Exception encountered creating local netcdf with echopype: {err}")
+            raise RuntimeError(f"Problem creating local netcdf, {err}")
+        finally:
+            gc.collect()
+            cleaner.delete_local_files(
+                file_types=["*.raw", "*.bot", "*.zarr", "*.nc", "*.json"]
+            )
+            print("Done creating local zarr store.")
+
+    ############################################################################
+
+
+################################################################################
+############################################################################
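Taken together, `raw_to_netcdf` downloads a `.raw` (and optionally its `.bot`) file, converts it with echopype, and pushes two NetCDF files to the `level_1/` prefix. A usage sketch under assumed resource names (the table and bucket names are placeholders; the ship/cruise/sensor values echo the examples in the raw_to_zarr comments):

```python
from water_column_sonar_processing.processing import RawToNetCDF

raw_to_netcdf = RawToNetCDF()
raw_to_netcdf.raw_to_netcdf(
    table_name="example-pipeline-table",          # placeholder DynamoDB table
    input_bucket_name="example-raw-bucket",       # placeholder source bucket
    output_bucket_name="example-level-1-bucket",  # placeholder destination bucket
    ship_name="Henry_B._Bigelow",
    cruise_name="HB0706",
    sensor_name="EK60",
    raw_file_name="D20070724-T042400.raw",
    # endpoint_url=None uses the default AWS endpoint; include_bot=True also fetches the .bot file
)
# Per the upload keys above, the expected outputs are:
#   level_1/Henry_B._Bigelow/HB0706/EK60/D20070724-T042400.nc
#   level_1/Henry_B._Bigelow/HB0706/EK60/D20070724-T042400_computed_Sv.nc
```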
water_column_sonar_processing/processing/raw_to_zarr.py
@@ -1,7 +1,7 @@
 import gc
 import os
 from datetime import datetime
-from pathlib import Path
+from pathlib import Path

 import echopype as ep
 import numcodecs
@@ -13,6 +13,16 @@ from water_column_sonar_processing.geometry import GeometryManager
 from water_column_sonar_processing.utility import Cleaner


+def get_water_level(ds):
+    """
+    needs to be mocked up so thats why this is broken out
+    """
+    if "water_level" in ds.keys():
+        return ds.water_level.values
+    else:
+        return 0.0
+
+
 # This code is getting copied from echofish-aws-raw-to-zarr-lambda
 class RawToZarr:
     #######################################################
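`get_water_level` is deliberately a free function so it can be exercised (or mocked) without constructing `RawToZarr`. A small check of both branches, assuming only xarray:

```python
import xarray as xr

from water_column_sonar_processing.processing import get_water_level

# Dataset that carries a water_level variable -> the stored value is returned
ds_with = xr.Dataset({"water_level": 1.5})
assert float(get_water_level(ds_with)) == 1.5

# Dataset without water_level -> falls back to 0.0
ds_without = xr.Dataset({"Sv": ("ping_time", [0.0, 1.0])})
assert get_water_level(ds_without) == 0.0
```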
@@ -35,13 +45,11 @@ class RawToZarr:
     ############################################################################
     def __zarr_info_to_table(
         self,
-        # output_bucket_name,
         table_name,
         ship_name,
         cruise_name,
-        sensor_name,
+        sensor_name,  # : Constants, TODO: convert to enum
         file_name,
-        # zarr_path,
         min_echo_range,
         max_echo_range,
         num_ping_time_dropna,
@@ -67,13 +75,10 @@ class RawToZarr:
                 "#MA": "MAX_ECHO_RANGE",
                 "#MI": "MIN_ECHO_RANGE",
                 "#ND": "NUM_PING_TIME_DROPNA",
-                # "#PS": "PIPELINE_STATUS",
                 "#PT": "PIPELINE_TIME",
                 "#SE": "SENSOR_NAME",
                 "#SH": "SHIP_NAME",
                 "#ST": "START_TIME",
-                # "#ZB": "ZARR_BUCKET",
-                # "#ZP": "ZARR_PATH",
                 "#WL": "WATER_LEVEL",
             },
             expression_attribute_values={
@@ -84,33 +89,25 @@ class RawToZarr:
                 ":ma": {"N": str(np.round(max_echo_range, 4))},
                 ":mi": {"N": str(np.round(min_echo_range, 4))},
                 ":nd": {"N": str(num_ping_time_dropna)},
-                # ":ps": {"S": "PROCESSING_RESAMPLE_AND_WRITE_TO_ZARR_STORE"},
-                # ":ps": {"S": PipelineStatus.LEVEL_1_PROCESSING.name},
                 ":pt": {"S": datetime.now().isoformat(timespec="seconds") + "Z"},
                 ":se": {"S": sensor_name},
                 ":sh": {"S": ship_name},
                 ":st": {"S": start_time},
                 ":wl": {"N": str(np.round(water_level, 2))},
-                # ":zb": {"S": output_bucket_name},
-                # ":zp": {"S": zarr_path},
             },
             update_expression=(
                 "SET "
                 "#CH = :ch, "
                 "#ET = :et, "
-                # "#ED = :ed, "
                 "#FR = :fr, "
                 "#MA = :ma, "
                 "#MI = :mi, "
                 "#ND = :nd, "
-                # "#PS = :ps, "
                 "#PT = :pt, "
                 "#SE = :se, "
                 "#SH = :sh, "
                 "#ST = :st, "
                 "#WL = :wl"
-                # "#ZB = :zb, "
-                # "#ZP = :zp"
             ),
         )
         print("Done writing Zarr information to DynamoDB table.")
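The `expression_attribute_names`/`expression_attribute_values`/`update_expression` arguments mirror the low-level DynamoDB UpdateItem API. A hedged sketch of the equivalent boto3 call, assuming `DynamoDBManager.update_item` is a thin wrapper (the diff does not show its implementation); the table name and values are placeholders:

```python
import boto3

dynamodb = boto3.client("dynamodb")
dynamodb.update_item(
    TableName="example-pipeline-table",  # placeholder
    Key={
        "FILE_NAME": {"S": "D20070724-T042400.raw"},  # partition key
        "CRUISE_NAME": {"S": "HB0706"},               # sort key
    },
    ExpressionAttributeNames={"#WL": "WATER_LEVEL"},
    ExpressionAttributeValues={":wl": {"N": "7.5"}},
    UpdateExpression="SET #WL = :wl",
)
```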
@@ -120,16 +117,20 @@ class RawToZarr:
     ############################################################################
     def __upload_files_to_output_bucket(
         self,
-        output_bucket_name,
-        local_directory,
-        object_prefix,
+        output_bucket_name: str,
+        local_directory: str,  # e.g. 'D20070724-T042400.zarr' # TODO: problem: if this is not in the current directory
+        object_prefix: str,  # e.g. "level_1/Henry_B._Bigelow/HB0706/EK60/"
         endpoint_url,
     ):
         # Note: this will be passed credentials if using NODD
+        # TODO: this will not work if the local_directory is anywhere other than the current folder
+        # see test_s3_manager test_upload...pool_executor for solution
         s3_manager = S3Manager(endpoint_url=endpoint_url)
         print("Uploading files using thread pool executor.")
         all_files = []
-        for subdir, dirs, files in os.walk(
+        for subdir, dirs, files in os.walk(
+            local_directory
+        ):  # os.path.basename(s3_manager_test_path.joinpath("HB0707.zarr/"))
             for file in files:
                 local_path = os.path.join(subdir, file)
                 s3_key = os.path.join(object_prefix, local_path)
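The upload helper builds `(local_path, s3_key)` pairs by walking the Zarr store directory, which is why the keys only come out right when `local_directory` is relative to the current working directory (the TODO above). A standalone sketch of the same walk-and-upload pattern using plain boto3 and a thread pool (bucket name and prefix are placeholders; the package routes this through `S3Manager.upload_files_with_thread_pool_executor` instead):

```python
import os
from concurrent.futures import ThreadPoolExecutor

import boto3

s3_client = boto3.client("s3")


def upload_store(local_directory: str, bucket: str, prefix: str) -> None:
    pairs = []
    for subdir, _dirs, files in os.walk(local_directory):
        for file in files:
            local_path = os.path.join(subdir, file)
            # Key = prefix + walked path, hence the relative-path caveat
            pairs.append((local_path, os.path.join(prefix, local_path)))
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [
            executor.submit(s3_client.upload_file, local_path, bucket, key)
            for local_path, key in pairs
        ]
        for future in futures:
            future.result()  # surface any upload errors


# upload_store("D20070724-T042400.zarr", "example-level-1-bucket",
#              "level_1/Henry_B._Bigelow/HB0706/EK60/")
```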
@@ -141,6 +142,8 @@ class RawToZarr:
         )
         return all_uploads

+    ############################################################################
+
     ############################################################################
     def raw_to_zarr(
         self,
@@ -167,11 +170,11 @@ class RawToZarr:

         s3_manager = S3Manager(endpoint_url=endpoint_url)
         s3_file_path = (
-            f"
+            f"dataset/raw/{ship_name}/{cruise_name}/{sensor_name}/{raw_file_name}"
         )
         bottom_file_name = f"{Path(raw_file_name).stem}.bot"
         s3_bottom_file_path = (
-            f"
+            f"dataset/raw/{ship_name}/{cruise_name}/{sensor_name}/{bottom_file_name}"
         )
         s3_manager.download_file(
             bucket_name=input_bucket_name, key=s3_file_path, file_name=raw_file_name
@@ -187,8 +190,8 @@ class RawToZarr:
         try:
             gc.collect()
             print("Opening raw file with echopype.")
-            # s3_file_path = f"s3://{bucket_name}/
-            # s3_file_path = Path(f"s3://noaa-wcsd-pds/
+            # s3_file_path = f"s3://{bucket_name}/dataset/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}"
+            # s3_file_path = Path(f"s3://noaa-wcsd-pds/dataset/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}")
             echodata = ep.open_raw(
                 raw_file=raw_file_name,
                 sonar_model=sensor_name,
@@ -197,14 +200,16 @@ class RawToZarr:
                 # max_chunk_size=300,
                 # storage_options={'anon': True } # 'endpoint_url': self.endpoint_url} # this was creating problems
             )
-            print("Compute volume backscattering strength (Sv) from raw
+            print("Compute volume backscattering strength (Sv) from raw dataset.")
             ds_sv = ep.calibrate.compute_Sv(echodata)
             ds_sv = ep.consolidate.add_depth(
                 ds_sv, echodata
             )  # TODO: consolidate with other depth values
-
+
+            water_level = get_water_level(ds_sv)
+
             gc.collect()
-            print("Done computing volume backscatter strength (Sv) from raw
+            print("Done computing volume backscatter strength (Sv) from raw dataset.")
             # Note: detected_seafloor_depth is located at echodata.vendor.detected_seafloor_depth
             # but is not written out with ds_sv
             if "detected_seafloor_depth" in list(echodata.vendor.variables):
@@ -237,7 +242,14 @@ class RawToZarr:
             # TODO revert this so that smaller diffs can be used
             # The most minimum the resolution can be is as small as 0.25 meters
             min_echo_range = np.round(np.nanmin(np.diff(ds_sv.echo_range.values)), 2)
+            # For the HB0710 cruise the depths vary from 499.7215 @19cm to 2999.4805 @ 1cm. Moving that back
+            # inline with the
+            min_echo_range = np.max(
+                [0.20, min_echo_range]
+            )  # TODO: experiment with 0.25 and 0.50
+
             max_echo_range = float(np.nanmax(ds_sv.echo_range))
+
             # This is the number of missing values found throughout the lat/lon
             num_ping_time_dropna = lat[~np.isnan(lat)].shape[0]  # symmetric to lon
             #
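The new clamp makes `min_echo_range` the coarser of 0.20 m and the smallest observed bin spacing, presumably to keep centimetre-scale spacing on deep files (the HB0710 case quoted in the comment) from dictating the grid resolution. A worked example of the arithmetic:

```python
import numpy as np

# Smallest spacing between echo_range samples, e.g. ~0.01 m on a deep HB0710 file
min_echo_range = np.round(np.nanmin(np.diff([0.00, 0.01, 0.02])), 2)  # -> 0.01
# Clamp to at least 0.20 m (the TODO in the diff suggests also trying 0.25 and 0.50)
min_echo_range = np.max([0.20, min_echo_range])
print(min_echo_range)  # 0.2
```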
@@ -269,7 +281,7 @@ class RawToZarr:
             )
             if len(child_objects) > 0:
                 print(
-                    "Zarr store
+                    "Zarr store dataset already exists in s3, deleting existing and continuing."
                 )
                 s3_manager.delete_nodd_objects(
                     bucket_name=output_bucket_name,
@@ -284,13 +296,11 @@ class RawToZarr:
             )
             #################################################################
             self.__zarr_info_to_table(
-                # output_bucket_name=output_bucket_name,
                 table_name=table_name,
                 ship_name=ship_name,
                 cruise_name=cruise_name,
                 sensor_name=sensor_name,
                 file_name=raw_file_name,
-                # zarr_path=os.path.join(output_zarr_prefix, store_name),
                 min_echo_range=min_echo_range,
                 max_echo_range=max_echo_range,
                 num_ping_time_dropna=num_ping_time_dropna,
@@ -334,7 +344,7 @@ class RawToZarr:
     # #######################################################################
     # store_name = f"{os.path.splitext(input_file_name)[0]}.zarr"
     # output_zarr_prefix = f"level_1/{ship_name}/{cruise_name}/{sensor_name}"
-    # bucket_key = f"
+    # bucket_key = f"dataset/raw/{ship_name}/{cruise_name}/{sensor_name}/{input_file_name}"
     # zarr_prefix = os.path.join("level_1", ship_name, cruise_name, sensor_name)
     # #
     # os.chdir(TEMPDIR)  # Lambdas require use of temp directory
@@ -348,7 +358,7 @@ class RawToZarr:
     # secret_access_key=self.__output_bucket_secret_access_key
     # )
     # if len(s3_objects) > 0:
-    # print('Zarr store
+    # print('Zarr store dataset already exists in s3, deleting existing and continuing.')
     # self.__s3.delete_objects(
     # bucket_name=self.__output_bucket,
     # objects=s3_objects,
water_column_sonar_processing/utility/__init__.py
@@ -1,6 +1,13 @@
 from .cleaner import Cleaner
-from .constants import Constants, Coordinates
+from .constants import Constants, Coordinates, Instruments
 from .pipeline_status import PipelineStatus
 from .timestamp import Timestamp

-__all__ = [
+__all__ = [
+    "Cleaner",
+    "Instruments",
+    "Constants",
+    "Coordinates",
+    "PipelineStatus",
+    "Timestamp",
+]
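`Instruments` joins the utility exports. A one-line sketch of the widened import surface (names taken from the hunk above):

```python
from water_column_sonar_processing.utility import (
    Cleaner,
    Constants,
    Coordinates,
    Instruments,
    PipelineStatus,
    Timestamp,
)
```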
water_column_sonar_processing/utility/cleaner.py
@@ -5,8 +5,7 @@ import shutil

 ###########################################################
 class Cleaner:
-
-    def delete_local_files(file_types=["*.raw*", "*.model"]):  # '*.json'
+    def delete_local_files(self, file_types=["*.raw*", "*.model"]):  # '*.json'
         # TODO: add .zarr to this
         print("Deleting all local raw and model files")
         for i in file_types:
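Adding `self` fixes calling `delete_local_files` on an instance; previously the bound instance was passed in as `file_types`, so instance calls could not supply patterns. A minimal sketch of the intended call (glob patterns from the diff; note it deletes matching files in the current working directory):

```python
from water_column_sonar_processing.utility import Cleaner

cleaner = Cleaner()
# With the added `self`, the defaults apply as intended...
cleaner.delete_local_files()
# ...and explicit patterns work, as used by RawToNetCDF and RawToZarr
cleaner.delete_local_files(file_types=["*.nc", "*.json"])
```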